From 890370f5479299940f505e1247c408064f789bd5 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:30 +0200 Subject: feat: add reverse grapheme iterator Closes #53 --- src/unicode_tests.zig | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..828559a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -219,3 +219,77 @@ test "Segmentation GraphemeIterator" { } } } + +test "Segmentation ReverseGraphemeIterator" { + const allocator = std.testing.allocator; + var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); + defer file.close(); + var buf_reader = std.io.bufferedReader(file.reader()); + var input_stream = buf_reader.reader(); + + const data = try Graphemes.init(allocator); + defer data.deinit(allocator); + + var buf: [4096]u8 = undefined; + var line_no: usize = 1; + + while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { + // Skip comments or empty lines. + if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; + + // Clean up. + var line = std.mem.trimLeft(u8, raw, "÷ "); + if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { + line = line[0..octo]; + } + // Iterate over fields. + var want = std.ArrayList(Grapheme).init(allocator); + defer want.deinit(); + + var all_bytes = std.ArrayList(u8).init(allocator); + defer all_bytes.deinit(); + + var graphemes = std.mem.splitSequence(u8, line, " ÷ "); + var bytes_index: u32 = 0; + + while (graphemes.next()) |field| { + var code_points = std.mem.splitScalar(u8, field, ' '); + var cp_buf: [4]u8 = undefined; + var cp_index: u32 = 0; + var gc_len: u8 = 0; + + while (code_points.next()) |code_point| { + if (std.mem.eql(u8, code_point, "×")) continue; + const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); + const len = try unicode.utf8Encode(cp, &cp_buf); + try all_bytes.appendSlice(cp_buf[0..len]); + cp_index += len; + gc_len += len; + } + + try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + bytes_index += cp_index; + } + + // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); + var iter = data.reverseIterator(all_bytes.items); + + // Check. + var i: usize = want.items.len; + while (i > 0) { + i -= 1; + const want_gc = want.items[i]; + const got_gc = iter.prev() orelse { + std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc }); + return error.TestExpectedEqual; + }; + std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ) catch |err| { + std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc }); + return err; + }; + } + } +} -- cgit v1.2.3 From 470e896483300d099c7650f9cd8a13e236c63864 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 11 May 2025 17:26:50 -0400 Subject: Refactor in unicode_tests The comments in WordBreak and SentenceBreak tests get really long, the provided buffer would be inadequate. So this just provides a sub- iterator which will strip comments and comment lines, while keeping an eye on line numbers for any debugging. --- src/unicode_tests.zig | 77 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 28 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..ee259a3 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -1,17 +1,4 @@ -const std = @import("std"); -const fmt = std.fmt; -const fs = std.fs; -const io = std.io; -const heap = std.heap; -const mem = std.mem; -const testing = std.testing; -const unicode = std.unicode; - -const grapheme = @import("Graphemes"); -const Grapheme = @import("Graphemes").Grapheme; -const Graphemes = @import("Graphemes"); -const GraphemeIterator = @import("Graphemes").Iterator; -const Normalize = @import("Normalize"); +const dbg_print = false; comptime { testing.refAllDecls(grapheme); @@ -50,16 +37,14 @@ test "Unicode normalization tests" { var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); defer file.close(); var buf_reader = io.bufferedReader(file.reader()); - const input_stream = buf_reader.reader(); + var input_stream = buf_reader.reader(); - var line_no: usize = 0; var buf: [4096]u8 = undefined; var cp_buf: [4]u8 = undefined; - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { - line_no += 1; - // Skip comments or empty lines. - if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; + var line_iter: IterRead = .{ .read = &input_stream }; + + while (try line_iter.next(&buf)) |line| { // Iterate over fields. var fields = mem.splitScalar(u8, line, ';'); var field_index: usize = 0; @@ -80,7 +65,7 @@ test "Unicode normalization tests" { input = try i_buf.toOwnedSlice(); } else if (field_index == 1) { - //debug.print("\n*** {s} ***\n", .{line}); + if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); // NFC, time to test. var w_buf = std.ArrayList(u8).init(allocator); defer w_buf.deinit(); @@ -166,16 +151,15 @@ test "Segmentation GraphemeIterator" { defer data.deinit(allocator); var buf: [4096]u8 = undefined; - var line_no: usize = 1; + var line_iter: IterRead = .{ .read = &input_stream }; - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { + while (try line_iter.next(&buf)) |raw| { // Skip comments or empty lines. - if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; - + // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; // Clean up. var line = std.mem.trimLeft(u8, raw, "÷ "); - if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { - line = line[0..octo]; + if (std.mem.indexOf(u8, line, " ÷\t")) |final| { + line = line[0..final]; } // Iterate over fields. var want = std.ArrayList(Grapheme).init(allocator); @@ -206,7 +190,6 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } - // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); var iter = data.iterator(all_bytes.items); // Check. @@ -219,3 +202,41 @@ test "Segmentation GraphemeIterator" { } } } + +const IterRead = struct { + read: *Reader, + line: usize = 0, + + pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { + defer iter.line += 1; + const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); + if (maybe_line) |this_line| { + try iter.read.skipUntilDelimiterOrEof('\n'); + if (this_line.len == 0 or this_line[0] == '@') { + // comment, next line + return iter.next(buf); + } else { + return this_line; + } + } else { + return null; + } + } +}; + +const std = @import("std"); +const fmt = std.fmt; +const fs = std.fs; +const io = std.io; +const Reader = io.BufferedReader(4096, fs.File.Reader).Reader; +const heap = std.heap; +const mem = std.mem; +const debug = std.debug; +const testing = std.testing; +const unicode = std.unicode; + +const grapheme = @import("Graphemes"); +const Grapheme = @import("Graphemes").Grapheme; +const Graphemes = @import("Graphemes"); +const GraphemeIterator = @import("Graphemes").Iterator; +const Normalize = @import("Normalize"); -- cgit v1.2.3 From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 15:22:37 -0400 Subject: Begin conformance test I'm not sure the details of this strategy can actually be made to work. But, something can. --- src/unicode_tests.zig | 102 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 32 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ee259a3..7ce2b4e 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -1,31 +1,5 @@ const dbg_print = false; -comptime { - testing.refAllDecls(grapheme); -} - -test "Iterator.peek" { - const peek_seq = "aΔ👨🏻‍🌾→"; - const data = try Graphemes.init(std.testing.allocator); - defer data.deinit(std.testing.allocator); - - var iter = data.iterator(peek_seq); - const peek_a = iter.peek().?; - const next_a = iter.next().?; - try std.testing.expectEqual(peek_a, next_a); - try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); - const peek_d1 = iter.peek().?; - const peek_d2 = iter.peek().?; - try std.testing.expectEqual(peek_d1, peek_d2); - const next_d = iter.next().?; - try std.testing.expectEqual(peek_d2, next_d); - try std.testing.expectEqual(iter.peek(), iter.next()); - try std.testing.expectEqual(iter.peek(), iter.next()); - try std.testing.expectEqual(null, iter.peek()); - try std.testing.expectEqual(null, iter.peek()); - try std.testing.expectEqual(iter.peek(), iter.next()); -} - test "Unicode normalization tests" { var arena = heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = buf_reader.reader(); - const data = try Graphemes.init(allocator); - defer data.deinit(allocator); + const graph = try Graphemes.init(allocator); + defer graph.deinit(allocator); var buf: [4096]u8 = undefined; var line_iter: IterRead = .{ .read = &input_stream }; while (try line_iter.next(&buf)) |raw| { - // Skip comments or empty lines. - // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; // Clean up. var line = std.mem.trimLeft(u8, raw, "÷ "); if (std.mem.indexOf(u8, line, " ÷\t")) |final| { @@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } - var iter = data.iterator(all_bytes.items); + var iter = graph.iterator(all_bytes.items); // Check. for (want.items) |want_gc| { @@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" { } } +test "Segmentation Word Iterator" { + const allocator = std.testing.allocator; + var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); + defer file.close(); + var buf_reader = std.io.bufferedReader(file.reader()); + var input_stream = buf_reader.reader(); + + const wb = try WordBreak.init(allocator); + defer wb.deinit(allocator); + + var buf: [4096]u8 = undefined; + var line_iter: IterRead = .{ .read = &input_stream }; + + while (try line_iter.next(&buf)) |raw| { + // Clean up. + var line = std.mem.trimLeft(u8, raw, "÷ "); + if (std.mem.indexOf(u8, line, " ÷\t")) |final| { + line = line[0..final]; + } + // Iterate over fields. + var want = std.ArrayList(Grapheme).init(allocator); + defer want.deinit(); + + var all_bytes = std.ArrayList(u8).init(allocator); + defer all_bytes.deinit(); + + var words = std.mem.splitSequence(u8, line, " ÷ "); + var bytes_index: u32 = 0; + + while (words.next()) |field| { + var code_points = std.mem.splitScalar(u8, field, ' '); + var cp_buf: [4]u8 = undefined; + var cp_index: u32 = 0; + var gc_len: u8 = 0; + + while (code_points.next()) |code_point| { + if (std.mem.eql(u8, code_point, "×")) continue; + const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); + const len = try unicode.utf8Encode(cp, &cp_buf); + try all_bytes.appendSlice(cp_buf[0..len]); + cp_index += len; + gc_len += len; + } + + try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + bytes_index += cp_index; + } + + var iter = wb.iterator(all_bytes.items); + + // Check. + for (want.items, 1..) |want_word, i| { + const got_word = (iter.next()).?; + std.testing.expectEqualSlices( + u8, + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + return err; + }; + } + } +} + const IterRead = struct { read: *Reader, line: usize = 0, @@ -235,8 +272,9 @@ const debug = std.debug; const testing = std.testing; const unicode = std.unicode; -const grapheme = @import("Graphemes"); const Grapheme = @import("Graphemes").Grapheme; const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); + +const WordBreak = @import("WordBreak"); -- cgit v1.2.3 From a7f6990a8d433c6c8d34892a2126e94cdb31541f Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 18:10:02 -0400 Subject: Rewrite, passes WordBreakTest After fixing a bug in Runicode which was fenceposting codepoints off the end of ranges. As one does. --- src/unicode_tests.zig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7ce2b4e..59f0c6f 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -228,8 +228,7 @@ test "Segmentation Word Iterator" { // Check. for (want.items, 1..) |want_word, i| { const got_word = (iter.next()).?; - std.testing.expectEqualSlices( - u8, + std.testing.expectEqualStrings( want_word.bytes(all_bytes.items), got_word.bytes(all_bytes.items), ) catch |err| { -- cgit v1.2.3 From 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 17:19:56 -0400 Subject: Hooked up break test, some bugs squashed The handling of ignorables is really different, because they 'adhere' to the future of the iteration, not the past. --- src/unicode_tests.zig | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 59f0c6f..8661bfd 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -195,7 +195,7 @@ test "Segmentation Word Iterator" { line = line[0..final]; } // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); + var want = std.ArrayList(Word).init(allocator); defer want.deinit(); var all_bytes = std.ArrayList(u8).init(allocator); @@ -219,22 +219,40 @@ test "Segmentation Word Iterator" { gc_len += len; } - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + try want.append(Word{ .len = gc_len, .offset = bytes_index }); bytes_index += cp_index; } - - var iter = wb.iterator(all_bytes.items); - - // Check. - for (want.items, 1..) |want_word, i| { - const got_word = (iter.next()).?; - std.testing.expectEqualStrings( - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), - ) catch |err| { - debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); - return err; - }; + { + var iter = wb.iterator(all_bytes.items); + + // Check. + for (want.items, 1..) |want_word, i| { + const got_word = (iter.next()).?; + std.testing.expectEqualStrings( + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + return err; + }; + } + } + { + var r_iter = wb.reverseIterator(all_bytes.items); + var idx = want.items.len - 1; + while (true) : (idx -= 1) { + const want_word = want.items[idx]; + const got_word = r_iter.prev().?; + std.testing.expectEqualSlices( + u8, + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); + return err; + }; + if (idx == 0) break; + } } } } @@ -277,3 +295,4 @@ const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); const WordBreak = @import("WordBreak"); +const Word = WordBreak.Word; -- cgit v1.2.3 From b1d67fab5c3dd3ed1d47ee63ab45a600b19f7a3c Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 10:46:25 -0400 Subject: Peek tests for word iterators --- src/unicode_tests.zig | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 8661bfd..ef459bf 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -224,6 +224,7 @@ test "Segmentation Word Iterator" { } { var iter = wb.iterator(all_bytes.items); + var peeked: ?Word = iter.peek(); // Check. for (want.items, 1..) |want_word, i| { @@ -235,11 +236,21 @@ test "Segmentation Word Iterator" { debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); return err; }; + std.testing.expectEqualStrings( + peeked.?.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); + return err; + }; + peeked = iter.peek(); } } { var r_iter = wb.reverseIterator(all_bytes.items); + var peeked: ?Word = r_iter.peek(); var idx = want.items.len - 1; + while (true) : (idx -= 1) { const want_word = want.items[idx]; const got_word = r_iter.prev().?; @@ -251,6 +262,14 @@ test "Segmentation Word Iterator" { debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); return err; }; + std.testing.expectEqualStrings( + peeked.?.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); + return err; + }; + peeked = r_iter.peek(); if (idx == 0) break; } } -- cgit v1.2.3 From 736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 15 May 2025 10:57:33 -0400 Subject: wordAtIndex passes conformance I removed the initAtIndex functions from the public vocabulary, because the last couple of days of sweat and blood prove that it's hard to use correctly. That's probably it for WordBreak, now to fix the overlong bug on v0.14 and get this integrated with the new reverse grapheme iterator. --- src/unicode_tests.zig | 76 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ef459bf..8b02e98 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -222,32 +222,58 @@ test "Segmentation Word Iterator" { try want.append(Word{ .len = gc_len, .offset = bytes_index }); bytes_index += cp_index; } + const this_str = all_bytes.items; + { - var iter = wb.iterator(all_bytes.items); + var iter = wb.iterator(this_str); var peeked: ?Word = iter.peek(); // Check. - for (want.items, 1..) |want_word, i| { + for (want.items, 1..) |want_word, idx| { const got_word = (iter.next()).?; std.testing.expectEqualStrings( - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + want_word.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { - debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx }); return err; }; std.testing.expectEqualStrings( - peeked.?.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + peeked.?.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { - debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); + debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx }); return err; }; + var r_iter = iter.reverseIterator(); + const if_r_word = r_iter.prev(); + if (if_r_word) |r_word| { + std.testing.expectEqualStrings( + want_word.bytes(this_str), + r_word.bytes(this_str), + ) catch |err| { + debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); + return err; + }; + } else { + try testing.expect(false); + } + for (got_word.offset..got_word.offset + got_word.len) |i| { + const this_word = wb.wordAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_word.bytes(this_str), + this_word.bytes(this_str), + ) catch |err| { + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + return err; + }; + } peeked = iter.peek(); } } { - var r_iter = wb.reverseIterator(all_bytes.items); + var r_iter = wb.reverseIterator(this_str); var peeked: ?Word = r_iter.peek(); var idx = want.items.len - 1; @@ -256,19 +282,43 @@ test "Segmentation Word Iterator" { const got_word = r_iter.prev().?; std.testing.expectEqualSlices( u8, - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + want_word.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); return err; }; std.testing.expectEqualStrings( - peeked.?.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), + peeked.?.bytes(this_str), + got_word.bytes(this_str), ) catch |err| { debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); return err; }; + var f_iter = r_iter.forwardIterator(); + const if_f_word = f_iter.next(); + if (if_f_word) |f_word| { + std.testing.expectEqualStrings( + want_word.bytes(this_str), + f_word.bytes(this_str), + ) catch |err| { + debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); + return err; + }; + } else { + try testing.expect(false); + } + for (got_word.offset..got_word.offset + got_word.len) |i| { + const this_word = wb.wordAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_word.bytes(this_str), + this_word.bytes(this_str), + ) catch |err| { + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + return err; + }; + } peeked = r_iter.peek(); if (idx == 0) break; } -- cgit v1.2.3 From 713c01c22c7c4051cfc2bd83811fd969b1ccaddc Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 15 May 2025 23:20:50 -0400 Subject: Merge Grapheme Segmentation Iterator Tests --- src/unicode_tests.zig | 113 +++++++++++++++----------------------------------- 1 file changed, 34 insertions(+), 79 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 0204b92..7139d4c 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -162,89 +162,44 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } - var iter = graph.iterator(all_bytes.items); - - // Check. - for (want.items) |want_gc| { - const got_gc = (iter.next()).?; - try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), - ); - } - } -} - -test "Segmentation ReverseGraphemeIterator" { - const allocator = std.testing.allocator; - var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); - defer file.close(); - var buf_reader = std.io.bufferedReader(file.reader()); - var input_stream = buf_reader.reader(); - - const data = try Graphemes.init(allocator); - defer data.deinit(allocator); - - var buf: [4096]u8 = undefined; - var line_no: usize = 1; - - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { - // Skip comments or empty lines. - if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; - - // Clean up. - var line = std.mem.trimLeft(u8, raw, "÷ "); - if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { - line = line[0..octo]; - } - // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); - defer want.deinit(); - - var all_bytes = std.ArrayList(u8).init(allocator); - defer all_bytes.deinit(); - - var graphemes = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; - - while (graphemes.next()) |field| { - var code_points = std.mem.splitScalar(u8, field, ' '); - var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; - var gc_len: u8 = 0; + { + var iter = graph.iterator(all_bytes.items); - while (code_points.next()) |code_point| { - if (std.mem.eql(u8, code_point, "×")) continue; - const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); - const len = try unicode.utf8Encode(cp, &cp_buf); - try all_bytes.appendSlice(cp_buf[0..len]); - cp_index += len; - gc_len += len; + // Check. + for (want.items) |want_gc| { + const got_gc = (iter.next()).?; + try std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ); } - - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); - bytes_index += cp_index; } + { + var iter = graph.reverseIterator(all_bytes.items); - // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = data.reverseIterator(all_bytes.items); - - // Check. - var i: usize = want.items.len; - while (i > 0) { - i -= 1; - const want_gc = want.items[i]; - const got_gc = iter.prev() orelse { - std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc }); - return error.TestExpectedEqual; - }; - std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), - ) catch |err| { - std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc }); - return err; - }; + // Check. + var i: usize = want.items.len; + while (i > 0) { + i -= 1; + const want_gc = want.items[i]; + const got_gc = iter.prev() orelse { + std.debug.print( + "line {d} grapheme {d}: expected {any} found null\n", + .{ line_iter.line, i, want_gc }, + ); + return error.TestExpectedEqual; + }; + std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ) catch |err| { + std.debug.print( + "line {d} grapheme {d}: expected {any} found {any}\n", + .{ line_iter.line, i, want_gc, got_gc }, + ); + return err; + }; + } } } } -- cgit v1.2.3 From aa20bebade8eeb3ca75199dc252feb3edb203fb1 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:36 -0400 Subject: Words module In keeping with the new nomenclature, we're calling the module "Words", not "WordBreak". The latter is Unicode jargon, the module provides word iterators. Words are the figure, word breaks are the ground. --- src/unicode_tests.zig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7139d4c..18f1814 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -211,7 +211,7 @@ test "Segmentation Word Iterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = buf_reader.reader(); - const wb = try WordBreak.init(allocator); + const wb = try Words.init(allocator); defer wb.deinit(allocator); var buf: [4096]u8 = undefined; @@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); -const WordBreak = @import("WordBreak"); -const Word = WordBreak.Word; +const Words = @import("Words"); +const Word = Words.Word; -- cgit v1.2.3 From ef27c51b8e46f3909a27fd137429b717797f1fd9 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 16:48:55 -0400 Subject: Add iterateBefore and iterateAfter These create reverse or forward iterators before or after a Word. So this way, the user can get the word at an index, then iterate forward or back from that word. Also: Fixes #59 Which was fixed awhile back, but I don't feel like doing repo surgery to tag the fix where it happened. We have blame for that kind of thing. --- src/unicode_tests.zig | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 18f1814..195fdcb 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -287,6 +287,25 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } + var peek_iter = wb.iterateAfter(this_str, got_word); + const peek_1 = peek_iter.next(); + if (peek_1) |p1| { + const peek_2 = iter.peek(); + if (peek_2) |p2| { + std.testing.expectEqualSlices( + u8, + p1.bytes(this_str), + p2.bytes(this_str), + ) catch |err| { + debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); + return err; + }; + } else { + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } for (got_word.offset..got_word.offset + got_word.len) |i| { const this_word = wb.wordAtIndex(this_str, i); std.testing.expectEqualSlices( @@ -337,6 +356,25 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } + var peek_iter = wb.iterateBefore(this_str, got_word); + const peek_1 = peek_iter.prev(); + if (peek_1) |p1| { + const peek_2 = r_iter.peek(); + if (peek_2) |p2| { + std.testing.expectEqualSlices( + u8, + p1.bytes(this_str), + p2.bytes(this_str), + ) catch |err| { + debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); + return err; + }; + } else { + try testing.expect(false); + } + } else { + try testing.expectEqual(null, r_iter.peek()); + } for (got_word.offset..got_word.offset + got_word.len) |i| { const this_word = wb.wordAtIndex(this_str, i); std.testing.expectEqualSlices( -- cgit v1.2.3 From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- src/unicode_tests.zig | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 195fdcb..c463dcc 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" { defer all_bytes.deinit(); var graphemes = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (graphemes.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -231,12 +231,12 @@ test "Segmentation Word Iterator" { defer all_bytes.deinit(); var words = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (words.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -425,6 +425,8 @@ const debug = std.debug; const testing = std.testing; const unicode = std.unicode; +const uoffset = @FieldType(Word, "offset"); + const Grapheme = @import("Graphemes").Grapheme; const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; -- cgit v1.2.3 From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/unicode_tests.zig | 69 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 10 deletions(-) (limited to 'src/unicode_tests.zig') diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index c463dcc..ae177a9 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -162,20 +162,51 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } + const this_str = all_bytes.items; + { - var iter = graph.iterator(all_bytes.items); + var iter = graph.iterator(this_str); // Check. - for (want.items) |want_gc| { + for (want.items, 1..) |want_gc, idx| { const got_gc = (iter.next()).?; try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), + want_gc.bytes(this_str), + got_gc.bytes(this_str), ); + for (got_gc.offset..got_gc.offset + got_gc.len) |i| { + const this_gc = graph.graphemeAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_gc.bytes(this_str), + this_gc.bytes(this_str), + ) catch |err| { + debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); + return err; + }; + } + var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); + if (after_iter.next()) |next_gc| { + if (iter.peek()) |next_peek| { + std.testing.expectEqualSlices( + u8, + next_gc.bytes(this_str), + next_peek.bytes(this_str), + ) catch |err| { + debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx }); + return err; + }; + } else { + debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx }); + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } } } { - var iter = graph.reverseIterator(all_bytes.items); + var iter = graph.reverseIterator(this_str); // Check. var i: usize = want.items.len; @@ -190,8 +221,8 @@ test "Segmentation GraphemeIterator" { return error.TestExpectedEqual; }; std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), + want_gc.bytes(this_str), + got_gc.bytes(this_str), ) catch |err| { std.debug.print( "line {d} grapheme {d}: expected {any} found {any}\n", @@ -199,6 +230,24 @@ test "Segmentation GraphemeIterator" { ); return err; }; + var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); + if (before_iter.prev()) |prev_gc| { + if (iter.peek()) |prev_peek| { + std.testing.expectEqualSlices( + u8, + prev_gc.bytes(this_str), + prev_peek.bytes(this_str), + ) catch |err| { + debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i }); + return err; + }; + } else { + debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i }); + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } } } } @@ -287,7 +336,7 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } - var peek_iter = wb.iterateAfter(this_str, got_word); + var peek_iter = wb.iterateAfterWord(this_str, got_word); const peek_1 = peek_iter.next(); if (peek_1) |p1| { const peek_2 = iter.peek(); @@ -313,7 +362,7 @@ test "Segmentation Word Iterator" { got_word.bytes(this_str), this_word.bytes(this_str), ) catch |err| { - debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); return err; }; } @@ -356,7 +405,7 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } - var peek_iter = wb.iterateBefore(this_str, got_word); + var peek_iter = wb.iterateBeforeWord(this_str, got_word); const peek_1 = peek_iter.prev(); if (peek_1) |p1| { const peek_2 = r_iter.peek(); -- cgit v1.2.3