Begin conformance test

I'm not sure the details of this strategy can actually be made to work. But, something can.
author: Sam Atman 2025-05-12 15:22:37 -0400
committer: Sam Atman 2025-05-15 15:31:16 -0400
commit: cf8d8fe5d640511f6c4134fdaa36e930232ca7da (patch)
tree: 410a3c5195ea0780b637f740ebcb6e80e63db09c
parent: Implement Word iterator (diff)
download: zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.gz
zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.xz
zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.zip
6 files changed, 362 insertions, 58 deletions
diff --git a/build.zig b/build.zig
index f89e90c..387b4c3 100644
--- a/build.zig
+++ b/build.zig
@@ -471,6 +471,7 @@ pub fn build(b: *std.Build) void {
    });
    unicode_tests.root_module.addImport("Graphemes", graphemes);
    unicode_tests.root_module.addImport("Normalize", norm);
+    unicode_tests.root_module.addImport("WordBreak", word_break);
    const run_unicode_tests = b.addRunArtifact(unicode_tests);
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1ce1ea6..1f67fc6 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
        try std.testing.expectEqual(@as(usize, 2), i);
    }
 }
+test "Iterator.peek" {
+    const peek_seq = "aΔ👨🏻‍🌾→";
+    const data = try Graphemes.init(std.testing.allocator);
+    defer data.deinit(std.testing.allocator);
+    var iter = data.iterator(peek_seq);
+    const peek_a = iter.peek().?;
+    const next_a = iter.next().?;
+    try std.testing.expectEqual(peek_a, next_a);
+    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
+    const peek_d1 = iter.peek().?;
+    const peek_d2 = iter.peek().?;
+    try std.testing.expectEqual(peek_d1, peek_d2);
+    const next_d = iter.next().?;
+    try std.testing.expectEqual(peek_d2, next_d);
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+}
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 84fd1f7..53db76b 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
+/// Returns an iterator over words in `slice`
+pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
+    return Iterator.init(wordbreak, slice);
+}
 const IterState = packed struct {
    mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
    mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
@@ -113,7 +118,7 @@ pub const Iterator = struct {
    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
        var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
        wb_iter.advance();
-        return wb;
+        return wb_iter;
    }
    pub fn next(iter: *Iterator) ?Word {
@@ -132,12 +137,18 @@ pub const Iterator = struct {
        scan: while (true) : (iter.advance()) {
            const this = iter.this.?;
            word_len += this.len;
+            var ignored = false;
            if (iter.that) |that| {
                const that_p = iter.wb.breakProperty(that.code);
                const this_p = this_p: {
                    if (!isIgnorable(that_p) and iter.cache != null) {
+                        // TODO: might not need these what with peekPast
+                        ignored = true;
                        defer iter.cache = null;
-                        break :this_p iter.cache.?;
+                        // Fixup some state, apply pre-4 rules
+                        const restore = iter.cache.?;
+                        if (restore == .WSegSpace) break :this_p .none;
+                        break :this_p restore;
                    } else {
                        break :this_p iter.wb.breakProperty(this.code);
                    }
@@ -149,11 +160,22 @@ pub const Iterator = struct {
                // WB3b  ÷ (Newline | CR | LF)
                if (isNewline(that_p)) break :scan;
                // WB3c  ZWJ × \p{Extended_Pictographic}
-                // The right way to do this one is a RuneSet, TODO: circle back
+                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
+                    // Invalid after ignoring
+                    if (ignored) break :scan else continue :scan;
+                }
                // WB3d  WSegSpace × WSegSpace
                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
                // WB4  X (Extend | Format | ZWJ)* → X
                if (isIgnorable(that_p)) {
+                    if (that_p == .ZWJ) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
+                                continue :scan;
+                            }
+                        }
+                    }
                    if (iter.cache == null) {
                        iter.cache = this_p;
                    }
@@ -164,14 +186,14 @@ pub const Iterator = struct {
                    if (isAHLetter(that_p)) continue :scan;
                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
                    if (isMidVal(that_p)) {
-                        const next_val = iter.cp_iter.peek();
+                        const next_val = iter.peekPast();
                        if (next_val) |next_cp| {
                            const next_p = iter.wb.breakProperty(next_cp.code);
                            if (isAHLetter(next_p)) {
                                state.mid_punct = true;
                                continue :scan;
                            }
-                        } else break :scan;
+                        }
                    }
                }
                // AHLetter (MidLetter | MidNumLetQ) × AHLetter
@@ -187,7 +209,7 @@ pub const Iterator = struct {
                    if (that_p == .Single_Quote) continue :scan;
                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
                    if (that_p == .Double_Quote) {
-                        const next_val = iter.cp_iter.peek();
+                        const next_val = iter.peekPast();
                        if (next_val) |next_cp| {
                            const next_p = iter.wb.breakProperty(next_cp.code);
                            if (next_p == .Hebrew_Letter) {
@@ -212,8 +234,8 @@ pub const Iterator = struct {
                // WB10  Numeric ×  AHLetter
                if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
-                if (this_p == .Numeric and isMidVal(that_p)) {
+                if (this_p == .Numeric and isMidNum(that_p)) {
-                    const next_val = iter.cp_iter.peek();
+                    const next_val = iter.peekPast();
                    if (next_val) |next_cp| {
                        const next_p = iter.wb.breakProperty(next_cp.code);
                        if (next_p == .Numeric) {
@@ -224,7 +246,7 @@ pub const Iterator = struct {
                }
                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
                if (state.mid_num) {
-                    assert(isMidVal(this_p));
+                    assert(isMidNum(this_p));
                    assert(that_p == .Numeric);
                    state.mid_num = false;
                    continue :scan;
@@ -235,25 +257,18 @@ pub const Iterator = struct {
                if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
                if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
-                // WB15, WB16  ([^RI] ! sot) (RI RI)* RI × RI
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
-                if (that_p == .Regional_Indicator) {
+                if (this_p == .Regional_Indicator) {
-                    if (this_p == .Regional_Indicator) {
+                    if (that_p == .Regional_Indicator) {
-                        if (state.regional) {
+                        if (state.regional == true or this.offset == 0) {
                            state.regional = false;
                            continue :scan;
-                        } else {
-                            break :scan;
                        }
                    } else {
-                        const next_val = iter.cp_iter.peek();
+                        state.regional = true;
-                        if (next_val) |next_cp| {
-                            const next_p = iter.wb.breakProperty(next_cp.code);
-                            if (next_p == .Regional_Indicator) {
-                                state.regional = true;
-                                continue :scan;
-                            }
-                        } else break :scan;
                    }
+                } else if (that_p == .Regional_Indicator) {
+                    state.regional = true;
                }
                // WB999  Any ÷ Any
                break :scan;
@@ -265,9 +280,19 @@ pub const Iterator = struct {
        return Word{ .len = word_len, .offset = word_start };
    }
-    fn advance(wb_iter: *Iterator) void {
+    fn advance(iter: *Iterator) void {
-        wb_iter.this = wb_iter.that;
+        iter.this = iter.that;
-        wb_iter.that = wb_iter.cp_iter.next();
+        iter.that = iter.cp_iter.next();
+    }
+    fn peekPast(iter: *Iterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked;
+            _ = iter.cp_iter.next();
+        }
+        return null;
    }
 };
@@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool {
    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
 }
+inline fn isMidNum(wbp: WordBreakProperty) bool {
+    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
+}
 inline fn isExtensible(wbp: WordBreakProperty) bool {
    return switch (wbp) {
        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
@@ -328,3 +357,5 @@ const testing = std.testing;
 const code_point = @import("code_point");
 const CodepointIterator = code_point.Iterator;
 const CodePoint = code_point.CodePoint;
+const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index b3c9aea..79ee5cd 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -10,6 +10,11 @@ pub const CodePoint = struct {
    code: u21,
    len: u3,
    offset: u32,
+    /// Return the slice of this codepoint, given the original string.
+    pub fn bytes(cp: CodePoint, str: []const u8) []const u8 {
+        return str[cp.offset..][0..cp.len];
+    }
 };
 /// This function is deprecated and will be removed in a later release.
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..34fbcd3
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
+//! Minimal RuneSet implementation
+//!
+//! This is copied from the full RuneSet module, so that `zg` doesn't
+//! depend on it.  There's one spot in the WordBreak algorithm which
+//! needs to identify the emoji Extended_Pictographic property, which
+//! is not otherwise used in ZG.  The Runeset is 89 words, while the
+//! trie lookup used throughout ZG would be much larger.
+//!
+//! The RuneSet is borrowed from Runicode, which encodes Unicode things
+//! in RuneSet form.  This will need updating for each version of Unicode.
+pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } };
+// Meaningful names for the T1 slots
+const LOW = 0;
+const HI = 1;
+const LEAD = 2;
+const T4_OFF = 3;
+/// Minimum Viable Runeset.  Must be statically created, strictly boolean matching.
+pub const RuneSet = struct {
+    body: []const u64,
+    // Returns whether the slice is a match.  This assumes the validity of the
+    // string, which can be ensured by, in particular, deriving it from a CodePoint.
+    pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
+        const set = runeset.body;
+        const a = codeunit(str[0]);
+        switch (a.kind) {
+            .follow => return false,
+            .low => {
+                const mask = toMask(set[LOW]);
+                if (mask.isIn(a))
+                    return true
+                else
+                    return false;
+            },
+            .hi => {
+                const mask = toMask(set[HI]);
+                if (mask.isIn(a))
+                    return true
+                else
+                    return false;
+            },
+            .lead => {
+                const nB = a.nMultiBytes().?;
+                const a_mask = toMask(set[LEAD]);
+                if (!a_mask.isIn(a)) return false;
+                const b = codeunit(str[1]);
+                const b_loc = 4 + a_mask.lowerThan(a).?;
+                const b_mask = toMask(set[b_loc]);
+                if (!b_mask.isIn(b)) return false;
+                if (nB == 2) return true;
+                const t3_off = 4 + @popCount(set[LEAD]);
+                const c = codeunit(str[2]);
+                // Slice is safe because we know the T2 span has at least one word.
+                const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]);
+                const c_loc = t3_off + c_off;
+                const c_mask = toMask(set[c_loc]);
+                if (!c_mask.isIn(c)) return false;
+                if (nB == 3) return true;
+                const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]);
+                const d_loc = set[T4_OFF] + d_off;
+                const d = codeunit(str[3]);
+                const d_mask = toMask(set[d_loc]);
+                if (d_mask.isIn(d)) return true else return false;
+            },
+        }
+    }
+};
+/// Kinds of most significant bits in UTF-8
+const RuneKind = enum(u2) {
+    low,
+    hi,
+    follow,
+    lead,
+};
+/// Packed `u8` struct representing one codeunit of UTF-8.
+const CodeUnit = packed struct(u8) {
+    body: u6,
+    kind: RuneKind,
+    /// Mask to check presence
+    pub inline fn inMask(self: *const CodeUnit) u64 {
+        return @as(u64, 1) << self.body;
+    }
+    // TODO consider an nMultiBytesFast, for the cases where we
+    // know that invalid lead bytes are never present (such as in set)
+    // operations, where we may assume that (and will assert that) the
+    // LEAD mask contains no such bytes.
+    /// Number of bytes in known multi-byte rune.
+    ///
+    /// Caller guarantees that the CodeUnit is a lead byte
+    /// of a multi-byte rune: `cu.kind == .lead`.
+    ///
+    /// Invalid lead bytes will return null.
+    pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
+        return switch (self.body) {
+            // 0 and 1 are invalid for overlong reasons,
+            // but RuneSet supports overlong encodings
+            0...31 => 2,
+            32...47 => 3,
+            48...55 => 4,
+            // Wasted space 56...61 is due entirely to Microsoft's
+            // lack of vision and insistence on a substandard
+            // and utterly inadequate encoding for Unicode
+            // "64k should be enough for anyone" <spits>
+            56...63 => null,
+        };
+    }
+    /// Given a valid lead byte, return the number of bytes that should
+    /// make up the code unit sequence.  Will return `null` if the lead
+    /// byte is invalid.
+    pub inline fn nBytes(self: *const CodeUnit) ?u8 {
+        switch (self.kind) {
+            .low, .hi => return 1,
+            .lead => return self.nMultiBytes(),
+            .follow => return null,
+        }
+    }
+    /// Mask off all bits >= cu.body
+    pub inline fn hiMask(self: *const CodeUnit) u64 {
+        return (@as(u64, 1) << self.body) - 1;
+    }
+    /// Mask off all bits <= cu.body
+    pub inline fn lowMask(self: *const CodeUnit) u64 {
+        if (self.body == 63)
+            return 0
+        else
+            return ~((@as(u64, 1) << (self.body + 1)) - 1);
+    }
+    /// Cast the `CodeUnit` to its backing `u8`.
+    pub inline fn byte(self: *const CodeUnit) u8 {
+        return @bitCast(self.*);
+    }
+};
+/// Cast raw byte to CodeUnit
+inline fn codeunit(b: u8) CodeUnit {
+    return @bitCast(b);
+}
+inline fn toMask(w: u64) Mask {
+    return Mask.toMask(w);
+}
+/// Bitmask for runesets
+///
+/// We define our own bitset, because the operations we need to
+/// perform only overlap with IntegerBitSet for trivial one-liners,
+/// and furthermore, we need nondestructive versions of the basic
+/// operations, which aren't a part of the IntegerBitSet interface.
+///
+/// Note that Masks do not track which kind of byte they apply to,
+/// since they will be stored as ordinary u64s.  User code must
+/// ensure that CodeUnits tested against a Mask are of the appropriate
+/// type, and otherwise valid for the test performed.
+///
+const Mask = struct {
+    m: u64,
+    pub fn toMask(w: u64) Mask {
+        return Mask{ .m = w };
+    }
+    /// Test if a CodeUnit's low bytes are present in mask
+    pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
+        return self.m | cu.inMask() == self.m;
+    }
+    /// Return number of bytes lower than cu.body in mask,
+    /// if cu inhabits the mask.  Otherwise return null.
+    pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (self.isIn(cu)) {
+            const m = cu.hiMask();
+            return @popCount(self.m & m);
+        } else {
+            return null;
+        }
+    }
+    /// Return number of bytes higher than cu.body in mask,
+    /// if cu inhabits the mask.  Otherwise return null.
+    pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (self.isIn(cu)) {
+            const m = cu.lowMask();
+            return @popCount(self.m & m);
+        } else {
+            return null;
+        }
+    }
+};
+/// Sum of @popCount of all words in region.
+inline fn popCountSlice(region: []const u64) usize {
+    var ct: usize = 0;
+    for (region) |w| ct += @popCount(w);
+    return ct;
+}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ee259a3..7ce2b4e 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,31 +1,5 @@
 const dbg_print = false;
-comptime {
-    testing.refAllDecls(grapheme);
-}
-test "Iterator.peek" {
-    const peek_seq = "aΔ👨🏻‍🌾→";
-    const data = try Graphemes.init(std.testing.allocator);
-    defer data.deinit(std.testing.allocator);
-    var iter = data.iterator(peek_seq);
-    const peek_a = iter.peek().?;
-    const next_a = iter.next().?;
-    try std.testing.expectEqual(peek_a, next_a);
-    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
-    const peek_d1 = iter.peek().?;
-    const peek_d2 = iter.peek().?;
-    try std.testing.expectEqual(peek_d1, peek_d2);
-    const next_d = iter.next().?;
-    try std.testing.expectEqual(peek_d2, next_d);
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-}
 test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
@@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" {
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();
-    const data = try Graphemes.init(allocator);
+    const graph = try Graphemes.init(allocator);
-    defer data.deinit(allocator);
+    defer graph.deinit(allocator);
    var buf: [4096]u8 = undefined;
    var line_iter: IterRead = .{ .read = &input_stream };
    while (try line_iter.next(&buf)) |raw| {
-        // Skip comments or empty lines.
-        // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
        // Clean up.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
        if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
@@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" {
            bytes_index += cp_index;
        }
-        var iter = data.iterator(all_bytes.items);
+        var iter = graph.iterator(all_bytes.items);
        // Check.
        for (want.items) |want_gc| {
@@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" {
    }
 }
+test "Segmentation Word Iterator" {
+    const allocator = std.testing.allocator;
+    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
+    defer file.close();
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var input_stream = buf_reader.reader();
+    const wb = try WordBreak.init(allocator);
+    defer wb.deinit(allocator);
+    var buf: [4096]u8 = undefined;
+    var line_iter: IterRead = .{ .read = &input_stream };
+    while (try line_iter.next(&buf)) |raw| {
+        // Clean up.
+        var line = std.mem.trimLeft(u8, raw, "÷ ");
+        if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
+            line = line[0..final];
+        }
+        // Iterate over fields.
+        var want = std.ArrayList(Grapheme).init(allocator);
+        defer want.deinit();
+        var all_bytes = std.ArrayList(u8).init(allocator);
+        defer all_bytes.deinit();
+        var words = std.mem.splitSequence(u8, line, " ÷ ");
+        var bytes_index: u32 = 0;
+        while (words.next()) |field| {
+            var code_points = std.mem.splitScalar(u8, field, ' ');
+            var cp_buf: [4]u8 = undefined;
+            var cp_index: u32 = 0;
+            var gc_len: u8 = 0;
+            while (code_points.next()) |code_point| {
+                if (std.mem.eql(u8, code_point, "×")) continue;
+                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
+                const len = try unicode.utf8Encode(cp, &cp_buf);
+                try all_bytes.appendSlice(cp_buf[0..len]);
+                cp_index += len;
+                gc_len += len;
+            }
+            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
+            bytes_index += cp_index;
+        }
+        var iter = wb.iterator(all_bytes.items);
+        // Check.
+        for (want.items, 1..) |want_word, i| {
+            const got_word = (iter.next()).?;
+            std.testing.expectEqualSlices(
+                u8,
+                want_word.bytes(all_bytes.items),
+                got_word.bytes(all_bytes.items),
+            ) catch |err| {
+                debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
+                return err;
+            };
+        }
+    }
+}
 const IterRead = struct {
    read: *Reader,
    line: usize = 0,
@@ -235,8 +272,9 @@ const debug = std.debug;
 const testing = std.testing;
 const unicode = std.unicode;
-const grapheme = @import("Graphemes");
 const Grapheme = @import("Graphemes").Grapheme;
 const Graphemes = @import("Graphemes");
 const GraphemeIterator = @import("Graphemes").Iterator;
 const Normalize = @import("Normalize");
+const WordBreak = @import("WordBreak");
author	Sam Atman	2025-05-12 15:22:37 -0400
committer	Sam Atman	2025-05-15 15:31:16 -0400
commit	cf8d8fe5d640511f6c4134fdaa36e930232ca7da (patch)
tree	410a3c5195ea0780b637f740ebcb6e80e63db09c
parent	Implement Word iterator (diff)
download	zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.gz zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.xz zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.zip