From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Mon, 12 May 2025 15:22:37 -0400
Subject: Begin conformance test

I'm not sure the details of this strategy can actually be made to work. But, something can.
---
 src/Graphemes.zig     |  22 ++++
 src/WordBreak.zig     |  83 +++++++++++++-------
 src/code_point.zig    |   5 ++
 src/micro_runeset.zig | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/unicode_tests.zig | 102 +++++++++++++++++--------
 5 files changed, 361 insertions(+), 58 deletions(-)
 create mode 100644 src/micro_runeset.zig
(limited to 'src')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1ce1ea6..1f67fc6 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
         try std.testing.expectEqual(@as(usize, 2), i);
     }
 }
+
+test "Iterator.peek" {
+    const peek_seq = "aΔ👨🏻‍🌾→";
+    const data = try Graphemes.init(std.testing.allocator);
+    defer data.deinit(std.testing.allocator);
+
+    var iter = data.iterator(peek_seq);
+    const peek_a = iter.peek().?;
+    const next_a = iter.next().?;
+    try std.testing.expectEqual(peek_a, next_a);
+    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
+    const peek_d1 = iter.peek().?;
+    const peek_d2 = iter.peek().?;
+    try std.testing.expectEqual(peek_d1, peek_d2);
+    const next_d = iter.next().?;
+    try std.testing.expectEqual(peek_d2, next_d);
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+}
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 84fd1f7..53db76b 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
     return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
 
+/// Returns an iterator over words in `slice`
+pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
+    return Iterator.init(wordbreak, slice);
+}
+
 const IterState = packed struct {
     mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
     mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
@@ -113,7 +118,7 @@ pub const Iterator = struct {
     pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
         var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
         wb_iter.advance();
-        return wb;
+        return wb_iter;
     }
 
     pub fn next(iter: *Iterator) ?Word {
@@ -132,12 +137,18 @@ pub const Iterator = struct {
         scan: while (true) : (iter.advance()) {
             const this = iter.this.?;
             word_len += this.len;
+            var ignored = false;
             if (iter.that) |that| {
                 const that_p = iter.wb.breakProperty(that.code);
                 const this_p = this_p: {
                     if (!isIgnorable(that_p) and iter.cache != null) {
+                        // TODO: might not need these what with peekPast
+                        ignored = true;
                         defer iter.cache = null;
-                        break :this_p iter.cache.?;
+                        // Fixup some state, apply pre-4 rules
+                        const restore = iter.cache.?;
+                        if (restore == .WSegSpace) break :this_p .none;
+                        break :this_p restore;
                     } else {
                         break :this_p iter.wb.breakProperty(this.code);
                     }
@@ -149,11 +160,22 @@ pub const Iterator = struct {
                 // WB3b ÷ (Newline | CR | LF)
                 if (isNewline(that_p)) break :scan;
                 // WB3c ZWJ × \p{Extended_Pictographic}
-                // The right way to do this one is a RuneSet, TODO: circle back
+                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
+                    // Invalid after ignoring
+                    if (ignored) break :scan else continue :scan;
+                }
                 // WB3d WSegSpace × WSegSpace
                 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
                 // WB4 X (Extend | Format | ZWJ)* → X
                 if (isIgnorable(that_p)) {
+                    if (that_p == .ZWJ) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
+                                continue :scan;
+                            }
+                        }
+                    }
                     if (iter.cache == null) {
                         iter.cache = this_p;
                     }
@@ -164,14 +186,14 @@ pub const Iterator = struct {
                     if (isAHLetter(that_p)) continue :scan;
                     // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
                     if (isMidVal(that_p)) {
-                        const next_val = iter.cp_iter.peek();
+                        const next_val = iter.peekPast();
                         if (next_val) |next_cp| {
                             const next_p = iter.wb.breakProperty(next_cp.code);
                             if (isAHLetter(next_p)) {
                                 state.mid_punct = true;
                                 continue :scan;
                             }
-                        } else break :scan;
+                        }
                     }
                 }
                 // AHLetter (MidLetter | MidNumLetQ) × AHLetter
@@ -187,7 +209,7 @@ pub const Iterator = struct {
                     if (that_p == .Single_Quote) continue :scan;
                     // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
                     if (that_p == .Double_Quote) {
-                        const next_val = iter.cp_iter.peek();
+                        const next_val = iter.peekPast();
                         if (next_val) |next_cp| {
                             const next_p = iter.wb.breakProperty(next_cp.code);
                             if (next_p == .Hebrew_Letter) {
@@ -212,8 +234,8 @@ pub const Iterator = struct {
                 // WB10 Numeric × AHLetter
                 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
                 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
-                if (this_p == .Numeric and isMidVal(that_p)) {
-                    const next_val = iter.cp_iter.peek();
+                if (this_p == .Numeric and isMidNum(that_p)) {
+                    const next_val = iter.peekPast();
                     if (next_val) |next_cp| {
                         const next_p = iter.wb.breakProperty(next_cp.code);
                         if (next_p == .Numeric) {
@@ -224,7 +246,7 @@ pub const Iterator = struct {
                 }
                 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
                 if (state.mid_num) {
-                    assert(isMidVal(this_p));
+                    assert(isMidNum(this_p));
                     assert(that_p == .Numeric);
                     state.mid_num = false;
                     continue :scan;
@@ -235,25 +257,18 @@ pub const Iterator = struct {
                 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
                 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
                 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
-                // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI
+                // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
+                if (this_p == .Regional_Indicator) {
+                    if (that_p == .Regional_Indicator) {
+                        if (state.regional == true or this.offset == 0) {
                             state.regional = false;
                             continue :scan;
-                        } else {
-                            break :scan;
                         }
                     } else {
-                        const next_val = iter.cp_iter.peek();
-                        if (next_val) |next_cp| {
-                            const next_p = iter.wb.breakProperty(next_cp.code);
-                            if (next_p == .Regional_Indicator) {
-                                state.regional = true;
-                                continue :scan;
-                            }
-                        } else break :scan;
+                        state.regional = true;
                     }
+                } else if (that_p == .Regional_Indicator) {
+                    state.regional = true;
                 }
                 // WB999 Any ÷ Any
                 break :scan;
@@ -265,9 +280,19 @@ pub const Iterator = struct {
         return Word{ .len = word_len, .offset = word_start };
     }
 
-    fn advance(wb_iter: *Iterator) void {
-        wb_iter.this = wb_iter.that;
-        wb_iter.that = wb_iter.cp_iter.next();
+    fn advance(iter: *Iterator) void {
+        iter.this = iter.that;
+        iter.that = iter.cp_iter.next();
+    }
+
+    fn peekPast(iter: *Iterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked;
+            _ = iter.cp_iter.next();
+        }
+        return null;
     }
 };
 
@@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool {
     return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
 }
 
+inline fn isMidNum(wbp: WordBreakProperty) bool {
+    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+
 inline fn isExtensible(wbp: WordBreakProperty) bool {
     return switch (wbp) {
         .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
@@ -328,3 +357,5 @@ const testing = std.testing;
 const code_point = @import("code_point");
 const CodepointIterator = code_point.Iterator;
 const CodePoint = code_point.CodePoint;
+
+const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index b3c9aea..79ee5cd 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -10,6 +10,11 @@ pub const CodePoint = struct {
     code: u21,
     len: u3,
     offset: u32,
+
+    /// Return the slice of this codepoint, given the original string.
+    pub fn bytes(cp: CodePoint, str: []const u8) []const u8 {
+        return str[cp.offset..][0..cp.len];
+    }
 };
 
 /// This function is deprecated and will be removed in a later release.
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..34fbcd3
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
+//! Minimal RuneSet implementation
+//!
+//! This is copied from the full RuneSet module, so that `zg` doesn't
+//! depend on it. There's one spot in the WordBreak algorithm which
+//! needs to identify the emoji Extended_Pictographic property, which
+//! is not otherwise used in ZG. The Runeset is 89 words, while the
+//! trie lookup used throughout ZG would be much larger.
+//!
+//! The RuneSet is borrowed from Runicode, which encodes Unicode things
+//! in RuneSet form. This will need updating for each version of Unicode.
+
+pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } };
+
+// Meaningful names for the T1 slots
+const LOW = 0;
+const HI = 1;
+const LEAD = 2;
+const T4_OFF = 3;
+
+/// Minimum Viable Runeset. Must be statically created, strictly boolean matching.
+pub const RuneSet = struct {
+    body: []const u64,
+
+    // Returns whether the slice is a match. This assumes the validity of the
+    // string, which can be ensured by, in particular, deriving it from a CodePoint.
+    pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
+        const set = runeset.body;
+        const a = codeunit(str[0]);
+        switch (a.kind) {
+            .follow => return false,
+            .low => {
+                const mask = toMask(set[LOW]);
+                if (mask.isIn(a))
+                    return true
+                else
+                    return false;
+            },
+            .hi => {
+                const mask = toMask(set[HI]);
+                if (mask.isIn(a))
+                    return true
+                else
+                    return false;
+            },
+            .lead => {
+                const nB = a.nMultiBytes().?;
+                const a_mask = toMask(set[LEAD]);
+                if (!a_mask.isIn(a)) return false;
+                const b = codeunit(str[1]);
+                const b_loc = 4 + a_mask.lowerThan(a).?;
+                const b_mask = toMask(set[b_loc]);
+                if (!b_mask.isIn(b)) return false;
+                if (nB == 2) return true;
+                const t3_off = 4 + @popCount(set[LEAD]);
+                const c = codeunit(str[2]);
+                // Slice is safe because we know the T2 span has at least one word.
+                const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]);
+                const c_loc = t3_off + c_off;
+                const c_mask = toMask(set[c_loc]);
+                if (!c_mask.isIn(c)) return false;
+                if (nB == 3) return true;
+                const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]);
+                const d_loc = set[T4_OFF] + d_off;
+                const d = codeunit(str[3]);
+                const d_mask = toMask(set[d_loc]);
+                if (d_mask.isIn(d)) return true else return false;
+            },
+        }
+    }
+};
+
+/// Kinds of most significant bits in UTF-8
+const RuneKind = enum(u2) {
+    low,
+    hi,
+    follow,
+    lead,
+};
+
+/// Packed `u8` struct representing one codeunit of UTF-8.
+const CodeUnit = packed struct(u8) {
+    body: u6,
+    kind: RuneKind,
+
+    /// Mask to check presence
+    pub inline fn inMask(self: *const CodeUnit) u64 {
+        return @as(u64, 1) << self.body;
+    }
+
+    // TODO consider an nMultiBytesFast, for the cases where we
+    // know that invalid lead bytes are never present (such as in set)
+    // operations, where we may assume that (and will assert that) the
+    // LEAD mask contains no such bytes.
+
+    /// Number of bytes in known multi-byte rune.
+    ///
+    /// Caller guarantees that the CodeUnit is a lead byte
+    /// of a multi-byte rune: `cu.kind == .lead`.
+    ///
+    /// Invalid lead bytes will return null.
+    pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
+        return switch (self.body) {
+            // 0 and 1 are invalid for overlong reasons,
+            // but RuneSet supports overlong encodings
+            0...31 => 2,
+            32...47 => 3,
+            48...55 => 4,
+            // Wasted space 56...61 is due entirely to Microsoft's
+            // lack of vision and insistence on a substandard
+            // and utterly inadequate encoding for Unicode
+            // "64k should be enough for anyone"
+            56...63 => null,
+        };
+    }
+
+    /// Given a valid lead byte, return the number of bytes that should
+    /// make up the code unit sequence. Will return `null` if the lead
+    /// byte is invalid.
+    pub inline fn nBytes(self: *const CodeUnit) ?u8 {
+        switch (self.kind) {
+            .low, .hi => return 1,
+            .lead => return self.nMultiBytes(),
+            .follow => return null,
+        }
+    }
+
+    /// Mask off all bits >= cu.body
+    pub inline fn hiMask(self: *const CodeUnit) u64 {
+        return (@as(u64, 1) << self.body) - 1;
+    }
+
+    /// Mask off all bits <= cu.body
+    pub inline fn lowMask(self: *const CodeUnit) u64 {
+        if (self.body == 63)
+            return 0
+        else
+            return ~((@as(u64, 1) << (self.body + 1)) - 1);
+    }
+
+    /// Cast the `CodeUnit` to its backing `u8`.
+    pub inline fn byte(self: *const CodeUnit) u8 {
+        return @bitCast(self.*);
+    }
+};
+
+/// Cast raw byte to CodeUnit
+inline fn codeunit(b: u8) CodeUnit {
+    return @bitCast(b);
+}
+
+inline fn toMask(w: u64) Mask {
+    return Mask.toMask(w);
+}
+
+/// Bitmask for runesets
+///
+/// We define our own bitset, because the operations we need to
+/// perform only overlap with IntegerBitSet for trivial one-liners,
+/// and furthermore, we need nondestructive versions of the basic
+/// operations, which aren't a part of the IntegerBitSet interface.
+///
+/// Note that Masks do not track which kind of byte they apply to,
+/// since they will be stored as ordinary u64s. User code must
+/// ensure that CodeUnits tested against a Mask are of the appropriate
+/// type, and otherwise valid for the test performed.
+///
+const Mask = struct {
+    m: u64,
+
+    pub fn toMask(w: u64) Mask {
+        return Mask{ .m = w };
+    }
+
+    /// Test if a CodeUnit's low bytes are present in mask
+    pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
+        return self.m | cu.inMask() == self.m;
+    }
+
+    /// Return number of bytes lower than cu.body in mask,
+    /// if cu inhabits the mask. Otherwise return null.
+    pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (self.isIn(cu)) {
+            const m = cu.hiMask();
+            return @popCount(self.m & m);
+        } else {
+            return null;
+        }
+    }
+
+    /// Return number of bytes higher than cu.body in mask,
+    /// if cu inhabits the mask. Otherwise return null.
+    pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (self.isIn(cu)) {
+            const m = cu.lowMask();
+            return @popCount(self.m & m);
+        } else {
+            return null;
+        }
+    }
+};
+
+/// Sum of @popCount of all words in region.
+inline fn popCountSlice(region: []const u64) usize {
+    var ct: usize = 0;
+    for (region) |w| ct += @popCount(w);
+    return ct;
+}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ee259a3..7ce2b4e 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,31 +1,5 @@
 const dbg_print = false;
 
-comptime {
-    testing.refAllDecls(grapheme);
-}
-
-test "Iterator.peek" {
-    const peek_seq = "aΔ👨🏻‍🌾→";
-    const data = try Graphemes.init(std.testing.allocator);
-    defer data.deinit(std.testing.allocator);
-
-    var iter = data.iterator(peek_seq);
-    const peek_a = iter.peek().?;
-    const next_a = iter.next().?;
-    try std.testing.expectEqual(peek_a, next_a);
-    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
-    const peek_d1 = iter.peek().?;
-    const peek_d2 = iter.peek().?;
-    try std.testing.expectEqual(peek_d1, peek_d2);
-    const next_d = iter.next().?;
-    try std.testing.expectEqual(peek_d2, next_d);
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-}
-
 test "Unicode normalization tests" {
     var arena = heap.ArenaAllocator.init(testing.allocator);
     defer arena.deinit();
@@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" {
     var buf_reader = std.io.bufferedReader(file.reader());
     var input_stream = buf_reader.reader();
 
-    const data = try Graphemes.init(allocator);
-    defer data.deinit(allocator);
+    const graph = try Graphemes.init(allocator);
+    defer graph.deinit(allocator);
     var buf: [4096]u8 = undefined;
     var line_iter: IterRead = .{ .read = &input_stream };
     while (try line_iter.next(&buf)) |raw| {
-        // Skip comments or empty lines.
-        // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
         // Clean up.
         var line = std.mem.trimLeft(u8, raw, "÷ ");
         if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
             line = line[0..final];
         }
@@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" {
             bytes_index += cp_index;
         }
 
-        var iter = data.iterator(all_bytes.items);
+        var iter = graph.iterator(all_bytes.items);
 
         // Check.
         for (want.items) |want_gc| {
@@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" {
     }
 }
 
+test "Segmentation Word Iterator" {
+    const allocator = std.testing.allocator;
+    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
+    defer file.close();
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var input_stream = buf_reader.reader();
+
+    const wb = try WordBreak.init(allocator);
+    defer wb.deinit(allocator);
+
+    var buf: [4096]u8 = undefined;
+    var line_iter: IterRead = .{ .read = &input_stream };
+
+    while (try line_iter.next(&buf)) |raw| {
+        // Clean up.
+        var line = std.mem.trimLeft(u8, raw, "÷ ");
+        if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
+            line = line[0..final];
+        }
+        // Iterate over fields.
+        var want = std.ArrayList(Grapheme).init(allocator);
+        defer want.deinit();
+
+        var all_bytes = std.ArrayList(u8).init(allocator);
+        defer all_bytes.deinit();
+
+        var words = std.mem.splitSequence(u8, line, " ÷ ");
+        var bytes_index: u32 = 0;
+
+        while (words.next()) |field| {
+            var code_points = std.mem.splitScalar(u8, field, ' ');
+            var cp_buf: [4]u8 = undefined;
+            var cp_index: u32 = 0;
+            var gc_len: u8 = 0;
+
+            while (code_points.next()) |code_point| {
+                if (std.mem.eql(u8, code_point, "×")) continue;
+                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
+                const len = try unicode.utf8Encode(cp, &cp_buf);
+                try all_bytes.appendSlice(cp_buf[0..len]);
+                cp_index += len;
+                gc_len += len;
+            }
+
+            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
+            bytes_index += cp_index;
+        }
+
+        var iter = wb.iterator(all_bytes.items);
+
+        // Check.
+        for (want.items, 1..) |want_word, i| {
+            const got_word = (iter.next()).?;
+            std.testing.expectEqualSlices(
+                u8,
+                want_word.bytes(all_bytes.items),
+                got_word.bytes(all_bytes.items),
+            ) catch |err| {
+                debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
+                return err;
+            };
+        }
+    }
+}
+
 const IterRead = struct {
     read: *Reader,
     line: usize = 0,
@@ -235,8 +272,9 @@ const debug = std.debug;
 const testing = std.testing;
 const unicode = std.unicode;
 
-const grapheme = @import("Graphemes");
 const Grapheme = @import("Graphemes").Grapheme;
 const Graphemes = @import("Graphemes");
 const GraphemeIterator = @import("Graphemes").Iterator;
 const Normalize = @import("Normalize");
+
+const WordBreak = @import("WordBreak");
-- 
cgit v1.2.3
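
Reviewer note (not part of the patch): a minimal usage sketch of the word iterator this change introduces. It only leans on API visible in the diff above — WordBreak.init/deinit, the new iterator() entry point, Word's offset/len fields, and Word.bytes as exercised by the new test — and it assumes the "WordBreak" module is wired into build.zig the same way src/unicode_tests.zig imports it. The test name and sample string are invented for illustration.

const std = @import("std");
const WordBreak = @import("WordBreak");

test "word iterator usage sketch" {
    const wb = try WordBreak.init(std.testing.allocator);
    defer wb.deinit(std.testing.allocator);

    const text = "Hello, word-break world!";
    var iter = wb.iterator(text);
    var count: usize = 0;
    while (iter.next()) |word| {
        // A Word records an offset and length into the original slice, so its
        // bytes can be recovered without copying.
        const bytes = word.bytes(text);
        try std.testing.expectEqualStrings(text[word.offset..][0..word.len], bytes);
        count += 1;
    }
    try std.testing.expect(count > 0);
}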
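
A second reviewer note, also outside the patch: the micro RuneSet's lookup rests on one trick, bitmask membership plus popcount ranking to index the next tier of words. The standalone toy test below (values invented, not taken from the Extended_Pictographic data) mirrors what Mask.isIn and Mask.lowerThan compute, as a sketch of why the 89-word table is enough for isMatch.

const std = @import("std");

test "bitmask membership and rank sketch" {
    // Toy mask with bits 3, 17, and 42 set, standing in for one RuneSet word.
    const mask: u64 = (1 << 3) | (1 << 17) | (1 << 42);

    // Membership, as in Mask.isIn: OR-ing the bit in changes nothing iff it is set.
    const bit: u64 = 1 << 17;
    try std.testing.expect((mask | bit) == mask);

    // Rank, as in Mask.lowerThan: the popcount of the strictly lower bits gives
    // the element's index, which RuneSet uses to locate the next tier's word.
    try std.testing.expect(@popCount(mask & (bit - 1)) == 1);
}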