From a832b90bc00503a201bf457df1f49dc338314e7b Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Mon, 12 May 2025 12:57:49 -0400
Subject: Implement Word iterator

A by-the-book implmentation of the word break rules from tr29.  This is
superficially inefficient, but compilers are more than able to handle
the common subexpression folding ignored by this approach.

Now to port the WordBreakPropertyTests, and clean up the inevitable bugs
in the implementation.
---
 src/WordBreak.zig | 228 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)

(limited to 'src/WordBreak.zig')

diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 9044740..84fd1f7 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
     allocator.free(wordbreak.s2);
 }
 
+/// Represents a Unicode word span, as an offset into the source string
+/// and the length of the word.
+pub const Word = struct {
+    offset: u32,
+    len: u32,
+
+    /// Returns a slice of the word given the source string.
+    pub fn bytes(self: Word, src: []const u8) []const u8 {
+        return src[self.offset..][0..self.len];
+    }
+};
+
 /// Returns the word break property type for `cp`.
 pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
     return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
 
+const IterState = packed struct {
+    mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
+    mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
+    quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
+    regional: bool, // [^RI] (RI RI)* RI × RI
+
+    pub const initial: IterState = .{
+        .mid_punct = false,
+        .mid_num = false,
+        .quote_heb = false,
+        .regional = false,
+    };
+};
+
+pub const Iterator = struct {
+    this: ?CodePoint = null,
+    that: ?CodePoint = null,
+    cache: ?WordBreakProperty = null,
+    cp_iter: CodepointIterator,
+    wb: *const WordBreak,
+
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
+        var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
+        wb_iter.advance();
+        return wb;
+    }
+
+    pub fn next(iter: *Iterator) ?Word {
+        iter.advance();
+
+        // Done?
+        if (iter.this == null) return null;
+        // Last?
+        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
+
+        const word_start = iter.this.?.offset;
+        var word_len: u32 = 0;
+
+        var state: IterState = .initial;
+
+        scan: while (true) : (iter.advance()) {
+            const this = iter.this.?;
+            word_len += this.len;
+            if (iter.that) |that| {
+                const that_p = iter.wb.breakProperty(that.code);
+                const this_p = this_p: {
+                    if (!isIgnorable(that_p) and iter.cache != null) {
+                        defer iter.cache = null;
+                        break :this_p iter.cache.?;
+                    } else {
+                        break :this_p iter.wb.breakProperty(this.code);
+                    }
+                };
+                // WB3  CR × LF
+                if (this_p == .CR and that_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(this_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(that_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                // The right way to do this one is a RuneSet, TODO: circle back
+                // WB3d  WSegSpace × WSegSpace
+                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(that_p)) {
+                    if (iter.cache == null) {
+                        iter.cache = this_p;
+                    }
+                    continue :scan;
+                }
+                if (isAHLetter(this_p)) {
+                    // WB5  AHLetter × AHLetter
+                    if (isAHLetter(that_p)) continue :scan;
+                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                    if (isMidVal(that_p)) {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (isAHLetter(next_p)) {
+                                state.mid_punct = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (state.mid_punct) {
+                    // Should always be true:
+                    assert(isMidVal(this_p));
+                    assert(isAHLetter(that_p));
+                    state.mid_punct = false;
+                    continue :scan;
+                }
+                if (this_p == .Hebrew_Letter) {
+                    // WB7a  Hebrew_Letter × Single_Quote
+                    if (that_p == .Single_Quote) continue :scan;
+                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                    if (that_p == .Double_Quote) {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (next_p == .Hebrew_Letter) {
+                                state.quote_heb = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (state.quote_heb) {
+                    // Should always be true:
+                    assert(this_p == .Double_Quote);
+                    assert(that_p == .Hebrew_Letter);
+                    state.quote_heb = false;
+                    continue :scan;
+                }
+                // WB8  Numeric × Numeric
+                if (this_p == .Numeric and that_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (this_p == .Numeric and isMidVal(that_p)) {
+                    const next_val = iter.cp_iter.peek();
+                    if (next_val) |next_cp| {
+                        const next_p = iter.wb.breakProperty(next_cp.code);
+                        if (next_p == .Numeric) {
+                            state.mid_num = true;
+                            continue :scan;
+                        }
+                    } else break :scan;
+                }
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (state.mid_num) {
+                    assert(isMidVal(this_p));
+                    assert(that_p == .Numeric);
+                    state.mid_num = false;
+                    continue :scan;
+                }
+                // WB13  Katakana × Katakana
+                if (this_p == .Katakana and that_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
+                // WB15, WB16  ([^RI] ! sot) (RI RI)* RI × RI
+                if (that_p == .Regional_Indicator) {
+                    if (this_p == .Regional_Indicator) {
+                        if (state.regional) {
+                            state.regional = false;
+                            continue :scan;
+                        } else {
+                            break :scan;
+                        }
+                    } else {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (next_p == .Regional_Indicator) {
+                                state.regional = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            } else { // iter.that == null
+                break :scan;
+            }
+        }
+
+        return Word{ .len = word_len, .offset = word_start };
+    }
+
+    fn advance(wb_iter: *Iterator) void {
+        wb_iter.this = wb_iter.that;
+        wb_iter.that = wb_iter.cp_iter.next();
+    }
+};
+
+//| Predicates
+
+inline fn isNewline(wbp: WordBreakProperty) bool {
+    return wbp == .CR or wbp == .LF or wbp == .Newline;
+}
+
+inline fn isIgnorable(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .Format, .Extend, .ZWJ => true,
+        else => false,
+    };
+}
+
+inline fn isAHLetter(wbp: WordBreakProperty) bool {
+    return wbp == .ALetter or wbp == .Hebrew_Letter;
+}
+
+inline fn isMidVal(wbp: WordBreakProperty) bool {
+    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+
+inline fn isExtensible(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
+        else => false,
+    };
+}
+
 test "Word Break Properties" {
     const wb = try WordBreak.init(testing.allocator);
     defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
 const compress = std.compress;
 const mem = std.mem;
 const Allocator = mem.Allocator;
+const assert = std.debug.assert;
 const testing = std.testing;
+
+const code_point = @import("code_point");
+const CodepointIterator = code_point.Iterator;
+const CodePoint = code_point.CodePoint;
-- 
cgit v1.2.3