From a832b90bc00503a201bf457df1f49dc338314e7b Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 12:57:49 -0400 Subject: Implement Word iterator A by-the-book implmentation of the word break rules from tr29. This is superficially inefficient, but compilers are more than able to handle the common subexpression folding ignored by this approach. Now to port the WordBreakPropertyTests, and clean up the inevitable bugs in the implementation. --- src/WordBreak.zig | 228 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) (limited to 'src/WordBreak.zig') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 9044740..84fd1f7 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { allocator.free(wordbreak.s2); } +/// Represents a Unicode word span, as an offset into the source string +/// and the length of the word. +pub const Word = struct { + offset: u32, + len: u32, + + /// Returns a slice of the word given the source string. + pub fn bytes(self: Word, src: []const u8) []const u8 { + return src[self.offset..][0..self.len]; + } +}; + /// Returns the word break property type for `cp`. pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } +const IterState = packed struct { + mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter + mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric + quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter + regional: bool, // [^RI] (RI RI)* RI × RI + + pub const initial: IterState = .{ + .mid_punct = false, + .mid_num = false, + .quote_heb = false, + .regional = false, + }; +}; + +pub const Iterator = struct { + this: ?CodePoint = null, + that: ?CodePoint = null, + cache: ?WordBreakProperty = null, + cp_iter: CodepointIterator, + wb: *const WordBreak, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; + wb_iter.advance(); + return wb; + } + + pub fn next(iter: *Iterator) ?Word { + iter.advance(); + + // Done? + if (iter.this == null) return null; + // Last? + if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; + + const word_start = iter.this.?.offset; + var word_len: u32 = 0; + + var state: IterState = .initial; + + scan: while (true) : (iter.advance()) { + const this = iter.this.?; + word_len += this.len; + if (iter.that) |that| { + const that_p = iter.wb.breakProperty(that.code); + const this_p = this_p: { + if (!isIgnorable(that_p) and iter.cache != null) { + defer iter.cache = null; + break :this_p iter.cache.?; + } else { + break :this_p iter.wb.breakProperty(this.code); + } + }; + // WB3 CR × LF + if (this_p == .CR and that_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(this_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(that_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + // The right way to do this one is a RuneSet, TODO: circle back + // WB3d WSegSpace × WSegSpace + if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(that_p)) { + if (iter.cache == null) { + iter.cache = this_p; + } + continue :scan; + } + if (isAHLetter(this_p)) { + // WB5 AHLetter × AHLetter + if (isAHLetter(that_p)) continue :scan; + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isMidVal(that_p)) { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (isAHLetter(next_p)) { + state.mid_punct = true; + continue :scan; + } + } else break :scan; + } + } + // AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (state.mid_punct) { + // Should always be true: + assert(isMidVal(this_p)); + assert(isAHLetter(that_p)); + state.mid_punct = false; + continue :scan; + } + if (this_p == .Hebrew_Letter) { + // WB7a Hebrew_Letter × Single_Quote + if (that_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (that_p == .Double_Quote) { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (next_p == .Hebrew_Letter) { + state.quote_heb = true; + continue :scan; + } + } else break :scan; + } + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (state.quote_heb) { + // Should always be true: + assert(this_p == .Double_Quote); + assert(that_p == .Hebrew_Letter); + state.quote_heb = false; + continue :scan; + } + // WB8 Numeric × Numeric + if (this_p == .Numeric and that_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (this_p == .Numeric and isMidVal(that_p)) { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (next_p == .Numeric) { + state.mid_num = true; + continue :scan; + } + } else break :scan; + } + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (state.mid_num) { + assert(isMidVal(this_p)); + assert(that_p == .Numeric); + state.mid_num = false; + continue :scan; + } + // WB13 Katakana × Katakana + if (this_p == .Katakana and that_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI + if (that_p == .Regional_Indicator) { + if (this_p == .Regional_Indicator) { + if (state.regional) { + state.regional = false; + continue :scan; + } else { + break :scan; + } + } else { + const next_val = iter.cp_iter.peek(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProperty(next_cp.code); + if (next_p == .Regional_Indicator) { + state.regional = true; + continue :scan; + } + } else break :scan; + } + } + // WB999 Any ÷ Any + break :scan; + } else { // iter.that == null + break :scan; + } + } + + return Word{ .len = word_len, .offset = word_start }; + } + + fn advance(wb_iter: *Iterator) void { + wb_iter.this = wb_iter.that; + wb_iter.that = wb_iter.cp_iter.next(); + } +}; + +//| Predicates + +inline fn isNewline(wbp: WordBreakProperty) bool { + return wbp == .CR or wbp == .LF or wbp == .Newline; +} + +inline fn isIgnorable(wbp: WordBreakProperty) bool { + return switch (wbp) { + .Format, .Extend, .ZWJ => true, + else => false, + }; +} + +inline fn isAHLetter(wbp: WordBreakProperty) bool { + return wbp == .ALetter or wbp == .Hebrew_Letter; +} + +inline fn isMidVal(wbp: WordBreakProperty) bool { + return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isExtensible(wbp: WordBreakProperty) bool { + return switch (wbp) { + .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, + else => false, + }; +} + test "Word Break Properties" { const wb = try WordBreak.init(testing.allocator); defer wb.deinit(testing.allocator); @@ -99,4 +322,9 @@ const builtin = @import("builtin"); const compress = std.compress; const mem = std.mem; const Allocator = mem.Allocator; +const assert = std.debug.assert; const testing = std.testing; + +const code_point = @import("code_point"); +const CodepointIterator = code_point.Iterator; +const CodePoint = code_point.CodePoint; -- cgit v1.2.3