From a7f6990a8d433c6c8d34892a2126e94cdb31541f Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 18:10:02 -0400 Subject: Rewrite, passes WordBreakTest After fixing a bug in Runicode which was fenceposting codepoints off the end of ranges. As one does. --- src/WordBreak.zig | 111 +++++++++++++++++--------------------------------- src/micro_runeset.zig | 4 +- src/unicode_tests.zig | 3 +- 3 files changed, 40 insertions(+), 78 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 53db76b..a2be011 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -132,28 +132,21 @@ pub const Iterator = struct { const word_start = iter.this.?.offset; var word_len: u32 = 0; - var state: IterState = .initial; + // state variables + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + var ri_count: usize = 0; scan: while (true) : (iter.advance()) { const this = iter.this.?; word_len += this.len; - var ignored = false; if (iter.that) |that| { + const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF const that_p = iter.wb.breakProperty(that.code); - const this_p = this_p: { - if (!isIgnorable(that_p) and iter.cache != null) { - // TODO: might not need these what with peekPast - ignored = true; - defer iter.cache = null; - // Fixup some state, apply pre-4 rules - const restore = iter.cache.?; - if (restore == .WSegSpace) break :this_p .none; - break :this_p restore; - } else { - break :this_p iter.wb.breakProperty(this.code); - } - }; - // WB3 CR × LF + if (!isIgnorable(this_p)) { + last_last_p = last_p; + last_p = this_p; + } if (this_p == .CR and that_p == .LF) continue :scan; // WB3a (Newline | CR | LF) ÷ if (isNewline(this_p)) break :scan; @@ -161,27 +154,15 @@ pub const Iterator = struct { if (isNewline(that_p)) break :scan; // WB3c ZWJ × \p{Extended_Pictographic} if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { - // Invalid after ignoring - if (ignored) break :scan else continue :scan; + continue :scan; } // WB3d WSegSpace × WSegSpace if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X if (isIgnorable(that_p)) { - if (that_p == .ZWJ) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { - continue :scan; - } - } - } - if (iter.cache == null) { - iter.cache = this_p; - } continue :scan; - } - if (isAHLetter(this_p)) { + } // Now we use last_p instead of this_p for ignorable's sake + if (isAHLetter(last_p)) { // WB5 AHLetter × AHLetter if (isAHLetter(that_p)) continue :scan; // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter @@ -190,21 +171,16 @@ pub const Iterator = struct { if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (isAHLetter(next_p)) { - state.mid_punct = true; continue :scan; } } } } - // AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (state.mid_punct) { - // Should always be true: - assert(isMidVal(this_p)); - assert(isAHLetter(that_p)); - state.mid_punct = false; + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { continue :scan; } - if (this_p == .Hebrew_Letter) { + if (last_p == .Hebrew_Letter) { // WB7a Hebrew_Letter × Single_Quote if (that_p == .Single_Quote) continue :scan; // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter @@ -213,62 +189,44 @@ pub const Iterator = struct { if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Hebrew_Letter) { - state.quote_heb = true; continue :scan; } - } else break :scan; + } } } // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - if (state.quote_heb) { - // Should always be true: - assert(this_p == .Double_Quote); - assert(that_p == .Hebrew_Letter); - state.quote_heb = false; + if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) continue :scan; - } // WB8 Numeric × Numeric - if (this_p == .Numeric and that_p == .Numeric) continue :scan; + if (last_p == .Numeric and that_p == .Numeric) continue :scan; // WB9 AHLetter × Numeric - if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; + if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; // WB10 Numeric × AHLetter - if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; + if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) + continue :scan; // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (this_p == .Numeric and isMidNum(that_p)) { + if (last_p == .Numeric and isMidNum(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Numeric) { - state.mid_num = true; continue :scan; } - } else break :scan; - } - // WB11 Numeric (MidNum | MidNumLetQ) × Numeric - if (state.mid_num) { - assert(isMidNum(this_p)); - assert(that_p == .Numeric); - state.mid_num = false; - continue :scan; + } } // WB13 Katakana × Katakana - if (this_p == .Katakana and that_p == .Katakana) continue :scan; + if (last_p == .Katakana and that_p == .Katakana) continue :scan; // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet - if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; + if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) - if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - if (this_p == .Regional_Indicator) { - if (that_p == .Regional_Indicator) { - if (state.regional == true or this.offset == 0) { - state.regional = false; - continue :scan; - } - } else { - state.regional = true; - } - } else if (that_p == .Regional_Indicator) { - state.regional = true; + const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; + if (maybe_flag) { + ri_count += 1; + if (ri_count % 2 == 1) continue :scan; } // WB999 Any ÷ Any break :scan; @@ -337,6 +295,11 @@ test "Word Break Properties" { try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); } +test "ext_pic" { + try testing.expect(ext_pict.isMatch("👇")); + try testing.expect(ext_pict.isMatch("\u{2704}")); +} + fn testAllocations(allocator: Allocator) !void { const wb = try WordBreak.init(allocator); wb.deinit(allocator); diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig index 34fbcd3..80ce4bf 100644 --- a/src/micro_runeset.zig +++ b/src/micro_runeset.zig @@ -9,7 +9,7 @@ //! The RuneSet is borrowed from Runicode, which encodes Unicode things //! in RuneSet form. This will need updating for each version of Unicode. -pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; +pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } }; // Meaningful names for the T1 slots const LOW = 0; @@ -27,7 +27,7 @@ pub const RuneSet = struct { const set = runeset.body; const a = codeunit(str[0]); switch (a.kind) { - .follow => return false, + .follow => unreachable, .low => { const mask = toMask(set[LOW]); if (mask.isIn(a)) diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7ce2b4e..59f0c6f 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -228,8 +228,7 @@ test "Segmentation Word Iterator" { // Check. for (want.items, 1..) |want_word, i| { const got_word = (iter.next()).?; - std.testing.expectEqualSlices( - u8, + std.testing.expectEqualStrings( want_word.bytes(all_bytes.items), got_word.bytes(all_bytes.items), ) catch |err| { -- cgit v1.2.3