From 49843d6dc6cde85fe670085b809d41db314ec47d Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 12:15:18 -0400 Subject: Add wordAtCursor This is not actually the way to do it, and can break on some crafted strings. The way to actually do it: implement a reverse word search iterator, then do next() to find a word break, prev() to find a _valid_ word start, then next() again to find the valid end of said word. Maybe 2+, 2-, 1+ actually. I can probably write a test to see if the cursor spot is ambiguous, and apply an extra round if so. Need to mull the rules over before making any rash moves. --- src/WordBreak.zig | 148 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 100 insertions(+), 48 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index a2be011..f0da30d 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -42,30 +42,6 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { }; } -inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("wbp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - const stage_1_len: u16 = try reader.readInt(u16, endian); - wb.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(wb.s1); - for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - wb.s2 = try allocator.alloc(u5, stage_2_len); - errdefer allocator.free(wb.s2); - for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); - var count_0: usize = 0; - for (wb.s2) |nyb| { - if (nyb == 0) count_0 += 1; - } -} - pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { allocator.free(wordbreak.s1); allocator.free(wordbreak.s2); @@ -88,29 +64,48 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } +fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { + return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +} + +/// Returns the Word at the given index. Asserts that the index is valid for +/// the provided string. Always returns a word. This algorithm is O(n^2) for +/// the size of the word before the cursor. +pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { + assert(index < string.len); + var i = index; + var iter1 = wordbreak.iterator(string[index..]); + var last_word = iter1.next(); + if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; + var this_word: ?Word = undefined; + while (last_word) |l_word| : (i -= 1) { + var iter2 = wordbreak.iterator(string[i..]); + this_word = iter2.next(); + if (this_word) |t_word| { + if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); + if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); + // Check if we've captured a previous word + const t_end = t_word.offset + t_word.len + i - 1; + const l_start = l_word.offset + i + 1; + if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); + if (t_end < l_start) { + return .{ .offset = @intCast(l_start), .len = l_word.len }; + } + last_word = this_word; + } else unreachable; + if (i == 0) break; + } + return this_word.?; +} + /// Returns an iterator over words in `slice` pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } -const IterState = packed struct { - mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter - mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric - quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter - regional: bool, // [^RI] (RI RI)* RI × RI - - pub const initial: IterState = .{ - .mid_punct = false, - .mid_num = false, - .quote_heb = false, - .regional = false, - }; -}; - pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, - cache: ?WordBreakProperty = null, cp_iter: CodepointIterator, wb: *const WordBreak, @@ -121,6 +116,16 @@ pub const Iterator = struct { return wb_iter; } + /// Returns the next word segment, without advancing. + pub fn peek(iter: *Iterator) ?Word { + const cache = .{ iter.this, iter.that, iter.cp_iter }; + defer { + iter.this, iter.that, iter.cp_iter = cache; + } + return iter.next(); + } + + /// Returns the next word segment. pub fn next(iter: *Iterator) ?Word { iter.advance(); @@ -132,7 +137,7 @@ pub const Iterator = struct { const word_start = iter.this.?.offset; var word_len: u32 = 0; - // state variables + // State variables. var last_p: WordBreakProperty = .none; var last_last_p: WordBreakProperty = .none; var ri_count: usize = 0; @@ -141,12 +146,13 @@ pub const Iterator = struct { const this = iter.this.?; word_len += this.len; if (iter.that) |that| { - const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF - const that_p = iter.wb.breakProperty(that.code); + const this_p = iter.wb.breakProp(this); + const that_p = iter.wb.breakProp(that); if (!isIgnorable(this_p)) { last_last_p = last_p; last_p = this_p; } + // WB3 CR × LF if (this_p == .CR and that_p == .LF) continue :scan; // WB3a (Newline | CR | LF) ÷ if (isNewline(this_p)) break :scan; @@ -169,7 +175,7 @@ pub const Iterator = struct { if (isMidVal(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); + const next_p = iter.wb.breakProp(next_cp); if (isAHLetter(next_p)) { continue :scan; } @@ -187,7 +193,7 @@ pub const Iterator = struct { if (that_p == .Double_Quote) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); + const next_p = iter.wb.breakProp(next_cp); if (next_p == .Hebrew_Letter) { continue :scan; } @@ -210,7 +216,7 @@ pub const Iterator = struct { if (last_p == .Numeric and isMidNum(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProperty(next_cp.code); + const next_p = iter.wb.breakProp(next_cp); if (next_p == .Numeric) { continue :scan; } @@ -247,13 +253,37 @@ pub const Iterator = struct { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; _ = iter.cp_iter.next(); } return null; } }; +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + //| Predicates inline fn isNewline(wbp: WordBreakProperty) bool { @@ -293,11 +323,33 @@ test "Word Break Properties" { try testing.expectEqual(.LF, wb.breakProperty('\n')); try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); + var iter = wb.iterator("xxx"); + _ = iter.peek(); } -test "ext_pic" { +test "ext_pict" { try testing.expect(ext_pict.isMatch("👇")); - try testing.expect(ext_pict.isMatch("\u{2704}")); + try testing.expect(ext_pict.isMatch("\u{2701}")); +} + +test wordAtCursor { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + const t_string = "first second third"; + const second = wb.wordAtCursor(t_string, 8); + try testing.expectEqualStrings("second", second.bytes(t_string)); + const third = wb.wordAtCursor(t_string, 14); + try testing.expectEqualStrings("third", third.bytes(t_string)); + { + const first = wb.wordAtCursor(t_string, 3); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + { + const first = wb.wordAtCursor(t_string, 0); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + const last = wb.wordAtCursor(t_string, 14); + try testing.expectEqualStrings("third", last.bytes(t_string)); } fn testAllocations(allocator: Allocator) !void { -- cgit v1.2.3