diff options
| author | 2025-05-13 12:15:18 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:16 -0400 | |
| commit | 49843d6dc6cde85fe670085b809d41db314ec47d (patch) | |
| tree | bb36ee23debffab23c8066b5e8aee0a2b1b35ae1 | |
| parent | Rewrite, passes WordBreakTest (diff) | |
| download | zg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.gz zg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.xz zg-49843d6dc6cde85fe670085b809d41db314ec47d.zip | |
Add wordAtCursor
This is not actually the way to do it, and can break on some crafted
strings. The way to actually do it: implement a reverse word search
iterator, then do next() to find a word break, prev() to find a
_valid_ word start, then next() again to find the valid end of said
word. Maybe 2+, 2-, 1+ actually.
I can probably write a test to see if the cursor spot is ambiguous,
and apply an extra round if so. Need to mull the rules over before
making any rash moves.
| -rw-r--r-- | src/WordBreak.zig | 148 |
1 files changed, 100 insertions, 48 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index a2be011..f0da30d 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -42,30 +42,6 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | |||
| 42 | }; | 42 | }; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | ||
| 46 | const decompressor = compress.flate.inflate.decompressor; | ||
| 47 | const in_bytes = @embedFile("wbp"); | ||
| 48 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 49 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 50 | var reader = in_decomp.reader(); | ||
| 51 | |||
| 52 | const endian = builtin.cpu.arch.endian(); | ||
| 53 | |||
| 54 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 55 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 56 | errdefer allocator.free(wb.s1); | ||
| 57 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 58 | |||
| 59 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 60 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 61 | errdefer allocator.free(wb.s2); | ||
| 62 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 63 | var count_0: usize = 0; | ||
| 64 | for (wb.s2) |nyb| { | ||
| 65 | if (nyb == 0) count_0 += 1; | ||
| 66 | } | ||
| 67 | } | ||
| 68 | |||
| 69 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | 45 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { |
| 70 | allocator.free(wordbreak.s1); | 46 | allocator.free(wordbreak.s1); |
| 71 | allocator.free(wordbreak.s2); | 47 | allocator.free(wordbreak.s2); |
| @@ -88,29 +64,48 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | |||
| 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 89 | } | 65 | } |
| 90 | 66 | ||
| 67 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | ||
| 68 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | ||
| 69 | } | ||
| 70 | |||
| 71 | /// Returns the Word at the given index. Asserts that the index is valid for | ||
| 72 | /// the provided string. Always returns a word. This algorithm is O(n^2) for | ||
| 73 | /// the size of the word before the cursor. | ||
| 74 | pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | ||
| 75 | assert(index < string.len); | ||
| 76 | var i = index; | ||
| 77 | var iter1 = wordbreak.iterator(string[index..]); | ||
| 78 | var last_word = iter1.next(); | ||
| 79 | if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; | ||
| 80 | var this_word: ?Word = undefined; | ||
| 81 | while (last_word) |l_word| : (i -= 1) { | ||
| 82 | var iter2 = wordbreak.iterator(string[i..]); | ||
| 83 | this_word = iter2.next(); | ||
| 84 | if (this_word) |t_word| { | ||
| 85 | if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); | ||
| 86 | if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); | ||
| 87 | // Check if we've captured a previous word | ||
| 88 | const t_end = t_word.offset + t_word.len + i - 1; | ||
| 89 | const l_start = l_word.offset + i + 1; | ||
| 90 | if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); | ||
| 91 | if (t_end < l_start) { | ||
| 92 | return .{ .offset = @intCast(l_start), .len = l_word.len }; | ||
| 93 | } | ||
| 94 | last_word = this_word; | ||
| 95 | } else unreachable; | ||
| 96 | if (i == 0) break; | ||
| 97 | } | ||
| 98 | return this_word.?; | ||
| 99 | } | ||
| 100 | |||
| 91 | /// Returns an iterator over words in `slice` | 101 | /// Returns an iterator over words in `slice` |
| 92 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | 102 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { |
| 93 | return Iterator.init(wordbreak, slice); | 103 | return Iterator.init(wordbreak, slice); |
| 94 | } | 104 | } |
| 95 | 105 | ||
| 96 | const IterState = packed struct { | ||
| 97 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 98 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 99 | quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 100 | regional: bool, // [^RI] (RI RI)* RI × RI | ||
| 101 | |||
| 102 | pub const initial: IterState = .{ | ||
| 103 | .mid_punct = false, | ||
| 104 | .mid_num = false, | ||
| 105 | .quote_heb = false, | ||
| 106 | .regional = false, | ||
| 107 | }; | ||
| 108 | }; | ||
| 109 | |||
| 110 | pub const Iterator = struct { | 106 | pub const Iterator = struct { |
| 111 | this: ?CodePoint = null, | 107 | this: ?CodePoint = null, |
| 112 | that: ?CodePoint = null, | 108 | that: ?CodePoint = null, |
| 113 | cache: ?WordBreakProperty = null, | ||
| 114 | cp_iter: CodepointIterator, | 109 | cp_iter: CodepointIterator, |
| 115 | wb: *const WordBreak, | 110 | wb: *const WordBreak, |
| 116 | 111 | ||
| @@ -121,6 +116,16 @@ pub const Iterator = struct { | |||
| 121 | return wb_iter; | 116 | return wb_iter; |
| 122 | } | 117 | } |
| 123 | 118 | ||
| 119 | /// Returns the next word segment, without advancing. | ||
| 120 | pub fn peek(iter: *Iterator) ?Word { | ||
| 121 | const cache = .{ iter.this, iter.that, iter.cp_iter }; | ||
| 122 | defer { | ||
| 123 | iter.this, iter.that, iter.cp_iter = cache; | ||
| 124 | } | ||
| 125 | return iter.next(); | ||
| 126 | } | ||
| 127 | |||
| 128 | /// Returns the next word segment. | ||
| 124 | pub fn next(iter: *Iterator) ?Word { | 129 | pub fn next(iter: *Iterator) ?Word { |
| 125 | iter.advance(); | 130 | iter.advance(); |
| 126 | 131 | ||
| @@ -132,7 +137,7 @@ pub const Iterator = struct { | |||
| 132 | const word_start = iter.this.?.offset; | 137 | const word_start = iter.this.?.offset; |
| 133 | var word_len: u32 = 0; | 138 | var word_len: u32 = 0; |
| 134 | 139 | ||
| 135 | // state variables | 140 | // State variables. |
| 136 | var last_p: WordBreakProperty = .none; | 141 | var last_p: WordBreakProperty = .none; |
| 137 | var last_last_p: WordBreakProperty = .none; | 142 | var last_last_p: WordBreakProperty = .none; |
| 138 | var ri_count: usize = 0; | 143 | var ri_count: usize = 0; |
| @@ -141,12 +146,13 @@ pub const Iterator = struct { | |||
| 141 | const this = iter.this.?; | 146 | const this = iter.this.?; |
| 142 | word_len += this.len; | 147 | word_len += this.len; |
| 143 | if (iter.that) |that| { | 148 | if (iter.that) |that| { |
| 144 | const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF | 149 | const this_p = iter.wb.breakProp(this); |
| 145 | const that_p = iter.wb.breakProperty(that.code); | 150 | const that_p = iter.wb.breakProp(that); |
| 146 | if (!isIgnorable(this_p)) { | 151 | if (!isIgnorable(this_p)) { |
| 147 | last_last_p = last_p; | 152 | last_last_p = last_p; |
| 148 | last_p = this_p; | 153 | last_p = this_p; |
| 149 | } | 154 | } |
| 155 | // WB3 CR × LF | ||
| 150 | if (this_p == .CR and that_p == .LF) continue :scan; | 156 | if (this_p == .CR and that_p == .LF) continue :scan; |
| 151 | // WB3a (Newline | CR | LF) ÷ | 157 | // WB3a (Newline | CR | LF) ÷ |
| 152 | if (isNewline(this_p)) break :scan; | 158 | if (isNewline(this_p)) break :scan; |
| @@ -169,7 +175,7 @@ pub const Iterator = struct { | |||
| 169 | if (isMidVal(that_p)) { | 175 | if (isMidVal(that_p)) { |
| 170 | const next_val = iter.peekPast(); | 176 | const next_val = iter.peekPast(); |
| 171 | if (next_val) |next_cp| { | 177 | if (next_val) |next_cp| { |
| 172 | const next_p = iter.wb.breakProperty(next_cp.code); | 178 | const next_p = iter.wb.breakProp(next_cp); |
| 173 | if (isAHLetter(next_p)) { | 179 | if (isAHLetter(next_p)) { |
| 174 | continue :scan; | 180 | continue :scan; |
| 175 | } | 181 | } |
| @@ -187,7 +193,7 @@ pub const Iterator = struct { | |||
| 187 | if (that_p == .Double_Quote) { | 193 | if (that_p == .Double_Quote) { |
| 188 | const next_val = iter.peekPast(); | 194 | const next_val = iter.peekPast(); |
| 189 | if (next_val) |next_cp| { | 195 | if (next_val) |next_cp| { |
| 190 | const next_p = iter.wb.breakProperty(next_cp.code); | 196 | const next_p = iter.wb.breakProp(next_cp); |
| 191 | if (next_p == .Hebrew_Letter) { | 197 | if (next_p == .Hebrew_Letter) { |
| 192 | continue :scan; | 198 | continue :scan; |
| 193 | } | 199 | } |
| @@ -210,7 +216,7 @@ pub const Iterator = struct { | |||
| 210 | if (last_p == .Numeric and isMidNum(that_p)) { | 216 | if (last_p == .Numeric and isMidNum(that_p)) { |
| 211 | const next_val = iter.peekPast(); | 217 | const next_val = iter.peekPast(); |
| 212 | if (next_val) |next_cp| { | 218 | if (next_val) |next_cp| { |
| 213 | const next_p = iter.wb.breakProperty(next_cp.code); | 219 | const next_p = iter.wb.breakProp(next_cp); |
| 214 | if (next_p == .Numeric) { | 220 | if (next_p == .Numeric) { |
| 215 | continue :scan; | 221 | continue :scan; |
| 216 | } | 222 | } |
| @@ -247,13 +253,37 @@ pub const Iterator = struct { | |||
| 247 | const save_cp = iter.cp_iter; | 253 | const save_cp = iter.cp_iter; |
| 248 | defer iter.cp_iter = save_cp; | 254 | defer iter.cp_iter = save_cp; |
| 249 | while (iter.cp_iter.peek()) |peeked| { | 255 | while (iter.cp_iter.peek()) |peeked| { |
| 250 | if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; | 256 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; |
| 251 | _ = iter.cp_iter.next(); | 257 | _ = iter.cp_iter.next(); |
| 252 | } | 258 | } |
| 253 | return null; | 259 | return null; |
| 254 | } | 260 | } |
| 255 | }; | 261 | }; |
| 256 | 262 | ||
| 263 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | ||
| 264 | const decompressor = compress.flate.inflate.decompressor; | ||
| 265 | const in_bytes = @embedFile("wbp"); | ||
| 266 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 267 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 268 | var reader = in_decomp.reader(); | ||
| 269 | |||
| 270 | const endian = builtin.cpu.arch.endian(); | ||
| 271 | |||
| 272 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 273 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 274 | errdefer allocator.free(wb.s1); | ||
| 275 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 276 | |||
| 277 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 278 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 279 | errdefer allocator.free(wb.s2); | ||
| 280 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 281 | var count_0: usize = 0; | ||
| 282 | for (wb.s2) |nyb| { | ||
| 283 | if (nyb == 0) count_0 += 1; | ||
| 284 | } | ||
| 285 | } | ||
| 286 | |||
| 257 | //| Predicates | 287 | //| Predicates |
| 258 | 288 | ||
| 259 | inline fn isNewline(wbp: WordBreakProperty) bool { | 289 | inline fn isNewline(wbp: WordBreakProperty) bool { |
| @@ -293,11 +323,33 @@ test "Word Break Properties" { | |||
| 293 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | 323 | try testing.expectEqual(.LF, wb.breakProperty('\n')); |
| 294 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); | 324 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); |
| 295 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | 325 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); |
| 326 | var iter = wb.iterator("xxx"); | ||
| 327 | _ = iter.peek(); | ||
| 296 | } | 328 | } |
| 297 | 329 | ||
| 298 | test "ext_pic" { | 330 | test "ext_pict" { |
| 299 | try testing.expect(ext_pict.isMatch("👇")); | 331 | try testing.expect(ext_pict.isMatch("👇")); |
| 300 | try testing.expect(ext_pict.isMatch("\u{2704}")); | 332 | try testing.expect(ext_pict.isMatch("\u{2701}")); |
| 333 | } | ||
| 334 | |||
| 335 | test wordAtCursor { | ||
| 336 | const wb = try WordBreak.init(testing.allocator); | ||
| 337 | defer wb.deinit(testing.allocator); | ||
| 338 | const t_string = "first second third"; | ||
| 339 | const second = wb.wordAtCursor(t_string, 8); | ||
| 340 | try testing.expectEqualStrings("second", second.bytes(t_string)); | ||
| 341 | const third = wb.wordAtCursor(t_string, 14); | ||
| 342 | try testing.expectEqualStrings("third", third.bytes(t_string)); | ||
| 343 | { | ||
| 344 | const first = wb.wordAtCursor(t_string, 3); | ||
| 345 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 346 | } | ||
| 347 | { | ||
| 348 | const first = wb.wordAtCursor(t_string, 0); | ||
| 349 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 350 | } | ||
| 351 | const last = wb.wordAtCursor(t_string, 14); | ||
| 352 | try testing.expectEqualStrings("third", last.bytes(t_string)); | ||
| 301 | } | 353 | } |
| 302 | 354 | ||
| 303 | fn testAllocations(allocator: Allocator) !void { | 355 | fn testAllocations(allocator: Allocator) !void { |