diff options
| author | 2025-05-13 12:15:18 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:16 -0400 | |
| commit | 49843d6dc6cde85fe670085b809d41db314ec47d (patch) | |
| tree | bb36ee23debffab23c8066b5e8aee0a2b1b35ae1 /src | |
| parent | Rewrite, passes WordBreakTest (diff) | |
| download | zg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.gz zg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.xz zg-49843d6dc6cde85fe670085b809d41db314ec47d.zip | |
Add wordAtCursor
This is not actually the way to do it, and can break on some crafted
strings. The way to actually do it: implement a reverse word search
iterator, then do next() to find a word break, prev() to find a
_valid_ word start, then next() again to find the valid end of said
word. Maybe 2+, 2-, 1+ actually.
I can probably write a test to see if the cursor spot is ambiguous,
and apply an extra round if so. Need to mull the rules over before
making any rash moves.
Diffstat (limited to 'src')
| -rw-r--r-- | src/WordBreak.zig | 148 |
1 files changed, 100 insertions, 48 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index a2be011..f0da30d 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -42,30 +42,6 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | |||
| 42 | }; | 42 | }; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | ||
| 46 | const decompressor = compress.flate.inflate.decompressor; | ||
| 47 | const in_bytes = @embedFile("wbp"); | ||
| 48 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 49 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 50 | var reader = in_decomp.reader(); | ||
| 51 | |||
| 52 | const endian = builtin.cpu.arch.endian(); | ||
| 53 | |||
| 54 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 55 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 56 | errdefer allocator.free(wb.s1); | ||
| 57 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 58 | |||
| 59 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 60 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 61 | errdefer allocator.free(wb.s2); | ||
| 62 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 63 | var count_0: usize = 0; | ||
| 64 | for (wb.s2) |nyb| { | ||
| 65 | if (nyb == 0) count_0 += 1; | ||
| 66 | } | ||
| 67 | } | ||
| 68 | |||
| 69 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | 45 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { |
| 70 | allocator.free(wordbreak.s1); | 46 | allocator.free(wordbreak.s1); |
| 71 | allocator.free(wordbreak.s2); | 47 | allocator.free(wordbreak.s2); |
| @@ -88,29 +64,48 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | |||
| 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 89 | } | 65 | } |
| 90 | 66 | ||
| 67 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | ||
| 68 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | ||
| 69 | } | ||
| 70 | |||
| 71 | /// Returns the Word at the given index. Asserts that the index is valid for | ||
| 72 | /// the provided string. Always returns a word. This algorithm is O(n^2) for | ||
| 73 | /// the size of the word before the cursor. | ||
| 74 | pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | ||
| 75 | assert(index < string.len); | ||
| 76 | var i = index; | ||
| 77 | var iter1 = wordbreak.iterator(string[index..]); | ||
| 78 | var last_word = iter1.next(); | ||
| 79 | if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; | ||
| 80 | var this_word: ?Word = undefined; | ||
| 81 | while (last_word) |l_word| : (i -= 1) { | ||
| 82 | var iter2 = wordbreak.iterator(string[i..]); | ||
| 83 | this_word = iter2.next(); | ||
| 84 | if (this_word) |t_word| { | ||
| 85 | if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); | ||
| 86 | if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); | ||
| 87 | // Check if we've captured a previous word | ||
| 88 | const t_end = t_word.offset + t_word.len + i - 1; | ||
| 89 | const l_start = l_word.offset + i + 1; | ||
| 90 | if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); | ||
| 91 | if (t_end < l_start) { | ||
| 92 | return .{ .offset = @intCast(l_start), .len = l_word.len }; | ||
| 93 | } | ||
| 94 | last_word = this_word; | ||
| 95 | } else unreachable; | ||
| 96 | if (i == 0) break; | ||
| 97 | } | ||
| 98 | return this_word.?; | ||
| 99 | } | ||
| 100 | |||
| 91 | /// Returns an iterator over words in `slice` | 101 | /// Returns an iterator over words in `slice` |
| 92 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | 102 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { |
| 93 | return Iterator.init(wordbreak, slice); | 103 | return Iterator.init(wordbreak, slice); |
| 94 | } | 104 | } |
| 95 | 105 | ||
| 96 | const IterState = packed struct { | ||
| 97 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 98 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 99 | quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 100 | regional: bool, // [^RI] (RI RI)* RI × RI | ||
| 101 | |||
| 102 | pub const initial: IterState = .{ | ||
| 103 | .mid_punct = false, | ||
| 104 | .mid_num = false, | ||
| 105 | .quote_heb = false, | ||
| 106 | .regional = false, | ||
| 107 | }; | ||
| 108 | }; | ||
| 109 | |||
| 110 | pub const Iterator = struct { | 106 | pub const Iterator = struct { |
| 111 | this: ?CodePoint = null, | 107 | this: ?CodePoint = null, |
| 112 | that: ?CodePoint = null, | 108 | that: ?CodePoint = null, |
| 113 | cache: ?WordBreakProperty = null, | ||
| 114 | cp_iter: CodepointIterator, | 109 | cp_iter: CodepointIterator, |
| 115 | wb: *const WordBreak, | 110 | wb: *const WordBreak, |
| 116 | 111 | ||
| @@ -121,6 +116,16 @@ pub const Iterator = struct { | |||
| 121 | return wb_iter; | 116 | return wb_iter; |
| 122 | } | 117 | } |
| 123 | 118 | ||
| 119 | /// Returns the next word segment, without advancing. | ||
| 120 | pub fn peek(iter: *Iterator) ?Word { | ||
| 121 | const cache = .{ iter.this, iter.that, iter.cp_iter }; | ||
| 122 | defer { | ||
| 123 | iter.this, iter.that, iter.cp_iter = cache; | ||
| 124 | } | ||
| 125 | return iter.next(); | ||
| 126 | } | ||
| 127 | |||
| 128 | /// Returns the next word segment. | ||
| 124 | pub fn next(iter: *Iterator) ?Word { | 129 | pub fn next(iter: *Iterator) ?Word { |
| 125 | iter.advance(); | 130 | iter.advance(); |
| 126 | 131 | ||
| @@ -132,7 +137,7 @@ pub const Iterator = struct { | |||
| 132 | const word_start = iter.this.?.offset; | 137 | const word_start = iter.this.?.offset; |
| 133 | var word_len: u32 = 0; | 138 | var word_len: u32 = 0; |
| 134 | 139 | ||
| 135 | // state variables | 140 | // State variables. |
| 136 | var last_p: WordBreakProperty = .none; | 141 | var last_p: WordBreakProperty = .none; |
| 137 | var last_last_p: WordBreakProperty = .none; | 142 | var last_last_p: WordBreakProperty = .none; |
| 138 | var ri_count: usize = 0; | 143 | var ri_count: usize = 0; |
| @@ -141,12 +146,13 @@ pub const Iterator = struct { | |||
| 141 | const this = iter.this.?; | 146 | const this = iter.this.?; |
| 142 | word_len += this.len; | 147 | word_len += this.len; |
| 143 | if (iter.that) |that| { | 148 | if (iter.that) |that| { |
| 144 | const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF | 149 | const this_p = iter.wb.breakProp(this); |
| 145 | const that_p = iter.wb.breakProperty(that.code); | 150 | const that_p = iter.wb.breakProp(that); |
| 146 | if (!isIgnorable(this_p)) { | 151 | if (!isIgnorable(this_p)) { |
| 147 | last_last_p = last_p; | 152 | last_last_p = last_p; |
| 148 | last_p = this_p; | 153 | last_p = this_p; |
| 149 | } | 154 | } |
| 155 | // WB3 CR × LF | ||
| 150 | if (this_p == .CR and that_p == .LF) continue :scan; | 156 | if (this_p == .CR and that_p == .LF) continue :scan; |
| 151 | // WB3a (Newline | CR | LF) ÷ | 157 | // WB3a (Newline | CR | LF) ÷ |
| 152 | if (isNewline(this_p)) break :scan; | 158 | if (isNewline(this_p)) break :scan; |
| @@ -169,7 +175,7 @@ pub const Iterator = struct { | |||
| 169 | if (isMidVal(that_p)) { | 175 | if (isMidVal(that_p)) { |
| 170 | const next_val = iter.peekPast(); | 176 | const next_val = iter.peekPast(); |
| 171 | if (next_val) |next_cp| { | 177 | if (next_val) |next_cp| { |
| 172 | const next_p = iter.wb.breakProperty(next_cp.code); | 178 | const next_p = iter.wb.breakProp(next_cp); |
| 173 | if (isAHLetter(next_p)) { | 179 | if (isAHLetter(next_p)) { |
| 174 | continue :scan; | 180 | continue :scan; |
| 175 | } | 181 | } |
| @@ -187,7 +193,7 @@ pub const Iterator = struct { | |||
| 187 | if (that_p == .Double_Quote) { | 193 | if (that_p == .Double_Quote) { |
| 188 | const next_val = iter.peekPast(); | 194 | const next_val = iter.peekPast(); |
| 189 | if (next_val) |next_cp| { | 195 | if (next_val) |next_cp| { |
| 190 | const next_p = iter.wb.breakProperty(next_cp.code); | 196 | const next_p = iter.wb.breakProp(next_cp); |
| 191 | if (next_p == .Hebrew_Letter) { | 197 | if (next_p == .Hebrew_Letter) { |
| 192 | continue :scan; | 198 | continue :scan; |
| 193 | } | 199 | } |
| @@ -210,7 +216,7 @@ pub const Iterator = struct { | |||
| 210 | if (last_p == .Numeric and isMidNum(that_p)) { | 216 | if (last_p == .Numeric and isMidNum(that_p)) { |
| 211 | const next_val = iter.peekPast(); | 217 | const next_val = iter.peekPast(); |
| 212 | if (next_val) |next_cp| { | 218 | if (next_val) |next_cp| { |
| 213 | const next_p = iter.wb.breakProperty(next_cp.code); | 219 | const next_p = iter.wb.breakProp(next_cp); |
| 214 | if (next_p == .Numeric) { | 220 | if (next_p == .Numeric) { |
| 215 | continue :scan; | 221 | continue :scan; |
| 216 | } | 222 | } |
| @@ -247,13 +253,37 @@ pub const Iterator = struct { | |||
| 247 | const save_cp = iter.cp_iter; | 253 | const save_cp = iter.cp_iter; |
| 248 | defer iter.cp_iter = save_cp; | 254 | defer iter.cp_iter = save_cp; |
| 249 | while (iter.cp_iter.peek()) |peeked| { | 255 | while (iter.cp_iter.peek()) |peeked| { |
| 250 | if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; | 256 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; |
| 251 | _ = iter.cp_iter.next(); | 257 | _ = iter.cp_iter.next(); |
| 252 | } | 258 | } |
| 253 | return null; | 259 | return null; |
| 254 | } | 260 | } |
| 255 | }; | 261 | }; |
| 256 | 262 | ||
| 263 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | ||
| 264 | const decompressor = compress.flate.inflate.decompressor; | ||
| 265 | const in_bytes = @embedFile("wbp"); | ||
| 266 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 267 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 268 | var reader = in_decomp.reader(); | ||
| 269 | |||
| 270 | const endian = builtin.cpu.arch.endian(); | ||
| 271 | |||
| 272 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 273 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 274 | errdefer allocator.free(wb.s1); | ||
| 275 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 276 | |||
| 277 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 278 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 279 | errdefer allocator.free(wb.s2); | ||
| 280 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 281 | var count_0: usize = 0; | ||
| 282 | for (wb.s2) |nyb| { | ||
| 283 | if (nyb == 0) count_0 += 1; | ||
| 284 | } | ||
| 285 | } | ||
| 286 | |||
| 257 | //| Predicates | 287 | //| Predicates |
| 258 | 288 | ||
| 259 | inline fn isNewline(wbp: WordBreakProperty) bool { | 289 | inline fn isNewline(wbp: WordBreakProperty) bool { |
| @@ -293,11 +323,33 @@ test "Word Break Properties" { | |||
| 293 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | 323 | try testing.expectEqual(.LF, wb.breakProperty('\n')); |
| 294 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); | 324 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); |
| 295 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | 325 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); |
| 326 | var iter = wb.iterator("xxx"); | ||
| 327 | _ = iter.peek(); | ||
| 296 | } | 328 | } |
| 297 | 329 | ||
| 298 | test "ext_pic" { | 330 | test "ext_pict" { |
| 299 | try testing.expect(ext_pict.isMatch("👇")); | 331 | try testing.expect(ext_pict.isMatch("👇")); |
| 300 | try testing.expect(ext_pict.isMatch("\u{2704}")); | 332 | try testing.expect(ext_pict.isMatch("\u{2701}")); |
| 333 | } | ||
| 334 | |||
| 335 | test wordAtCursor { | ||
| 336 | const wb = try WordBreak.init(testing.allocator); | ||
| 337 | defer wb.deinit(testing.allocator); | ||
| 338 | const t_string = "first second third"; | ||
| 339 | const second = wb.wordAtCursor(t_string, 8); | ||
| 340 | try testing.expectEqualStrings("second", second.bytes(t_string)); | ||
| 341 | const third = wb.wordAtCursor(t_string, 14); | ||
| 342 | try testing.expectEqualStrings("third", third.bytes(t_string)); | ||
| 343 | { | ||
| 344 | const first = wb.wordAtCursor(t_string, 3); | ||
| 345 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 346 | } | ||
| 347 | { | ||
| 348 | const first = wb.wordAtCursor(t_string, 0); | ||
| 349 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 350 | } | ||
| 351 | const last = wb.wordAtCursor(t_string, 14); | ||
| 352 | try testing.expectEqualStrings("third", last.bytes(t_string)); | ||
| 301 | } | 353 | } |
| 302 | 354 | ||
| 303 | fn testAllocations(allocator: Allocator) !void { | 355 | fn testAllocations(allocator: Allocator) !void { |