summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-13 12:15:18 -0400
committerGravatar Sam Atman2025-05-15 15:31:16 -0400
commit49843d6dc6cde85fe670085b809d41db314ec47d (patch)
treebb36ee23debffab23c8066b5e8aee0a2b1b35ae1
parentRewrite, passes WordBreakTest (diff)
downloadzg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.gz
zg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.xz
zg-49843d6dc6cde85fe670085b809d41db314ec47d.zip
Add wordAtCursor
This is not actually the way to do it, and can break on some crafted strings. The way to actually do it: implement a reverse word search iterator, then do next() to find a word break, prev() to find a _valid_ word start, then next() again to find the valid end of said word. Maybe 2+, 2-, 1+ actually. I can probably write a test to see if the cursor spot is ambiguous, and apply an extra round if so. Need to mull the rules over before making any rash moves.
-rw-r--r--src/WordBreak.zig148
1 files changed, 100 insertions, 48 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index a2be011..f0da30d 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -42,30 +42,6 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
42 }; 42 };
43} 43}
44 44
45inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
46 const decompressor = compress.flate.inflate.decompressor;
47 const in_bytes = @embedFile("wbp");
48 var in_fbs = std.io.fixedBufferStream(in_bytes);
49 var in_decomp = decompressor(.raw, in_fbs.reader());
50 var reader = in_decomp.reader();
51
52 const endian = builtin.cpu.arch.endian();
53
54 const stage_1_len: u16 = try reader.readInt(u16, endian);
55 wb.s1 = try allocator.alloc(u16, stage_1_len);
56 errdefer allocator.free(wb.s1);
57 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
58
59 const stage_2_len: u16 = try reader.readInt(u16, endian);
60 wb.s2 = try allocator.alloc(u5, stage_2_len);
61 errdefer allocator.free(wb.s2);
62 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
63 var count_0: usize = 0;
64 for (wb.s2) |nyb| {
65 if (nyb == 0) count_0 += 1;
66 }
67}
68
69pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { 45pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
70 allocator.free(wordbreak.s1); 46 allocator.free(wordbreak.s1);
71 allocator.free(wordbreak.s2); 47 allocator.free(wordbreak.s2);
@@ -88,29 +64,48 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
89} 65}
90 66
67fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
68 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
69}
70
71/// Returns the Word at the given index. Asserts that the index is valid for
72/// the provided string. Always returns a word. This algorithm is O(n^2) for
73/// the size of the word before the cursor.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len);
76 var i = index;
77 var iter1 = wordbreak.iterator(string[index..]);
78 var last_word = iter1.next();
79 if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 };
80 var this_word: ?Word = undefined;
81 while (last_word) |l_word| : (i -= 1) {
82 var iter2 = wordbreak.iterator(string[i..]);
83 this_word = iter2.next();
84 if (this_word) |t_word| {
85 if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])});
86 if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])});
87 // Check if we've captured a previous word
88 const t_end = t_word.offset + t_word.len + i - 1;
89 const l_start = l_word.offset + i + 1;
90 if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start });
91 if (t_end < l_start) {
92 return .{ .offset = @intCast(l_start), .len = l_word.len };
93 }
94 last_word = this_word;
95 } else unreachable;
96 if (i == 0) break;
97 }
98 return this_word.?;
99}
100
91/// Returns an iterator over words in `slice` 101/// Returns an iterator over words in `slice`
92pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { 102pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
93 return Iterator.init(wordbreak, slice); 103 return Iterator.init(wordbreak, slice);
94} 104}
95 105
96const IterState = packed struct {
97 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
98 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
99 quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
100 regional: bool, // [^RI] (RI RI)* RI × RI
101
102 pub const initial: IterState = .{
103 .mid_punct = false,
104 .mid_num = false,
105 .quote_heb = false,
106 .regional = false,
107 };
108};
109
110pub const Iterator = struct { 106pub const Iterator = struct {
111 this: ?CodePoint = null, 107 this: ?CodePoint = null,
112 that: ?CodePoint = null, 108 that: ?CodePoint = null,
113 cache: ?WordBreakProperty = null,
114 cp_iter: CodepointIterator, 109 cp_iter: CodepointIterator,
115 wb: *const WordBreak, 110 wb: *const WordBreak,
116 111
@@ -121,6 +116,16 @@ pub const Iterator = struct {
121 return wb_iter; 116 return wb_iter;
122 } 117 }
123 118
119 /// Returns the next word segment, without advancing.
120 pub fn peek(iter: *Iterator) ?Word {
121 const cache = .{ iter.this, iter.that, iter.cp_iter };
122 defer {
123 iter.this, iter.that, iter.cp_iter = cache;
124 }
125 return iter.next();
126 }
127
128 /// Returns the next word segment.
124 pub fn next(iter: *Iterator) ?Word { 129 pub fn next(iter: *Iterator) ?Word {
125 iter.advance(); 130 iter.advance();
126 131
@@ -132,7 +137,7 @@ pub const Iterator = struct {
132 const word_start = iter.this.?.offset; 137 const word_start = iter.this.?.offset;
133 var word_len: u32 = 0; 138 var word_len: u32 = 0;
134 139
135 // state variables 140 // State variables.
136 var last_p: WordBreakProperty = .none; 141 var last_p: WordBreakProperty = .none;
137 var last_last_p: WordBreakProperty = .none; 142 var last_last_p: WordBreakProperty = .none;
138 var ri_count: usize = 0; 143 var ri_count: usize = 0;
@@ -141,12 +146,13 @@ pub const Iterator = struct {
141 const this = iter.this.?; 146 const this = iter.this.?;
142 word_len += this.len; 147 word_len += this.len;
143 if (iter.that) |that| { 148 if (iter.that) |that| {
144 const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF 149 const this_p = iter.wb.breakProp(this);
145 const that_p = iter.wb.breakProperty(that.code); 150 const that_p = iter.wb.breakProp(that);
146 if (!isIgnorable(this_p)) { 151 if (!isIgnorable(this_p)) {
147 last_last_p = last_p; 152 last_last_p = last_p;
148 last_p = this_p; 153 last_p = this_p;
149 } 154 }
155 // WB3 CR × LF
150 if (this_p == .CR and that_p == .LF) continue :scan; 156 if (this_p == .CR and that_p == .LF) continue :scan;
151 // WB3a (Newline | CR | LF) ÷ 157 // WB3a (Newline | CR | LF) ÷
152 if (isNewline(this_p)) break :scan; 158 if (isNewline(this_p)) break :scan;
@@ -169,7 +175,7 @@ pub const Iterator = struct {
169 if (isMidVal(that_p)) { 175 if (isMidVal(that_p)) {
170 const next_val = iter.peekPast(); 176 const next_val = iter.peekPast();
171 if (next_val) |next_cp| { 177 if (next_val) |next_cp| {
172 const next_p = iter.wb.breakProperty(next_cp.code); 178 const next_p = iter.wb.breakProp(next_cp);
173 if (isAHLetter(next_p)) { 179 if (isAHLetter(next_p)) {
174 continue :scan; 180 continue :scan;
175 } 181 }
@@ -187,7 +193,7 @@ pub const Iterator = struct {
187 if (that_p == .Double_Quote) { 193 if (that_p == .Double_Quote) {
188 const next_val = iter.peekPast(); 194 const next_val = iter.peekPast();
189 if (next_val) |next_cp| { 195 if (next_val) |next_cp| {
190 const next_p = iter.wb.breakProperty(next_cp.code); 196 const next_p = iter.wb.breakProp(next_cp);
191 if (next_p == .Hebrew_Letter) { 197 if (next_p == .Hebrew_Letter) {
192 continue :scan; 198 continue :scan;
193 } 199 }
@@ -210,7 +216,7 @@ pub const Iterator = struct {
210 if (last_p == .Numeric and isMidNum(that_p)) { 216 if (last_p == .Numeric and isMidNum(that_p)) {
211 const next_val = iter.peekPast(); 217 const next_val = iter.peekPast();
212 if (next_val) |next_cp| { 218 if (next_val) |next_cp| {
213 const next_p = iter.wb.breakProperty(next_cp.code); 219 const next_p = iter.wb.breakProp(next_cp);
214 if (next_p == .Numeric) { 220 if (next_p == .Numeric) {
215 continue :scan; 221 continue :scan;
216 } 222 }
@@ -247,13 +253,37 @@ pub const Iterator = struct {
247 const save_cp = iter.cp_iter; 253 const save_cp = iter.cp_iter;
248 defer iter.cp_iter = save_cp; 254 defer iter.cp_iter = save_cp;
249 while (iter.cp_iter.peek()) |peeked| { 255 while (iter.cp_iter.peek()) |peeked| {
250 if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; 256 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
251 _ = iter.cp_iter.next(); 257 _ = iter.cp_iter.next();
252 } 258 }
253 return null; 259 return null;
254 } 260 }
255}; 261};
256 262
263inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
264 const decompressor = compress.flate.inflate.decompressor;
265 const in_bytes = @embedFile("wbp");
266 var in_fbs = std.io.fixedBufferStream(in_bytes);
267 var in_decomp = decompressor(.raw, in_fbs.reader());
268 var reader = in_decomp.reader();
269
270 const endian = builtin.cpu.arch.endian();
271
272 const stage_1_len: u16 = try reader.readInt(u16, endian);
273 wb.s1 = try allocator.alloc(u16, stage_1_len);
274 errdefer allocator.free(wb.s1);
275 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
276
277 const stage_2_len: u16 = try reader.readInt(u16, endian);
278 wb.s2 = try allocator.alloc(u5, stage_2_len);
279 errdefer allocator.free(wb.s2);
280 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
281 var count_0: usize = 0;
282 for (wb.s2) |nyb| {
283 if (nyb == 0) count_0 += 1;
284 }
285}
286
257//| Predicates 287//| Predicates
258 288
259inline fn isNewline(wbp: WordBreakProperty) bool { 289inline fn isNewline(wbp: WordBreakProperty) bool {
@@ -293,11 +323,33 @@ test "Word Break Properties" {
293 try testing.expectEqual(.LF, wb.breakProperty('\n')); 323 try testing.expectEqual(.LF, wb.breakProperty('\n'));
294 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); 324 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
295 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); 325 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
326 var iter = wb.iterator("xxx");
327 _ = iter.peek();
296} 328}
297 329
298test "ext_pic" { 330test "ext_pict" {
299 try testing.expect(ext_pict.isMatch("👇")); 331 try testing.expect(ext_pict.isMatch("👇"));
300 try testing.expect(ext_pict.isMatch("\u{2704}")); 332 try testing.expect(ext_pict.isMatch("\u{2701}"));
333}
334
335test wordAtCursor {
336 const wb = try WordBreak.init(testing.allocator);
337 defer wb.deinit(testing.allocator);
338 const t_string = "first second third";
339 const second = wb.wordAtCursor(t_string, 8);
340 try testing.expectEqualStrings("second", second.bytes(t_string));
341 const third = wb.wordAtCursor(t_string, 14);
342 try testing.expectEqualStrings("third", third.bytes(t_string));
343 {
344 const first = wb.wordAtCursor(t_string, 3);
345 try testing.expectEqualStrings("first", first.bytes(t_string));
346 }
347 {
348 const first = wb.wordAtCursor(t_string, 0);
349 try testing.expectEqualStrings("first", first.bytes(t_string));
350 }
351 const last = wb.wordAtCursor(t_string, 14);
352 try testing.expectEqualStrings("third", last.bytes(t_string));
301} 353}
302 354
303fn testAllocations(allocator: Allocator) !void { 355fn testAllocations(allocator: Allocator) !void {