summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-13 12:15:18 -0400
committerGravatar Sam Atman2025-05-15 15:31:16 -0400
commit49843d6dc6cde85fe670085b809d41db314ec47d (patch)
treebb36ee23debffab23c8066b5e8aee0a2b1b35ae1 /src
parentRewrite, passes WordBreakTest (diff)
downloadzg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.gz
zg-49843d6dc6cde85fe670085b809d41db314ec47d.tar.xz
zg-49843d6dc6cde85fe670085b809d41db314ec47d.zip
Add wordAtCursor
This is not actually the way to do it, and can break on some crafted strings. The way to actually do it: implement a reverse word search iterator, then do next() to find a word break, prev() to find a _valid_ word start, then next() again to find the valid end of said word. Maybe 2+, 2-, 1+ actually. I can probably write a test to see if the cursor spot is ambiguous, and apply an extra round if so. Need to mull the rules over before making any rash moves.
Diffstat (limited to 'src')
-rw-r--r--src/WordBreak.zig148
1 files changed, 100 insertions, 48 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index a2be011..f0da30d 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -42,30 +42,6 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
42 }; 42 };
43} 43}
44 44
45inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
46 const decompressor = compress.flate.inflate.decompressor;
47 const in_bytes = @embedFile("wbp");
48 var in_fbs = std.io.fixedBufferStream(in_bytes);
49 var in_decomp = decompressor(.raw, in_fbs.reader());
50 var reader = in_decomp.reader();
51
52 const endian = builtin.cpu.arch.endian();
53
54 const stage_1_len: u16 = try reader.readInt(u16, endian);
55 wb.s1 = try allocator.alloc(u16, stage_1_len);
56 errdefer allocator.free(wb.s1);
57 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
58
59 const stage_2_len: u16 = try reader.readInt(u16, endian);
60 wb.s2 = try allocator.alloc(u5, stage_2_len);
61 errdefer allocator.free(wb.s2);
62 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
63 var count_0: usize = 0;
64 for (wb.s2) |nyb| {
65 if (nyb == 0) count_0 += 1;
66 }
67}
68
69pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { 45pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
70 allocator.free(wordbreak.s1); 46 allocator.free(wordbreak.s1);
71 allocator.free(wordbreak.s2); 47 allocator.free(wordbreak.s2);
@@ -88,29 +64,48 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
89} 65}
90 66
67fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
68 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
69}
70
71/// Returns the Word at the given index. Asserts that the index is valid for
72/// the provided string. Always returns a word. This algorithm is O(n^2) for
73/// the size of the word before the cursor.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len);
76 var i = index;
77 var iter1 = wordbreak.iterator(string[index..]);
78 var last_word = iter1.next();
79 if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 };
80 var this_word: ?Word = undefined;
81 while (last_word) |l_word| : (i -= 1) {
82 var iter2 = wordbreak.iterator(string[i..]);
83 this_word = iter2.next();
84 if (this_word) |t_word| {
85 if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])});
86 if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])});
87 // Check if we've captured a previous word
88 const t_end = t_word.offset + t_word.len + i - 1;
89 const l_start = l_word.offset + i + 1;
90 if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start });
91 if (t_end < l_start) {
92 return .{ .offset = @intCast(l_start), .len = l_word.len };
93 }
94 last_word = this_word;
95 } else unreachable;
96 if (i == 0) break;
97 }
98 return this_word.?;
99}
100
91/// Returns an iterator over words in `slice` 101/// Returns an iterator over words in `slice`
92pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { 102pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
93 return Iterator.init(wordbreak, slice); 103 return Iterator.init(wordbreak, slice);
94} 104}
95 105
96const IterState = packed struct {
97 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
98 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
99 quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
100 regional: bool, // [^RI] (RI RI)* RI × RI
101
102 pub const initial: IterState = .{
103 .mid_punct = false,
104 .mid_num = false,
105 .quote_heb = false,
106 .regional = false,
107 };
108};
109
110pub const Iterator = struct { 106pub const Iterator = struct {
111 this: ?CodePoint = null, 107 this: ?CodePoint = null,
112 that: ?CodePoint = null, 108 that: ?CodePoint = null,
113 cache: ?WordBreakProperty = null,
114 cp_iter: CodepointIterator, 109 cp_iter: CodepointIterator,
115 wb: *const WordBreak, 110 wb: *const WordBreak,
116 111
@@ -121,6 +116,16 @@ pub const Iterator = struct {
121 return wb_iter; 116 return wb_iter;
122 } 117 }
123 118
119 /// Returns the next word segment, without advancing.
120 pub fn peek(iter: *Iterator) ?Word {
121 const cache = .{ iter.this, iter.that, iter.cp_iter };
122 defer {
123 iter.this, iter.that, iter.cp_iter = cache;
124 }
125 return iter.next();
126 }
127
128 /// Returns the next word segment.
124 pub fn next(iter: *Iterator) ?Word { 129 pub fn next(iter: *Iterator) ?Word {
125 iter.advance(); 130 iter.advance();
126 131
@@ -132,7 +137,7 @@ pub const Iterator = struct {
132 const word_start = iter.this.?.offset; 137 const word_start = iter.this.?.offset;
133 var word_len: u32 = 0; 138 var word_len: u32 = 0;
134 139
135 // state variables 140 // State variables.
136 var last_p: WordBreakProperty = .none; 141 var last_p: WordBreakProperty = .none;
137 var last_last_p: WordBreakProperty = .none; 142 var last_last_p: WordBreakProperty = .none;
138 var ri_count: usize = 0; 143 var ri_count: usize = 0;
@@ -141,12 +146,13 @@ pub const Iterator = struct {
141 const this = iter.this.?; 146 const this = iter.this.?;
142 word_len += this.len; 147 word_len += this.len;
143 if (iter.that) |that| { 148 if (iter.that) |that| {
144 const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF 149 const this_p = iter.wb.breakProp(this);
145 const that_p = iter.wb.breakProperty(that.code); 150 const that_p = iter.wb.breakProp(that);
146 if (!isIgnorable(this_p)) { 151 if (!isIgnorable(this_p)) {
147 last_last_p = last_p; 152 last_last_p = last_p;
148 last_p = this_p; 153 last_p = this_p;
149 } 154 }
155 // WB3 CR × LF
150 if (this_p == .CR and that_p == .LF) continue :scan; 156 if (this_p == .CR and that_p == .LF) continue :scan;
151 // WB3a (Newline | CR | LF) ÷ 157 // WB3a (Newline | CR | LF) ÷
152 if (isNewline(this_p)) break :scan; 158 if (isNewline(this_p)) break :scan;
@@ -169,7 +175,7 @@ pub const Iterator = struct {
169 if (isMidVal(that_p)) { 175 if (isMidVal(that_p)) {
170 const next_val = iter.peekPast(); 176 const next_val = iter.peekPast();
171 if (next_val) |next_cp| { 177 if (next_val) |next_cp| {
172 const next_p = iter.wb.breakProperty(next_cp.code); 178 const next_p = iter.wb.breakProp(next_cp);
173 if (isAHLetter(next_p)) { 179 if (isAHLetter(next_p)) {
174 continue :scan; 180 continue :scan;
175 } 181 }
@@ -187,7 +193,7 @@ pub const Iterator = struct {
187 if (that_p == .Double_Quote) { 193 if (that_p == .Double_Quote) {
188 const next_val = iter.peekPast(); 194 const next_val = iter.peekPast();
189 if (next_val) |next_cp| { 195 if (next_val) |next_cp| {
190 const next_p = iter.wb.breakProperty(next_cp.code); 196 const next_p = iter.wb.breakProp(next_cp);
191 if (next_p == .Hebrew_Letter) { 197 if (next_p == .Hebrew_Letter) {
192 continue :scan; 198 continue :scan;
193 } 199 }
@@ -210,7 +216,7 @@ pub const Iterator = struct {
210 if (last_p == .Numeric and isMidNum(that_p)) { 216 if (last_p == .Numeric and isMidNum(that_p)) {
211 const next_val = iter.peekPast(); 217 const next_val = iter.peekPast();
212 if (next_val) |next_cp| { 218 if (next_val) |next_cp| {
213 const next_p = iter.wb.breakProperty(next_cp.code); 219 const next_p = iter.wb.breakProp(next_cp);
214 if (next_p == .Numeric) { 220 if (next_p == .Numeric) {
215 continue :scan; 221 continue :scan;
216 } 222 }
@@ -247,13 +253,37 @@ pub const Iterator = struct {
247 const save_cp = iter.cp_iter; 253 const save_cp = iter.cp_iter;
248 defer iter.cp_iter = save_cp; 254 defer iter.cp_iter = save_cp;
249 while (iter.cp_iter.peek()) |peeked| { 255 while (iter.cp_iter.peek()) |peeked| {
250 if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; 256 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
251 _ = iter.cp_iter.next(); 257 _ = iter.cp_iter.next();
252 } 258 }
253 return null; 259 return null;
254 } 260 }
255}; 261};
256 262
263inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
264 const decompressor = compress.flate.inflate.decompressor;
265 const in_bytes = @embedFile("wbp");
266 var in_fbs = std.io.fixedBufferStream(in_bytes);
267 var in_decomp = decompressor(.raw, in_fbs.reader());
268 var reader = in_decomp.reader();
269
270 const endian = builtin.cpu.arch.endian();
271
272 const stage_1_len: u16 = try reader.readInt(u16, endian);
273 wb.s1 = try allocator.alloc(u16, stage_1_len);
274 errdefer allocator.free(wb.s1);
275 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
276
277 const stage_2_len: u16 = try reader.readInt(u16, endian);
278 wb.s2 = try allocator.alloc(u5, stage_2_len);
279 errdefer allocator.free(wb.s2);
280 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
281 var count_0: usize = 0;
282 for (wb.s2) |nyb| {
283 if (nyb == 0) count_0 += 1;
284 }
285}
286
257//| Predicates 287//| Predicates
258 288
259inline fn isNewline(wbp: WordBreakProperty) bool { 289inline fn isNewline(wbp: WordBreakProperty) bool {
@@ -293,11 +323,33 @@ test "Word Break Properties" {
293 try testing.expectEqual(.LF, wb.breakProperty('\n')); 323 try testing.expectEqual(.LF, wb.breakProperty('\n'));
294 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); 324 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
295 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); 325 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
326 var iter = wb.iterator("xxx");
327 _ = iter.peek();
296} 328}
297 329
298test "ext_pic" { 330test "ext_pict" {
299 try testing.expect(ext_pict.isMatch("👇")); 331 try testing.expect(ext_pict.isMatch("👇"));
300 try testing.expect(ext_pict.isMatch("\u{2704}")); 332 try testing.expect(ext_pict.isMatch("\u{2701}"));
333}
334
335test wordAtCursor {
336 const wb = try WordBreak.init(testing.allocator);
337 defer wb.deinit(testing.allocator);
338 const t_string = "first second third";
339 const second = wb.wordAtCursor(t_string, 8);
340 try testing.expectEqualStrings("second", second.bytes(t_string));
341 const third = wb.wordAtCursor(t_string, 14);
342 try testing.expectEqualStrings("third", third.bytes(t_string));
343 {
344 const first = wb.wordAtCursor(t_string, 3);
345 try testing.expectEqualStrings("first", first.bytes(t_string));
346 }
347 {
348 const first = wb.wordAtCursor(t_string, 0);
349 try testing.expectEqualStrings("first", first.bytes(t_string));
350 }
351 const last = wb.wordAtCursor(t_string, 14);
352 try testing.expectEqualStrings("third", last.bytes(t_string));
301} 353}
302 354
303fn testAllocations(allocator: Allocator) !void { 355fn testAllocations(allocator: Allocator) !void {