summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-15 10:57:33 -0400
committerGravatar Sam Atman2025-05-15 15:32:43 -0400
commit736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 (patch)
tree09cdc6762a519cd2f20efacfa4d1af082f983e85
parentRewrite wordAtIndex to use iterator flipping (diff)
downloadzg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.gz
zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.xz
zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.zip
wordAtIndex passes conformance
I removed the initAtIndex functions from the public vocabulary, because the last couple of days of sweat and blood prove that it's hard to use correctly. That's probably it for WordBreak, now to fix the overlong bug on v0.14 and get this integrated with the new reverse grapheme iterator.
-rw-r--r--src/WordBreak.zig161
-rw-r--r--src/code_point.zig1
-rw-r--r--src/unicode_tests.zig76
3 files changed, 135 insertions, 103 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 7ac8f14..6ada7e1 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65} 65}
66 66
67/// Convenience function for working with CodePoints
67fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { 68fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
68 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); 69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
69} 70}
70 71
71/// Returns the Word at the given index. Asserts that the index is valid for 72/// Returns the Word at the given index. Asserts that the index is less than
72/// the provided string, and that the string is not empty. Always returns a word. 73/// `string.len`, and that the string is not empty. Always returns a word.
73/// The index does not have to be the start of a codepoint in the word. 74/// The index does not have to be the start of a codepoint in the word.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { 75pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len and string.len > 0); 76 assert(index < string.len and string.len > 0);
76 var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); 77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
77 if (iter_fwd.next()) |_| { 78 const first_back = iter_back.prev();
78 // This is a bit tricky because we may have been in the middle of various 79 if (first_back) |back| {
79 // stateful things. So we go forward again: 80 if (back.offset == 0) {
80 if (iter_fwd.next()) |_| { 81 var iter_fwd = wordbreak.iterator(string);
81 // Make a back iterator: 82 while (iter_fwd.next()) |word| {
82 var iter_back = iter_fwd.reverseIterator(); 83 if (word.offset <= index and index < word.offset + word.len)
83 const last_word = iter_back.prev().?; // Always works.
84 const no_flags = iter_back.flags == 0;
85 if (no_flags) {
86 // Next previous is our word.
87 const the_word = iter_back.prev();
88 if (the_word) |word| {
89 assert(word.offset <= index and index <= word.offset + word.len);
90 return word; 84 return word;
91 } else { // Can happen, at least I think so 85 }
92 assert(last_word.offset <= index and index <= last_word.offset + last_word.len); 86 }
93 return last_word; 87 } else {
94 } 88 var iter_fwd = wordbreak.iterator(string);
89 while (iter_fwd.next()) |word| {
90 if (word.offset <= index and index < word.offset + word.len)
91 return word;
92 }
93 }
94 const second_back = iter_back.prev();
95 if (second_back) |back| if (back.offset == 0) {
96 var iter_fwd = wordbreak.iterator(string);
97 while (iter_fwd.next()) |word| {
98 if (word.offset <= index and index < word.offset + word.len)
99 return word;
100 }
101 };
102 // There's sometimes flags:
103 if (iter_back.flags > 0) {
104 while (iter_back.flags > 0) {
105 if (iter_back.prev()) |_| {
106 continue;
95 } else { 107 } else {
96 // Scan past all the flags. 108 break;
97 while (iter_back.flags > 0) {
98 _ = iter_back.prev();
99 }
100 // Now just look for our word
101 iter_fwd = iter_back.forwardIterator();
102 while (iter_fwd.next()) |word| {
103 if (word.offset <= index and index <= word.offset + word.len) {
104 return word;
105 }
106 }
107 unreachable;
108 } 109 }
109 } else { // We can just reverse here
110 var iter_back = iter_fwd.reverseIterator();
111 const word = iter_back.prev().?;
112 assert(word.offset <= index and index <= word.offset + word.len);
113 return word;
114 } 110 }
115 } else { // last word then
116 var iter_back = iter_fwd.reverseIterator();
117 return iter_back.prev().?;
118 } 111 }
112 var iter_fwd = iter_back.forwardIterator();
113 while (iter_fwd.next()) |word| {
114 if (word.offset <= index and index < word.offset + word.len)
115 return word;
116 }
117 unreachable;
119} 118}
120 119
121/// Returns an iterator over words in `slice`. 120/// Returns an iterator over words in `slice`.
@@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt
128 return ReverseIterator.init(wordbreak, slice); 127 return ReverseIterator.init(wordbreak, slice);
129} 128}
130 129
130/// An iterator, forward, over all words in a provided string.
131pub const Iterator = struct { 131pub const Iterator = struct {
132 this: ?CodePoint = null, 132 this: ?CodePoint = null,
133 that: ?CodePoint = null, 133 that: ?CodePoint = null,
@@ -150,37 +150,7 @@ pub const Iterator = struct {
150 return iter.next(); 150 return iter.next();
151 } 151 }
152 152
153 /// Initialize an Iterator at the provided index. Assumes str is valid 153 /// Returns a reverse iterator from the point this iterator is paused
154 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
155 /// empty. Note that for various stateful reasons, this may give spurious
156 /// results if used naïvely. If you want to reliably iterate from an index,
157 /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
158 /// iterator at `wb.initAtIndex(string, word.offset)`.
159 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
160 assert(index < string.len and string.len > 0);
161 // Just in case...
162 if (index == 0) return wb.iterator(string);
163 var idx: u32 = @intCast(index);
164 // Back up past any any follow bytes:
165 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
166 var iter: Iterator = undefined;
167 iter.wb = wb;
168 // We need to populate the CodePoints, and the codepoint iterator.
169 // Consider "abc |def" with the cursor on d.
170 // We need `this` to be ` ` and `that` to be 'd',
171 // and `cp_iter.next()` to be `e`.
172 var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
173 // Reverse gives us before `d`:
174 iter.this = cp_back.prev(); // that == ` `
175 // This iterator will give us `d`:
176 iter.cp_iter = .{ .bytes = string, .i = idx };
177 iter.that = iter.cp_iter.next();
178 // So that the next call will will give us `e`,
179 // thus the word will be `def`.
180 return iter;
181 }
182
183 /// Return a reverse iterator from the point this iterator is paused
184 /// at. Usually, calling `prev()` will return the word just seen. 154 /// at. Usually, calling `prev()` will return the word just seen.
185 pub fn reverseIterator(iter: *Iterator) ReverseIterator { 155 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
186 var cp_it = iter.cp_iter.reverseIterator(); 156 var cp_it = iter.cp_iter.reverseIterator();
@@ -196,7 +166,7 @@ pub const Iterator = struct {
196 }; 166 };
197 } 167 }
198 168
199 /// Returns the next word segment. 169 /// Returns the next word segment, if any.
200 pub fn next(iter: *Iterator) ?Word { 170 pub fn next(iter: *Iterator) ?Word {
201 iter.advance(); 171 iter.advance();
202 172
@@ -338,6 +308,7 @@ pub const Iterator = struct {
338 } 308 }
339}; 309};
340 310
311/// An iterator, backward, over all words in a provided string.
341pub const ReverseIterator = struct { 312pub const ReverseIterator = struct {
342 after: ?CodePoint = null, 313 after: ?CodePoint = null,
343 before: ?CodePoint = null, 314 before: ?CodePoint = null,
@@ -352,16 +323,7 @@ pub const ReverseIterator = struct {
352 return wb_iter; 323 return wb_iter;
353 } 324 }
354 325
355 /// Initialize a ReverseIterator at the provided index. Assumes str is valid 326 /// Returns the previous word segment, if any, without advancing.
356 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
357 /// empty. You should prefer not to use this function, see Iterator.initAtIndex
358 /// for more details.
359 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
360 var fw_iter = Iterator.initAtIndex(wb, string, index);
361 return fw_iter.reverseIterator();
362 }
363
364 /// Returns the previous word segment, without advancing.
365 pub fn peek(iter: *ReverseIterator) ?Word { 327 pub fn peek(iter: *ReverseIterator) ?Word {
366 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; 328 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
367 defer { 329 defer {
@@ -544,6 +506,27 @@ pub const ReverseIterator = struct {
544 } 506 }
545}; 507};
546 508
509//| Implementation Details
510
511/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex.
512fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
513 var idx: u32 = @intCast(index);
514 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
515 if (idx == string.len) return wb.reverseIterator(string);
516 var iter: ReverseIterator = undefined;
517 iter.wb = wb;
518 iter.flags = 0;
519 // We need to populate the CodePoints, and the codepoint iterator.
520 // Consider "abc| def" with the cursor as |.
521 // We need `before` to be `c` and `after` to be ' ',
522 // and `cp_iter.prev()` to be `b`.
523 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
524 iter.after = cp_iter.prev();
525 iter.before = cp_iter.prev();
526 iter.cp_iter = cp_iter;
527 return iter;
528}
529
547fn sneaky(iter: *const ReverseIterator) SneakIterator { 530fn sneaky(iter: *const ReverseIterator) SneakIterator {
548 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; 531 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
549} 532}
@@ -656,23 +639,23 @@ test "ext_pict" {
656 try testing.expect(ext_pict.isMatch("\u{2701}")); 639 try testing.expect(ext_pict.isMatch("\u{2701}"));
657} 640}
658 641
659test wordAtCursor { 642test wordAtIndex {
660 const wb = try WordBreak.init(testing.allocator); 643 const wb = try WordBreak.init(testing.allocator);
661 defer wb.deinit(testing.allocator); 644 defer wb.deinit(testing.allocator);
662 const t_string = "first second third"; 645 const t_string = "first second third";
663 const second = wb.wordAtCursor(t_string, 8); 646 const second = wb.wordAtIndex(t_string, 8);
664 try testing.expectEqualStrings("second", second.bytes(t_string)); 647 try testing.expectEqualStrings("second", second.bytes(t_string));
665 const third = wb.wordAtCursor(t_string, 14); 648 const third = wb.wordAtIndex(t_string, 14);
666 try testing.expectEqualStrings("third", third.bytes(t_string)); 649 try testing.expectEqualStrings("third", third.bytes(t_string));
667 { 650 {
668 const first = wb.wordAtCursor(t_string, 3); 651 const first = wb.wordAtIndex(t_string, 3);
669 try testing.expectEqualStrings("first", first.bytes(t_string)); 652 try testing.expectEqualStrings("first", first.bytes(t_string));
670 } 653 }
671 { 654 {
672 const first = wb.wordAtCursor(t_string, 0); 655 const first = wb.wordAtIndex(t_string, 0);
673 try testing.expectEqualStrings("first", first.bytes(t_string)); 656 try testing.expectEqualStrings("first", first.bytes(t_string));
674 } 657 }
675 const last = wb.wordAtCursor(t_string, 14); 658 const last = wb.wordAtIndex(t_string, 14);
676 try testing.expectEqualStrings("third", last.bytes(t_string)); 659 try testing.expectEqualStrings("third", last.bytes(t_string));
677} 660}
678 661
diff --git a/src/code_point.zig b/src/code_point.zig
index 43b38d2..9a84080 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -272,7 +272,6 @@ pub const ReverseIterator = struct {
272 272
273 while (i_prev > 0) : (i_prev -= 1) { 273 while (i_prev > 0) : (i_prev -= 1) {
274 if (!followbyte(iter.bytes[i_prev])) break; 274 if (!followbyte(iter.bytes[i_prev])) break;
275 if (i_prev == 0) break;
276 } 275 }
277 276
278 if (i_prev > 0) 277 if (i_prev > 0)
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ef459bf..8b02e98 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -222,32 +222,58 @@ test "Segmentation Word Iterator" {
222 try want.append(Word{ .len = gc_len, .offset = bytes_index }); 222 try want.append(Word{ .len = gc_len, .offset = bytes_index });
223 bytes_index += cp_index; 223 bytes_index += cp_index;
224 } 224 }
225 const this_str = all_bytes.items;
226
225 { 227 {
226 var iter = wb.iterator(all_bytes.items); 228 var iter = wb.iterator(this_str);
227 var peeked: ?Word = iter.peek(); 229 var peeked: ?Word = iter.peek();
228 230
229 // Check. 231 // Check.
230 for (want.items, 1..) |want_word, i| { 232 for (want.items, 1..) |want_word, idx| {
231 const got_word = (iter.next()).?; 233 const got_word = (iter.next()).?;
232 std.testing.expectEqualStrings( 234 std.testing.expectEqualStrings(
233 want_word.bytes(all_bytes.items), 235 want_word.bytes(this_str),
234 got_word.bytes(all_bytes.items), 236 got_word.bytes(this_str),
235 ) catch |err| { 237 ) catch |err| {
236 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); 238 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx });
237 return err; 239 return err;
238 }; 240 };
239 std.testing.expectEqualStrings( 241 std.testing.expectEqualStrings(
240 peeked.?.bytes(all_bytes.items), 242 peeked.?.bytes(this_str),
241 got_word.bytes(all_bytes.items), 243 got_word.bytes(this_str),
242 ) catch |err| { 244 ) catch |err| {
243 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); 245 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx });
244 return err; 246 return err;
245 }; 247 };
248 var r_iter = iter.reverseIterator();
249 const if_r_word = r_iter.prev();
250 if (if_r_word) |r_word| {
251 std.testing.expectEqualStrings(
252 want_word.bytes(this_str),
253 r_word.bytes(this_str),
254 ) catch |err| {
255 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
256 return err;
257 };
258 } else {
259 try testing.expect(false);
260 }
261 for (got_word.offset..got_word.offset + got_word.len) |i| {
262 const this_word = wb.wordAtIndex(this_str, i);
263 std.testing.expectEqualSlices(
264 u8,
265 got_word.bytes(this_str),
266 this_word.bytes(this_str),
267 ) catch |err| {
268 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
269 return err;
270 };
271 }
246 peeked = iter.peek(); 272 peeked = iter.peek();
247 } 273 }
248 } 274 }
249 { 275 {
250 var r_iter = wb.reverseIterator(all_bytes.items); 276 var r_iter = wb.reverseIterator(this_str);
251 var peeked: ?Word = r_iter.peek(); 277 var peeked: ?Word = r_iter.peek();
252 var idx = want.items.len - 1; 278 var idx = want.items.len - 1;
253 279
@@ -256,19 +282,43 @@ test "Segmentation Word Iterator" {
256 const got_word = r_iter.prev().?; 282 const got_word = r_iter.prev().?;
257 std.testing.expectEqualSlices( 283 std.testing.expectEqualSlices(
258 u8, 284 u8,
259 want_word.bytes(all_bytes.items), 285 want_word.bytes(this_str),
260 got_word.bytes(all_bytes.items), 286 got_word.bytes(this_str),
261 ) catch |err| { 287 ) catch |err| {
262 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); 288 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
263 return err; 289 return err;
264 }; 290 };
265 std.testing.expectEqualStrings( 291 std.testing.expectEqualStrings(
266 peeked.?.bytes(all_bytes.items), 292 peeked.?.bytes(this_str),
267 got_word.bytes(all_bytes.items), 293 got_word.bytes(this_str),
268 ) catch |err| { 294 ) catch |err| {
269 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); 295 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 });
270 return err; 296 return err;
271 }; 297 };
298 var f_iter = r_iter.forwardIterator();
299 const if_f_word = f_iter.next();
300 if (if_f_word) |f_word| {
301 std.testing.expectEqualStrings(
302 want_word.bytes(this_str),
303 f_word.bytes(this_str),
304 ) catch |err| {
305 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
306 return err;
307 };
308 } else {
309 try testing.expect(false);
310 }
311 for (got_word.offset..got_word.offset + got_word.len) |i| {
312 const this_word = wb.wordAtIndex(this_str, i);
313 std.testing.expectEqualSlices(
314 u8,
315 got_word.bytes(this_str),
316 this_word.bytes(this_str),
317 ) catch |err| {
318 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
319 return err;
320 };
321 }
272 peeked = r_iter.peek(); 322 peeked = r_iter.peek();
273 if (idx == 0) break; 323 if (idx == 0) break;
274 } 324 }