summaryrefslogtreecommitdiff
path: root/src/WordBreak.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-15 10:57:33 -0400
committerGravatar Sam Atman2025-05-15 15:32:43 -0400
commit736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 (patch)
tree09cdc6762a519cd2f20efacfa4d1af082f983e85 /src/WordBreak.zig
parentRewrite wordAtIndex to use iterator flipping (diff)
downloadzg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.gz
zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.xz
zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.zip
wordAtIndex passes conformance
I removed the initAtIndex functions from the public vocabulary, because the last couple of days of sweat and blood prove that it's hard to use correctly. That's probably it for WordBreak, now to fix the overlong bug on v0.14 and get this integrated with the new reverse grapheme iterator.
Diffstat (limited to 'src/WordBreak.zig')
-rw-r--r--src/WordBreak.zig161
1 files changed, 72 insertions, 89 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 7ac8f14..6ada7e1 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65} 65}
66 66
67/// Convenience function for working with CodePoints
67fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { 68fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
68 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); 69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
69} 70}
70 71
71/// Returns the Word at the given index. Asserts that the index is valid for 72/// Returns the Word at the given index. Asserts that the index is less than
72/// the provided string, and that the string is not empty. Always returns a word. 73/// `string.len`, and that the string is not empty. Always returns a word.
73/// The index does not have to be the start of a codepoint in the word. 74/// The index does not have to be the start of a codepoint in the word.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { 75pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len and string.len > 0); 76 assert(index < string.len and string.len > 0);
76 var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); 77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
77 if (iter_fwd.next()) |_| { 78 const first_back = iter_back.prev();
78 // This is a bit tricky because we may have been in the middle of various 79 if (first_back) |back| {
79 // stateful things. So we go forward again: 80 if (back.offset == 0) {
80 if (iter_fwd.next()) |_| { 81 var iter_fwd = wordbreak.iterator(string);
81 // Make a back iterator: 82 while (iter_fwd.next()) |word| {
82 var iter_back = iter_fwd.reverseIterator(); 83 if (word.offset <= index and index < word.offset + word.len)
83 const last_word = iter_back.prev().?; // Always works.
84 const no_flags = iter_back.flags == 0;
85 if (no_flags) {
86 // Next previous is our word.
87 const the_word = iter_back.prev();
88 if (the_word) |word| {
89 assert(word.offset <= index and index <= word.offset + word.len);
90 return word; 84 return word;
91 } else { // Can happen, at least I think so 85 }
92 assert(last_word.offset <= index and index <= last_word.offset + last_word.len); 86 }
93 return last_word; 87 } else {
94 } 88 var iter_fwd = wordbreak.iterator(string);
89 while (iter_fwd.next()) |word| {
90 if (word.offset <= index and index < word.offset + word.len)
91 return word;
92 }
93 }
94 const second_back = iter_back.prev();
95 if (second_back) |back| if (back.offset == 0) {
96 var iter_fwd = wordbreak.iterator(string);
97 while (iter_fwd.next()) |word| {
98 if (word.offset <= index and index < word.offset + word.len)
99 return word;
100 }
101 };
102 // There's sometimes flags:
103 if (iter_back.flags > 0) {
104 while (iter_back.flags > 0) {
105 if (iter_back.prev()) |_| {
106 continue;
95 } else { 107 } else {
96 // Scan past all the flags. 108 break;
97 while (iter_back.flags > 0) {
98 _ = iter_back.prev();
99 }
100 // Now just look for our word
101 iter_fwd = iter_back.forwardIterator();
102 while (iter_fwd.next()) |word| {
103 if (word.offset <= index and index <= word.offset + word.len) {
104 return word;
105 }
106 }
107 unreachable;
108 } 109 }
109 } else { // We can just reverse here
110 var iter_back = iter_fwd.reverseIterator();
111 const word = iter_back.prev().?;
112 assert(word.offset <= index and index <= word.offset + word.len);
113 return word;
114 } 110 }
115 } else { // last word then
116 var iter_back = iter_fwd.reverseIterator();
117 return iter_back.prev().?;
118 } 111 }
112 var iter_fwd = iter_back.forwardIterator();
113 while (iter_fwd.next()) |word| {
114 if (word.offset <= index and index < word.offset + word.len)
115 return word;
116 }
117 unreachable;
119} 118}
120 119
121/// Returns an iterator over words in `slice`. 120/// Returns an iterator over words in `slice`.
@@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt
128 return ReverseIterator.init(wordbreak, slice); 127 return ReverseIterator.init(wordbreak, slice);
129} 128}
130 129
130/// An iterator, forward, over all words in a provided string.
131pub const Iterator = struct { 131pub const Iterator = struct {
132 this: ?CodePoint = null, 132 this: ?CodePoint = null,
133 that: ?CodePoint = null, 133 that: ?CodePoint = null,
@@ -150,37 +150,7 @@ pub const Iterator = struct {
150 return iter.next(); 150 return iter.next();
151 } 151 }
152 152
153 /// Initialize an Iterator at the provided index. Assumes str is valid 153 /// Returns a reverse iterator from the point this iterator is paused
154 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
155 /// empty. Note that for various stateful reasons, this may give spurious
156 /// results if used naïvely. If you want to reliably iterate from an index,
157 /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
158 /// iterator at `wb.initAtIndex(string, word.offset)`.
159 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
160 assert(index < string.len and string.len > 0);
161 // Just in case...
162 if (index == 0) return wb.iterator(string);
163 var idx: u32 = @intCast(index);
164 // Back up past any any follow bytes:
165 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
166 var iter: Iterator = undefined;
167 iter.wb = wb;
168 // We need to populate the CodePoints, and the codepoint iterator.
169 // Consider "abc |def" with the cursor on d.
170 // We need `this` to be ` ` and `that` to be 'd',
171 // and `cp_iter.next()` to be `e`.
172 var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
173 // Reverse gives us before `d`:
174 iter.this = cp_back.prev(); // that == ` `
175 // This iterator will give us `d`:
176 iter.cp_iter = .{ .bytes = string, .i = idx };
177 iter.that = iter.cp_iter.next();
178 // So that the next call will will give us `e`,
179 // thus the word will be `def`.
180 return iter;
181 }
182
183 /// Return a reverse iterator from the point this iterator is paused
184 /// at. Usually, calling `prev()` will return the word just seen. 154 /// at. Usually, calling `prev()` will return the word just seen.
185 pub fn reverseIterator(iter: *Iterator) ReverseIterator { 155 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
186 var cp_it = iter.cp_iter.reverseIterator(); 156 var cp_it = iter.cp_iter.reverseIterator();
@@ -196,7 +166,7 @@ pub const Iterator = struct {
196 }; 166 };
197 } 167 }
198 168
199 /// Returns the next word segment. 169 /// Returns the next word segment, if any.
200 pub fn next(iter: *Iterator) ?Word { 170 pub fn next(iter: *Iterator) ?Word {
201 iter.advance(); 171 iter.advance();
202 172
@@ -338,6 +308,7 @@ pub const Iterator = struct {
338 } 308 }
339}; 309};
340 310
311/// An iterator, backward, over all words in a provided string.
341pub const ReverseIterator = struct { 312pub const ReverseIterator = struct {
342 after: ?CodePoint = null, 313 after: ?CodePoint = null,
343 before: ?CodePoint = null, 314 before: ?CodePoint = null,
@@ -352,16 +323,7 @@ pub const ReverseIterator = struct {
352 return wb_iter; 323 return wb_iter;
353 } 324 }
354 325
355 /// Initialize a ReverseIterator at the provided index. Assumes str is valid 326 /// Returns the previous word segment, if any, without advancing.
356 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
357 /// empty. You should prefer not to use this function, see Iterator.initAtIndex
358 /// for more details.
359 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
360 var fw_iter = Iterator.initAtIndex(wb, string, index);
361 return fw_iter.reverseIterator();
362 }
363
364 /// Returns the previous word segment, without advancing.
365 pub fn peek(iter: *ReverseIterator) ?Word { 327 pub fn peek(iter: *ReverseIterator) ?Word {
366 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; 328 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
367 defer { 329 defer {
@@ -544,6 +506,27 @@ pub const ReverseIterator = struct {
544 } 506 }
545}; 507};
546 508
509//| Implementation Details
510
511/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex.
512fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
513 var idx: u32 = @intCast(index);
514 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
515 if (idx == string.len) return wb.reverseIterator(string);
516 var iter: ReverseIterator = undefined;
517 iter.wb = wb;
518 iter.flags = 0;
519 // We need to populate the CodePoints, and the codepoint iterator.
520 // Consider "abc| def" with the cursor as |.
521 // We need `before` to be `c` and `after` to be ' ',
522 // and `cp_iter.prev()` to be `b`.
523 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
524 iter.after = cp_iter.prev();
525 iter.before = cp_iter.prev();
526 iter.cp_iter = cp_iter;
527 return iter;
528}
529
547fn sneaky(iter: *const ReverseIterator) SneakIterator { 530fn sneaky(iter: *const ReverseIterator) SneakIterator {
548 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; 531 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
549} 532}
@@ -656,23 +639,23 @@ test "ext_pict" {
656 try testing.expect(ext_pict.isMatch("\u{2701}")); 639 try testing.expect(ext_pict.isMatch("\u{2701}"));
657} 640}
658 641
659test wordAtCursor { 642test wordAtIndex {
660 const wb = try WordBreak.init(testing.allocator); 643 const wb = try WordBreak.init(testing.allocator);
661 defer wb.deinit(testing.allocator); 644 defer wb.deinit(testing.allocator);
662 const t_string = "first second third"; 645 const t_string = "first second third";
663 const second = wb.wordAtCursor(t_string, 8); 646 const second = wb.wordAtIndex(t_string, 8);
664 try testing.expectEqualStrings("second", second.bytes(t_string)); 647 try testing.expectEqualStrings("second", second.bytes(t_string));
665 const third = wb.wordAtCursor(t_string, 14); 648 const third = wb.wordAtIndex(t_string, 14);
666 try testing.expectEqualStrings("third", third.bytes(t_string)); 649 try testing.expectEqualStrings("third", third.bytes(t_string));
667 { 650 {
668 const first = wb.wordAtCursor(t_string, 3); 651 const first = wb.wordAtIndex(t_string, 3);
669 try testing.expectEqualStrings("first", first.bytes(t_string)); 652 try testing.expectEqualStrings("first", first.bytes(t_string));
670 } 653 }
671 { 654 {
672 const first = wb.wordAtCursor(t_string, 0); 655 const first = wb.wordAtIndex(t_string, 0);
673 try testing.expectEqualStrings("first", first.bytes(t_string)); 656 try testing.expectEqualStrings("first", first.bytes(t_string));
674 } 657 }
675 const last = wb.wordAtCursor(t_string, 14); 658 const last = wb.wordAtIndex(t_string, 14);
676 try testing.expectEqualStrings("third", last.bytes(t_string)); 659 try testing.expectEqualStrings("third", last.bytes(t_string));
677} 660}
678 661