summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-14 14:32:06 -0400
committerGravatar Sam Atman2025-05-15 15:32:43 -0400
commitc98b19032536b0e7d7956432f605c1ede80891f0 (patch)
tree6d73142b2ee650d42e6ea1c275899b6feac16080 /src
parentAdd format for CodePoint (diff)
downloadzg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.gz
zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.xz
zg-c98b19032536b0e7d7956432f605c1ede80891f0.zip
Rewrite wordAtIndex to use iterator flipping
This also adds helper functions for initializing iterators at an index within the string. Not that user code should do that necessarily, but `wordAtIndex` definitely should, and there's no reason not to make it available to others. With an appropriate warning at least.
Diffstat (limited to 'src')
-rw-r--r--src/WordBreak.zig107
1 files changed, 83 insertions, 24 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 0925b2f..7ac8f14 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -69,33 +69,53 @@ fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
69} 69}
70 70
71/// Returns the Word at the given index. Asserts that the index is valid for 71/// Returns the Word at the given index. Asserts that the index is valid for
72/// the provided string. Always returns a word. This algorithm is O(n^2) for 72/// the provided string, and that the string is not empty. Always returns a word.
73/// the size of the word before the cursor. 73/// The index does not have to be the start of a codepoint in the word.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { 74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len); 75 assert(index < string.len and string.len > 0);
76 var i = index; 76 var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index);
77 var iter1 = wordbreak.iterator(string[index..]); 77 if (iter_fwd.next()) |_| {
78 var last_word = iter1.next(); 78 // This is a bit tricky because we may have been in the middle of various
79 if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; 79 // stateful things. So we go forward again:
80 var this_word: ?Word = undefined; 80 if (iter_fwd.next()) |_| {
81 while (last_word) |l_word| : (i -= 1) { 81 // Make a back iterator:
82 var iter2 = wordbreak.iterator(string[i..]); 82 var iter_back = iter_fwd.reverseIterator();
83 this_word = iter2.next(); 83 const last_word = iter_back.prev().?; // Always works.
84 if (this_word) |t_word| { 84 const no_flags = iter_back.flags == 0;
85 if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); 85 if (no_flags) {
86 if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); 86 // Next previous is our word.
87 // Check if we've captured a previous word 87 const the_word = iter_back.prev();
88 const t_end = t_word.offset + t_word.len + i - 1; 88 if (the_word) |word| {
89 const l_start = l_word.offset + i + 1; 89 assert(word.offset <= index and index <= word.offset + word.len);
90 if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); 90 return word;
91 if (t_end < l_start) { 91 } else { // Can happen, at least I think so
92 return .{ .offset = @intCast(l_start), .len = l_word.len }; 92 assert(last_word.offset <= index and index <= last_word.offset + last_word.len);
93 return last_word;
94 }
95 } else {
96 // Scan past all the flags.
97 while (iter_back.flags > 0) {
98 _ = iter_back.prev();
99 }
100 // Now just look for our word
101 iter_fwd = iter_back.forwardIterator();
102 while (iter_fwd.next()) |word| {
103 if (word.offset <= index and index <= word.offset + word.len) {
104 return word;
105 }
106 }
107 unreachable;
93 } 108 }
94 last_word = this_word; 109 } else { // We can just reverse here
95 } else unreachable; 110 var iter_back = iter_fwd.reverseIterator();
96 if (i == 0) break; 111 const word = iter_back.prev().?;
112 assert(word.offset <= index and index <= word.offset + word.len);
113 return word;
114 }
115 } else { // last word then
116 var iter_back = iter_fwd.reverseIterator();
117 return iter_back.prev().?;
97 } 118 }
98 return this_word.?;
99} 119}
100 120
101/// Returns an iterator over words in `slice`. 121/// Returns an iterator over words in `slice`.
@@ -130,6 +150,36 @@ pub const Iterator = struct {
130 return iter.next(); 150 return iter.next();
131 } 151 }
132 152
153 /// Initialize an Iterator at the provided index. Assumes str is valid
154 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
155 /// empty. Note that for various stateful reasons, this may give spurious
156 /// results if used naïvely. If you want to reliably iterate from an index,
157 /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
158 /// iterator at `wb.initAtIndex(string, word.offset)`.
159 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
160 assert(index < string.len and string.len > 0);
161 // Just in case...
162 if (index == 0) return wb.iterator(string);
163 var idx: u32 = @intCast(index);
164 // Back up past any any follow bytes:
165 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
166 var iter: Iterator = undefined;
167 iter.wb = wb;
168 // We need to populate the CodePoints, and the codepoint iterator.
169 // Consider "abc |def" with the cursor on d.
170 // We need `this` to be ` ` and `that` to be 'd',
171 // and `cp_iter.next()` to be `e`.
172 var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
173 // Reverse gives us before `d`:
174 iter.this = cp_back.prev(); // that == ` `
175 // This iterator will give us `d`:
176 iter.cp_iter = .{ .bytes = string, .i = idx };
177 iter.that = iter.cp_iter.next();
178 // So that the next call will will give us `e`,
179 // thus the word will be `def`.
180 return iter;
181 }
182
133 /// Return a reverse iterator from the point this iterator is paused 183 /// Return a reverse iterator from the point this iterator is paused
134 /// at. Usually, calling `prev()` will return the word just seen. 184 /// at. Usually, calling `prev()` will return the word just seen.
135 pub fn reverseIterator(iter: *Iterator) ReverseIterator { 185 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
@@ -302,6 +352,15 @@ pub const ReverseIterator = struct {
302 return wb_iter; 352 return wb_iter;
303 } 353 }
304 354
355 /// Initialize a ReverseIterator at the provided index. Assumes str is valid
356 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
357 /// empty. You should prefer not to use this function, see Iterator.initAtIndex
358 /// for more details.
359 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
360 var fw_iter = Iterator.initAtIndex(wb, string, index);
361 return fw_iter.reverseIterator();
362 }
363
305 /// Returns the previous word segment, without advancing. 364 /// Returns the previous word segment, without advancing.
306 pub fn peek(iter: *ReverseIterator) ?Word { 365 pub fn peek(iter: *ReverseIterator) ?Word {
307 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; 366 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };