diff options
| author | 2025-05-14 14:32:06 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:32:43 -0400 | |
| commit | c98b19032536b0e7d7956432f605c1ede80891f0 (patch) | |
| tree | 6d73142b2ee650d42e6ea1c275899b6feac16080 /src | |
| parent | Add format for CodePoint (diff) | |
| download | zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.gz zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.xz zg-c98b19032536b0e7d7956432f605c1ede80891f0.zip | |
Rewrite wordAtIndex to use iterator flipping
This also adds helper functions for initializing iterators at an index
within the string. Not that user code should do that necessarily, but
`wordAtIndex` definitely should, and there's no reason not to make it
available to others. With an appropriate warning at least.
Diffstat (limited to 'src')
| -rw-r--r-- | src/WordBreak.zig | 107 |
1 files changed, 83 insertions, 24 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 0925b2f..7ac8f14 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -69,33 +69,53 @@ fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | |||
| 69 | } | 69 | } |
| 70 | 70 | ||
| 71 | /// Returns the Word at the given index. Asserts that the index is valid for | 71 | /// Returns the Word at the given index. Asserts that the index is valid for |
| 72 | /// the provided string. Always returns a word. This algorithm is O(n^2) for | 72 | /// the provided string, and that the string is not empty. Always returns a word. |
| 73 | /// the size of the word before the cursor. | 73 | /// The index does not have to be the start of a codepoint in the word. |
| 74 | pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | 74 | pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { |
| 75 | assert(index < string.len); | 75 | assert(index < string.len and string.len > 0); |
| 76 | var i = index; | 76 | var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); |
| 77 | var iter1 = wordbreak.iterator(string[index..]); | 77 | if (iter_fwd.next()) |_| { |
| 78 | var last_word = iter1.next(); | 78 | // This is a bit tricky because we may have been in the middle of various |
| 79 | if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; | 79 | // stateful things. So we go forward again: |
| 80 | var this_word: ?Word = undefined; | 80 | if (iter_fwd.next()) |_| { |
| 81 | while (last_word) |l_word| : (i -= 1) { | 81 | // Make a back iterator: |
| 82 | var iter2 = wordbreak.iterator(string[i..]); | 82 | var iter_back = iter_fwd.reverseIterator(); |
| 83 | this_word = iter2.next(); | 83 | const last_word = iter_back.prev().?; // Always works. |
| 84 | if (this_word) |t_word| { | 84 | const no_flags = iter_back.flags == 0; |
| 85 | if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); | 85 | if (no_flags) { |
| 86 | if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); | 86 | // Next previous is our word. |
| 87 | // Check if we've captured a previous word | 87 | const the_word = iter_back.prev(); |
| 88 | const t_end = t_word.offset + t_word.len + i - 1; | 88 | if (the_word) |word| { |
| 89 | const l_start = l_word.offset + i + 1; | 89 | assert(word.offset <= index and index <= word.offset + word.len); |
| 90 | if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); | 90 | return word; |
| 91 | if (t_end < l_start) { | 91 | } else { // Can happen, at least I think so |
| 92 | return .{ .offset = @intCast(l_start), .len = l_word.len }; | 92 | assert(last_word.offset <= index and index <= last_word.offset + last_word.len); |
| 93 | return last_word; | ||
| 94 | } | ||
| 95 | } else { | ||
| 96 | // Scan past all the flags. | ||
| 97 | while (iter_back.flags > 0) { | ||
| 98 | _ = iter_back.prev(); | ||
| 99 | } | ||
| 100 | // Now just look for our word | ||
| 101 | iter_fwd = iter_back.forwardIterator(); | ||
| 102 | while (iter_fwd.next()) |word| { | ||
| 103 | if (word.offset <= index and index <= word.offset + word.len) { | ||
| 104 | return word; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | unreachable; | ||
| 93 | } | 108 | } |
| 94 | last_word = this_word; | 109 | } else { // We can just reverse here |
| 95 | } else unreachable; | 110 | var iter_back = iter_fwd.reverseIterator(); |
| 96 | if (i == 0) break; | 111 | const word = iter_back.prev().?; |
| 112 | assert(word.offset <= index and index <= word.offset + word.len); | ||
| 113 | return word; | ||
| 114 | } | ||
| 115 | } else { // last word then | ||
| 116 | var iter_back = iter_fwd.reverseIterator(); | ||
| 117 | return iter_back.prev().?; | ||
| 97 | } | 118 | } |
| 98 | return this_word.?; | ||
| 99 | } | 119 | } |
| 100 | 120 | ||
| 101 | /// Returns an iterator over words in `slice`. | 121 | /// Returns an iterator over words in `slice`. |
| @@ -130,6 +150,36 @@ pub const Iterator = struct { | |||
| 130 | return iter.next(); | 150 | return iter.next(); |
| 131 | } | 151 | } |
| 132 | 152 | ||
| 153 | /// Initialize an Iterator at the provided index. Assumes str is valid | ||
| 154 | /// UTF-8, asserts that `index` is less than str.len, and that `str` is not | ||
| 155 | /// empty. Note that for various stateful reasons, this may give spurious | ||
| 156 | /// results if used naïvely. If you want to reliably iterate from an index, | ||
| 157 | /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an | ||
| 158 | /// iterator at `wb.initAtIndex(string, word.offset)`. | ||
| 159 | pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { | ||
| 160 | assert(index < string.len and string.len > 0); | ||
| 161 | // Just in case... | ||
| 162 | if (index == 0) return wb.iterator(string); | ||
| 163 | var idx: u32 = @intCast(index); | ||
| 164 | // Back up past any any follow bytes: | ||
| 165 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | ||
| 166 | var iter: Iterator = undefined; | ||
| 167 | iter.wb = wb; | ||
| 168 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 169 | // Consider "abc |def" with the cursor on d. | ||
| 170 | // We need `this` to be ` ` and `that` to be 'd', | ||
| 171 | // and `cp_iter.next()` to be `e`. | ||
| 172 | var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 173 | // Reverse gives us before `d`: | ||
| 174 | iter.this = cp_back.prev(); // that == ` ` | ||
| 175 | // This iterator will give us `d`: | ||
| 176 | iter.cp_iter = .{ .bytes = string, .i = idx }; | ||
| 177 | iter.that = iter.cp_iter.next(); | ||
| 178 | // So that the next call will will give us `e`, | ||
| 179 | // thus the word will be `def`. | ||
| 180 | return iter; | ||
| 181 | } | ||
| 182 | |||
| 133 | /// Return a reverse iterator from the point this iterator is paused | 183 | /// Return a reverse iterator from the point this iterator is paused |
| 134 | /// at. Usually, calling `prev()` will return the word just seen. | 184 | /// at. Usually, calling `prev()` will return the word just seen. |
| 135 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { | 185 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { |
| @@ -302,6 +352,15 @@ pub const ReverseIterator = struct { | |||
| 302 | return wb_iter; | 352 | return wb_iter; |
| 303 | } | 353 | } |
| 304 | 354 | ||
| 355 | /// Initialize a ReverseIterator at the provided index. Assumes str is valid | ||
| 356 | /// UTF-8, asserts that `index` is less than str.len, and that `str` is not | ||
| 357 | /// empty. You should prefer not to use this function, see Iterator.initAtIndex | ||
| 358 | /// for more details. | ||
| 359 | pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { | ||
| 360 | var fw_iter = Iterator.initAtIndex(wb, string, index); | ||
| 361 | return fw_iter.reverseIterator(); | ||
| 362 | } | ||
| 363 | |||
| 305 | /// Returns the previous word segment, without advancing. | 364 | /// Returns the previous word segment, without advancing. |
| 306 | pub fn peek(iter: *ReverseIterator) ?Word { | 365 | pub fn peek(iter: *ReverseIterator) ?Word { |
| 307 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; | 366 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; |