From c98b19032536b0e7d7956432f605c1ede80891f0 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 14:32:06 -0400 Subject: Rewrite wordAtIndex to use iterator flipping This also adds helper functions for initializing iterators at an index within the string. Not that user code should do that necessarily, but `wordAtIndex` definitely should, and there's no reason not to make it available to others. With an appropriate warning at least. --- src/WordBreak.zig | 107 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 83 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 0925b2f..7ac8f14 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -69,33 +69,53 @@ fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { } /// Returns the Word at the given index. Asserts that the index is valid for -/// the provided string. Always returns a word. This algorithm is O(n^2) for -/// the size of the word before the cursor. +/// the provided string, and that the string is not empty. Always returns a word. +/// The index does not have to be the start of a codepoint in the word. pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { - assert(index < string.len); - var i = index; - var iter1 = wordbreak.iterator(string[index..]); - var last_word = iter1.next(); - if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 }; - var this_word: ?Word = undefined; - while (last_word) |l_word| : (i -= 1) { - var iter2 = wordbreak.iterator(string[i..]); - this_word = iter2.next(); - if (this_word) |t_word| { - if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])}); - if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])}); - // Check if we've captured a previous word - const t_end = t_word.offset + t_word.len + i - 1; - const l_start = l_word.offset + i + 1; - if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start }); - if (t_end < l_start) { - return .{ .offset = @intCast(l_start), .len = l_word.len }; + assert(index < string.len and string.len > 0); + var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); + if (iter_fwd.next()) |_| { + // This is a bit tricky because we may have been in the middle of various + // stateful things. So we go forward again: + if (iter_fwd.next()) |_| { + // Make a back iterator: + var iter_back = iter_fwd.reverseIterator(); + const last_word = iter_back.prev().?; // Always works. + const no_flags = iter_back.flags == 0; + if (no_flags) { + // Next previous is our word. + const the_word = iter_back.prev(); + if (the_word) |word| { + assert(word.offset <= index and index <= word.offset + word.len); + return word; + } else { // Can happen, at least I think so + assert(last_word.offset <= index and index <= last_word.offset + last_word.len); + return last_word; + } + } else { + // Scan past all the flags. + while (iter_back.flags > 0) { + _ = iter_back.prev(); + } + // Now just look for our word + iter_fwd = iter_back.forwardIterator(); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index <= word.offset + word.len) { + return word; + } + } + unreachable; } - last_word = this_word; - } else unreachable; - if (i == 0) break; + } else { // We can just reverse here + var iter_back = iter_fwd.reverseIterator(); + const word = iter_back.prev().?; + assert(word.offset <= index and index <= word.offset + word.len); + return word; + } + } else { // last word then + var iter_back = iter_fwd.reverseIterator(); + return iter_back.prev().?; } - return this_word.?; } /// Returns an iterator over words in `slice`. @@ -130,6 +150,36 @@ pub const Iterator = struct { return iter.next(); } + /// Initialize an Iterator at the provided index. Assumes str is valid + /// UTF-8, asserts that `index` is less than str.len, and that `str` is not + /// empty. Note that for various stateful reasons, this may give spurious + /// results if used naïvely. If you want to reliably iterate from an index, + /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an + /// iterator at `wb.initAtIndex(string, word.offset)`. + pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { + assert(index < string.len and string.len > 0); + // Just in case... + if (index == 0) return wb.iterator(string); + var idx: u32 = @intCast(index); + // Back up past any any follow bytes: + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + var iter: Iterator = undefined; + iter.wb = wb; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc |def" with the cursor on d. + // We need `this` to be ` ` and `that` to be 'd', + // and `cp_iter.next()` to be `e`. + var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; + // Reverse gives us before `d`: + iter.this = cp_back.prev(); // that == ` ` + // This iterator will give us `d`: + iter.cp_iter = .{ .bytes = string, .i = idx }; + iter.that = iter.cp_iter.next(); + // So that the next call will will give us `e`, + // thus the word will be `def`. + return iter; + } + /// Return a reverse iterator from the point this iterator is paused /// at. Usually, calling `prev()` will return the word just seen. pub fn reverseIterator(iter: *Iterator) ReverseIterator { @@ -302,6 +352,15 @@ pub const ReverseIterator = struct { return wb_iter; } + /// Initialize a ReverseIterator at the provided index. Assumes str is valid + /// UTF-8, asserts that `index` is less than str.len, and that `str` is not + /// empty. You should prefer not to use this function, see Iterator.initAtIndex + /// for more details. + pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { + var fw_iter = Iterator.initAtIndex(wb, string, index); + return fw_iter.reverseIterator(); + } + /// Returns the previous word segment, without advancing. pub fn peek(iter: *ReverseIterator) ?Word { const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; -- cgit v1.2.3