Rewrite wordAtIndex to use iterator flipping

This also adds helper functions for initializing iterators at an index within the string. Not that user code should do that necessarily, but `wordAtIndex` definitely should, and there's no reason not to make it available to others. With an appropriate warning at least.
author: Sam Atman 2025-05-14 14:32:06 -0400
committer: Sam Atman 2025-05-15 15:32:43 -0400
commit: c98b19032536b0e7d7956432f605c1ede80891f0 (patch)
tree: 6d73142b2ee650d42e6ea1c275899b6feac16080 /src
parent: Add format for CodePoint (diff)
download: zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.gz
zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.xz
zg-c98b19032536b0e7d7956432f605c1ede80891f0.zip
1 files changed, 83 insertions, 24 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 0925b2f..7ac8f14 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -69,33 +69,53 @@ fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
 }
 /// Returns the Word at the given index.  Asserts that the index is valid for
-/// the provided string.  Always returns a word.  This algorithm is O(n^2) for
+/// the provided string, and that the string is not empty. Always returns a word.
-/// the size of the word before the cursor.
+/// The index does not have to be the start of a codepoint in the word.
 pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
-    assert(index < string.len);
+    assert(index < string.len and string.len > 0);
-    var i = index;
+    var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index);
-    var iter1 = wordbreak.iterator(string[index..]);
+    if (iter_fwd.next()) |_| {
-    var last_word = iter1.next();
+        // This is a bit tricky because we may have been in the middle of various
-    if (index == 0) return if (last_word) |word| word else Word{ .offset = 0, .len = 0 };
+        // stateful things.  So we go forward again:
-    var this_word: ?Word = undefined;
+        if (iter_fwd.next()) |_| {
-    while (last_word) |l_word| : (i -= 1) {
+            // Make a back iterator:
-        var iter2 = wordbreak.iterator(string[i..]);
+            var iter_back = iter_fwd.reverseIterator();
-        this_word = iter2.next();
+            const last_word = iter_back.prev().?; // Always works.
-        if (this_word) |t_word| {
+            const no_flags = iter_back.flags == 0;
-            if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])});
+            if (no_flags) {
-            if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])});
+                // Next previous is our word.
-            // Check if we've captured a previous word
+                const the_word = iter_back.prev();
-            const t_end = t_word.offset + t_word.len + i - 1;
+                if (the_word) |word| {
-            const l_start = l_word.offset + i + 1;
+                    assert(word.offset <= index and index <= word.offset + word.len);
-            if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start });
+                    return word;
-            if (t_end < l_start) {
+                } else { // Can happen, at least I think so
-                return .{ .offset = @intCast(l_start), .len = l_word.len };
+                    assert(last_word.offset <= index and index <= last_word.offset + last_word.len);
+                    return last_word;
+                }
+            } else {
+                // Scan past all the flags.
+                while (iter_back.flags > 0) {
+                    _ = iter_back.prev();
+                }
+                // Now just look for our word
+                iter_fwd = iter_back.forwardIterator();
+                while (iter_fwd.next()) |word| {
+                    if (word.offset <= index and index <= word.offset + word.len) {
+                        return word;
+                    }
+                }
+                unreachable;
            }
-            last_word = this_word;
+        } else { // We can just reverse here
-        } else unreachable;
+            var iter_back = iter_fwd.reverseIterator();
-        if (i == 0) break;
+            const word = iter_back.prev().?;
+            assert(word.offset <= index and index <= word.offset + word.len);
+            return word;
+        }
+    } else { // last word then
+        var iter_back = iter_fwd.reverseIterator();
+        return iter_back.prev().?;
    }
-    return this_word.?;
 }
 /// Returns an iterator over words in `slice`.
@@ -130,6 +150,36 @@ pub const Iterator = struct {
        return iter.next();
    }
+    /// Initialize an Iterator at the provided index.  Assumes str is valid
+    /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
+    /// empty.  Note that for various stateful reasons, this may give spurious
+    /// results if used naïvely.  If you want to reliably iterate from an index,
+    /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
+    /// iterator at `wb.initAtIndex(string, word.offset)`.
+    pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
+        assert(index < string.len and string.len > 0);
+        // Just in case...
+        if (index == 0) return wb.iterator(string);
+        var idx: u32 = @intCast(index);
+        // Back up past any any follow bytes:
+        while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
+        var iter: Iterator = undefined;
+        iter.wb = wb;
+        // We need to populate the CodePoints, and the codepoint iterator.
+        // Consider "abc |def" with the cursor on d.
+        // We need `this` to be ` ` and `that` to be 'd',
+        // and `cp_iter.next()` to be `e`.
+        var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
+        // Reverse gives us before `d`:
+        iter.this = cp_back.prev(); // that == ` `
+        // This iterator will give us `d`:
+        iter.cp_iter = .{ .bytes = string, .i = idx };
+        iter.that = iter.cp_iter.next();
+        // So that the next call will will give us `e`,
+        // thus the word will be `def`.
+        return iter;
+    }
    /// Return a reverse iterator from the point this iterator is paused
    /// at.  Usually, calling `prev()` will return the word just seen.
    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
@@ -302,6 +352,15 @@ pub const ReverseIterator = struct {
        return wb_iter;
    }
+    /// Initialize a ReverseIterator at the provided index.  Assumes str is valid
+    /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
+    /// empty.  You should prefer not to use this function, see Iterator.initAtIndex
+    /// for more details.
+    pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
+        var fw_iter = Iterator.initAtIndex(wb, string, index);
+        return fw_iter.reverseIterator();
+    }
    /// Returns the previous word segment, without advancing.
    pub fn peek(iter: *ReverseIterator) ?Word {
        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
author	Sam Atman	2025-05-14 14:32:06 -0400
committer	Sam Atman	2025-05-15 15:32:43 -0400
commit	c98b19032536b0e7d7956432f605c1ede80891f0 (patch)
tree	6d73142b2ee650d42e6ea1c275899b6feac16080 /src
parent	Add format for CodePoint (diff)
download	zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.gz zg-c98b19032536b0e7d7956432f605c1ede80891f0.tar.xz zg-c98b19032536b0e7d7956432f605c1ede80891f0.zip

diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 0925b2f..7ac8f14 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig
@@ -69,33 +69,53 @@ fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
69	}	69	}
70		70
71	/// Returns the Word at the given index. Asserts that the index is valid for	71	/// Returns the Word at the given index. Asserts that the index is valid for
72	/// the provided string. Always returns a word. This algorithm is O(n^2) for	72	/// the provided string, and that the string is not empty. Always returns a word.
73	/// the size of the word before the cursor.	73	/// The index does not have to be the start of a codepoint in the word.
74	pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {	74	pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75	assert(index < string.len);	75	assert(index < string.len and string.len > 0);
76	var i = index;	76	var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index);
77	var iter1 = wordbreak.iterator(string[index..]);	77	if (iter_fwd.next()) \|_\| {
78	var last_word = iter1.next();	78	// This is a bit tricky because we may have been in the middle of various
79	if (index == 0) return if (last_word) \|word\| word else Word{ .offset = 0, .len = 0 };	79	// stateful things. So we go forward again:
80	var this_word: ?Word = undefined;	80	if (iter_fwd.next()) \|_\| {
81	while (last_word) \|l_word\| : (i -= 1) {	81	// Make a back iterator:
82	var iter2 = wordbreak.iterator(string[i..]);	82	var iter_back = iter_fwd.reverseIterator();
83	this_word = iter2.next();	83	const last_word = iter_back.prev().?; // Always works.
84	if (this_word) \|t_word\| {	84	const no_flags = iter_back.flags == 0;
85	if (false) std.debug.print("last_word: '{s}'\n", .{l_word.bytes(string[i + 1 ..])});	85	if (no_flags) {
86	if (false) std.debug.print("this_word: '{s}'\n", .{t_word.bytes(string[i..])});	86	// Next previous is our word.
87	// Check if we've captured a previous word	87	const the_word = iter_back.prev();
88	const t_end = t_word.offset + t_word.len + i - 1;	88	if (the_word) \|word\| {
89	const l_start = l_word.offset + i + 1;	89	assert(word.offset <= index and index <= word.offset + word.len);
90	if (false) std.debug.print("t_end {d}, l_start {d}\n", .{ t_end, l_start });	90	return word;
91	if (t_end < l_start) {	91	} else { // Can happen, at least I think so
92	return .{ .offset = @intCast(l_start), .len = l_word.len };	92	assert(last_word.offset <= index and index <= last_word.offset + last_word.len);
		93	return last_word;
		94	}
		95	} else {
		96	// Scan past all the flags.
		97	while (iter_back.flags > 0) {
		98	_ = iter_back.prev();
		99	}
		100	// Now just look for our word
		101	iter_fwd = iter_back.forwardIterator();
		102	while (iter_fwd.next()) \|word\| {
		103	if (word.offset <= index and index <= word.offset + word.len) {
		104	return word;
		105	}
		106	}
		107	unreachable;
93	}	108	}
94	last_word = this_word;	109	} else { // We can just reverse here
95	} else unreachable;	110	var iter_back = iter_fwd.reverseIterator();
96	if (i == 0) break;	111	const word = iter_back.prev().?;
		112	assert(word.offset <= index and index <= word.offset + word.len);
		113	return word;
		114	}
		115	} else { // last word then
		116	var iter_back = iter_fwd.reverseIterator();
		117	return iter_back.prev().?;
97	}	118	}
98	return this_word.?;
99	}	119	}
100		120
101	/// Returns an iterator over words in `slice`.	121	/// Returns an iterator over words in `slice`.
@@ -130,6 +150,36 @@ pub const Iterator = struct {
130	return iter.next();	150	return iter.next();
131	}	151	}
132		152
		153	/// Initialize an Iterator at the provided index. Assumes str is valid
		154	/// UTF-8, asserts that `index` is less than str.len, and that `str` is not
		155	/// empty. Note that for various stateful reasons, this may give spurious
		156	/// results if used naïvely. If you want to reliably iterate from an index,
		157	/// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
		158	/// iterator at `wb.initAtIndex(string, word.offset)`.
		159	pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
		160	assert(index < string.len and string.len > 0);
		161	// Just in case...
		162	if (index == 0) return wb.iterator(string);
		163	var idx: u32 = @intCast(index);
		164	// Back up past any any follow bytes:
		165	while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
		166	var iter: Iterator = undefined;
		167	iter.wb = wb;
		168	// We need to populate the CodePoints, and the codepoint iterator.
		169	// Consider "abc \|def" with the cursor on d.
		170	// We need `this` to be ` ` and `that` to be 'd',
		171	// and `cp_iter.next()` to be `e`.
		172	var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
		173	// Reverse gives us before `d`:
		174	iter.this = cp_back.prev(); // that == ` `
		175	// This iterator will give us `d`:
		176	iter.cp_iter = .{ .bytes = string, .i = idx };
		177	iter.that = iter.cp_iter.next();
		178	// So that the next call will will give us `e`,
		179	// thus the word will be `def`.
		180	return iter;
		181	}
		182
133	/// Return a reverse iterator from the point this iterator is paused	183	/// Return a reverse iterator from the point this iterator is paused
134	/// at. Usually, calling `prev()` will return the word just seen.	184	/// at. Usually, calling `prev()` will return the word just seen.
135	pub fn reverseIterator(iter: *Iterator) ReverseIterator {	185	pub fn reverseIterator(iter: *Iterator) ReverseIterator {
@@ -302,6 +352,15 @@ pub const ReverseIterator = struct {
302	return wb_iter;	352	return wb_iter;
303	}	353	}
304		354
		355	/// Initialize a ReverseIterator at the provided index. Assumes str is valid
		356	/// UTF-8, asserts that `index` is less than str.len, and that `str` is not
		357	/// empty. You should prefer not to use this function, see Iterator.initAtIndex
		358	/// for more details.
		359	pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
		360	var fw_iter = Iterator.initAtIndex(wb, string, index);
		361	return fw_iter.reverseIterator();
		362	}
		363
305	/// Returns the previous word segment, without advancing.	364	/// Returns the previous word segment, without advancing.
306	pub fn peek(iter: *ReverseIterator) ?Word {	365	pub fn peek(iter: *ReverseIterator) ?Word {
307	const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };	366	const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };