wordAtIndex passes conformance

I removed the initAtIndex functions from the public vocabulary, because the last couple of days of sweat and blood prove that they're hard to use correctly. That's probably it for WordBreak; now to fix the overlong bug on v0.14 and get this integrated with the new reverse grapheme iterator.
author: Sam Atman 2025-05-15 10:57:33 -0400
committer: Sam Atman 2025-05-15 15:32:43 -0400
commit: 736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 (patch)
tree: 09cdc6762a519cd2f20efacfa4d1af082f983e85 /src/WordBreak.zig
parent: Rewrite wordAtIndex to use iterator flipping (diff)
download: zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.gz
zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.xz
zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.zip
1 files changed, 72 insertions, 89 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 7ac8f14..6ada7e1 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
+/// Convenience function for working with CodePoints
 fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
    return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
 }
-/// Returns the Word at the given index.  Asserts that the index is valid for
+/// Returns the Word at the given index.  Asserts that the index is less than
-/// the provided string, and that the string is not empty. Always returns a word.
+/// `string.len`, and that the string is not empty. Always returns a word.
 /// The index does not have to be the start of a codepoint in the word.
-pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
+pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
    assert(index < string.len and string.len > 0);
-    var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index);
+    var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
-    if (iter_fwd.next()) |_| {
+    const first_back = iter_back.prev();
-        // This is a bit tricky because we may have been in the middle of various
+    if (first_back) |back| {
-        // stateful things.  So we go forward again:
+        if (back.offset == 0) {
-        if (iter_fwd.next()) |_| {
+            var iter_fwd = wordbreak.iterator(string);
-            // Make a back iterator:
+            while (iter_fwd.next()) |word| {
-            var iter_back = iter_fwd.reverseIterator();
+                if (word.offset <= index and index < word.offset + word.len)
-            const last_word = iter_back.prev().?; // Always works.
-            const no_flags = iter_back.flags == 0;
-            if (no_flags) {
-                // Next previous is our word.
-                const the_word = iter_back.prev();
-                if (the_word) |word| {
-                    assert(word.offset <= index and index <= word.offset + word.len);
                    return word;
-                } else { // Can happen, at least I think so
+            }
-                    assert(last_word.offset <= index and index <= last_word.offset + last_word.len);
+        }
-                    return last_word;
+    } else {
-                }
+        var iter_fwd = wordbreak.iterator(string);
+        while (iter_fwd.next()) |word| {
+            if (word.offset <= index and index < word.offset + word.len)
+                return word;
+        }
+    }
+    const second_back = iter_back.prev();
+    if (second_back) |back| if (back.offset == 0) {
+        var iter_fwd = wordbreak.iterator(string);
+        while (iter_fwd.next()) |word| {
+            if (word.offset <= index and index < word.offset + word.len)
+                return word;
+        }
+    };
+    // There's sometimes flags:
+    if (iter_back.flags > 0) {
+        while (iter_back.flags > 0) {
+            if (iter_back.prev()) |_| {
+                continue;
            } else {
-                // Scan past all the flags.
+                break;
-                while (iter_back.flags > 0) {
-                    _ = iter_back.prev();
-                }
-                // Now just look for our word
-                iter_fwd = iter_back.forwardIterator();
-                while (iter_fwd.next()) |word| {
-                    if (word.offset <= index and index <= word.offset + word.len) {
-                        return word;
-                    }
-                }
-                unreachable;
            }
-        } else { // We can just reverse here
-            var iter_back = iter_fwd.reverseIterator();
-            const word = iter_back.prev().?;
-            assert(word.offset <= index and index <= word.offset + word.len);
-            return word;
        }
-    } else { // last word then
-        var iter_back = iter_fwd.reverseIterator();
-        return iter_back.prev().?;
    }
+    var iter_fwd = iter_back.forwardIterator();
+    while (iter_fwd.next()) |word| {
+        if (word.offset <= index and index < word.offset + word.len)
+            return word;
+    }
+    unreachable;
 }
 /// Returns an iterator over words in `slice`.
@@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt
    return ReverseIterator.init(wordbreak, slice);
 }
+/// An iterator, forward, over all words in a provided string.
 pub const Iterator = struct {
    this: ?CodePoint = null,
    that: ?CodePoint = null,
@@ -150,37 +150,7 @@ pub const Iterator = struct {
        return iter.next();
    }
-    /// Initialize an Iterator at the provided index.  Assumes str is valid
+    /// Returns a reverse iterator from the point this iterator is paused
-    /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
-    /// empty.  Note that for various stateful reasons, this may give spurious
-    /// results if used naïvely.  If you want to reliably iterate from an index,
-    /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
-    /// iterator at `wb.initAtIndex(string, word.offset)`.
-    pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
-        assert(index < string.len and string.len > 0);
-        // Just in case...
-        if (index == 0) return wb.iterator(string);
-        var idx: u32 = @intCast(index);
-        // Back up past any any follow bytes:
-        while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
-        var iter: Iterator = undefined;
-        iter.wb = wb;
-        // We need to populate the CodePoints, and the codepoint iterator.
-        // Consider "abc |def" with the cursor on d.
-        // We need `this` to be ` ` and `that` to be 'd',
-        // and `cp_iter.next()` to be `e`.
-        var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
-        // Reverse gives us before `d`:
-        iter.this = cp_back.prev(); // that == ` `
-        // This iterator will give us `d`:
-        iter.cp_iter = .{ .bytes = string, .i = idx };
-        iter.that = iter.cp_iter.next();
-        // So that the next call will will give us `e`,
-        // thus the word will be `def`.
-        return iter;
-    }
-    /// Return a reverse iterator from the point this iterator is paused
    /// at.  Usually, calling `prev()` will return the word just seen.
    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
        var cp_it = iter.cp_iter.reverseIterator();
@@ -196,7 +166,7 @@ pub const Iterator = struct {
        };
    }
-    /// Returns the next word segment.
+    /// Returns the next word segment, if any.
    pub fn next(iter: *Iterator) ?Word {
        iter.advance();
@@ -338,6 +308,7 @@ pub const Iterator = struct {
    }
 };
+/// An iterator, backward, over all words in a provided string.
 pub const ReverseIterator = struct {
    after: ?CodePoint = null,
    before: ?CodePoint = null,
@@ -352,16 +323,7 @@ pub const ReverseIterator = struct {
        return wb_iter;
    }
-    /// Initialize a ReverseIterator at the provided index.  Assumes str is valid
+    /// Returns the previous word segment, if any, without advancing.
-    /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
-    /// empty.  You should prefer not to use this function, see Iterator.initAtIndex
-    /// for more details.
-    pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
-        var fw_iter = Iterator.initAtIndex(wb, string, index);
-        return fw_iter.reverseIterator();
-    }
-    /// Returns the previous word segment, without advancing.
    pub fn peek(iter: *ReverseIterator) ?Word {
        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
        defer {
@@ -544,6 +506,27 @@ pub const ReverseIterator = struct {
    }
 };
+//| Implementation Details
+/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex.
+fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
+    var idx: u32 = @intCast(index);
+    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
+    if (idx == string.len) return wb.reverseIterator(string);
+    var iter: ReverseIterator = undefined;
+    iter.wb = wb;
+    iter.flags = 0;
+    // We need to populate the CodePoints, and the codepoint iterator.
+    // Consider "abc| def" with the cursor as |.
+    // We need `before` to be `c` and `after` to be ' ',
+    // and `cp_iter.prev()` to be `b`.
+    var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
+    iter.after = cp_iter.prev();
+    iter.before = cp_iter.prev();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
 fn sneaky(iter: *const ReverseIterator) SneakIterator {
    return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
 }
@@ -656,23 +639,23 @@ test "ext_pict" {
    try testing.expect(ext_pict.isMatch("\u{2701}"));
 }
-test wordAtCursor {
+test wordAtIndex {
    const wb = try WordBreak.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    const t_string = "first second third";
-    const second = wb.wordAtCursor(t_string, 8);
+    const second = wb.wordAtIndex(t_string, 8);
    try testing.expectEqualStrings("second", second.bytes(t_string));
-    const third = wb.wordAtCursor(t_string, 14);
+    const third = wb.wordAtIndex(t_string, 14);
    try testing.expectEqualStrings("third", third.bytes(t_string));
    {
-        const first = wb.wordAtCursor(t_string, 3);
+        const first = wb.wordAtIndex(t_string, 3);
        try testing.expectEqualStrings("first", first.bytes(t_string));
    }
    {
-        const first = wb.wordAtCursor(t_string, 0);
+        const first = wb.wordAtIndex(t_string, 0);
        try testing.expectEqualStrings("first", first.bytes(t_string));
    }
-    const last = wb.wordAtCursor(t_string, 14);
+    const last = wb.wordAtIndex(t_string, 14);
    try testing.expectEqualStrings("third", last.bytes(t_string));
 }
author	Sam Atman	2025-05-15 10:57:33 -0400
committer	Sam Atman	2025-05-15 15:32:43 -0400
commit	736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 (patch)
tree	09cdc6762a519cd2f20efacfa4d1af082f983e85 /src/WordBreak.zig
parent	Rewrite wordAtIndex to use iterator flipping (diff)
download	zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.gz zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.tar.xz zg-736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5.zip