Add graphemeAtIndex + iterate before and after

That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do.
author: Sam Atman 2025-06-01 14:08:25 -0400
committer: Sam Atman 2025-06-01 14:08:25 -0400
commit: 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 (patch)
tree: 4ec54815215a9a808be0ab9a2968159f144ba076 /src
parent: Document "fat_offset" in README (diff)
download: zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.gz
zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.xz
zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.zip
4 files changed, 266 insertions, 87 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 49fdbf3..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,15 +1,7 @@
-const std = @import("std");
+//! Graphemes Module
-const builtin = @import("builtin");
+//!
-const mem = std.mem;
+//! Code for handling graphemes: fragments of string which should be
-const Allocator = mem.Allocator;
+//! treated as one unit.  Like Farmer Bob here: 👨🏻‍🌾
-const compress = std.compress;
-const unicode = std.unicode;
-const code_point = @import("code_point");
-const CodePoint = code_point.CodePoint;
-const CodePointIterator = code_point.Iterator;
-const CodePointReverseIterator = code_point.ReverseIterator;
-const uoffset = code_point.uoffset;
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
    return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
 }
+/// Returns an iterator over the graphemes in `string`.
 pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    return Iterator.init(string, graphemes);
 }
+/// Returns a reverse iterator over the graphemes in `string`.
 pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
    return ReverseIterator.init(string, graphemes);
 }
@@ -116,6 +110,96 @@ pub const Grapheme = struct {
    }
 };
+// NOTE: graphemeAtIndex is, probably, not in an optimal form.  It has the advantage
+// of being composed of other parts, but the constant factor can _probably_ be improved
+// by a bespoke implmentation using graphemes.graphemeBreak directly.  There's a limit
+// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
+// reached.  Perhaps you, Dear Reader, might pick up the torch?
+/// Returns the `Grapheme` at `string[index]`, which does not have to be a
+/// valid start of a codepoint.  Asserts the string is not empty.  Index must be
+/// less than `string.len`.  Always returns a `Grapheme`.
+pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
+    assert(string.len != 0);
+    if (index == 0 or (index > 0 and
+        string[index] < 0x80 and
+        string[index - 1] < 0x80) and
+        (string[index - 1] != '\r' and string[index] != '\n'))
+    {
+        // There's always a grapheme break between two ASCII code points (except CRLF)
+        var iter = graphemes.iterator(string[index..]);
+        const next = iter.next().?;
+        return Grapheme{
+            .len = next.len,
+            .offset = @as(u32, @intCast(index)) + next.offset,
+        };
+    } // Otherwise it gets hairy.
+    const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
+    if (idx == string.len) {
+        var iter = graphemes.reverseIterator(string);
+        return iter.prev().?;
+    }
+    // We're on a valid codepoint boundary, we go back from here
+    var r_iter = graphemes.reverseIterAtIndex(string, idx);
+    if (r_iter.prev()) |g| {
+        if (g.offset == 0) {
+            var iter = graphemes.iterator(string);
+            while (iter.next()) |g2| {
+                if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
+            }
+        }
+    }
+    // We need to toss one, because otherwise we might not be pending when
+    // we in fact need to be.
+    _ = r_iter.prev();
+    while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
+    var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
+    while (iter.next()) |g| {
+        if (g.offset <= idx and idx < g.offset + g.len) return g;
+    }
+    unreachable;
+}
+/// Return a (forward) iterator of `string` after `grapheme`.
+pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
+    return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
+}
+/// Return a reverse iterator of `string` before `grapheme`.
+pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
+    // This bit of weirdness is because reverse iterators are "advance last",
+    // while forward iterators are "advance first".  This leaves some room for
+    // further optimization, if anyone dares.
+    var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
+    _ = r_iter.prev();
+    return r_iter;
+}
+fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
+    var r_iter: ReverseIterator = undefined;
+    r_iter.data = graphemes;
+    var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+    r_iter.buf[1] = rcp_iter.prev();
+    r_iter.buf[0] = rcp_iter.prev();
+    r_iter.pending = .none;
+    r_iter.cp_iter = rcp_iter;
+    return r_iter;
+}
+fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
+    var iter: Iterator = undefined;
+    iter.data = graphemes;
+    iter.buf[0] = first: {
+        if (idx == string.len) break :first null;
+        var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+        break :first r_cp_iter.prev();
+    };
+    var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
+    iter.buf[1] = cp_iter.next();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
 /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
 pub const Iterator = struct {
    buf: [2]?CodePoint = .{ null, null },
@@ -150,7 +234,7 @@ pub const Iterator = struct {
        const gc_start = self.buf[0].?.offset;
        var gc_len: u8 = self.buf[0].?.len;
-        var state = State{};
+        var state = IterState{};
        if (graphemeBreak(
            self.buf[0].?.code,
@@ -189,12 +273,13 @@ pub const Iterator = struct {
    }
 };
+/// Iterate a string backward by Grapheme.
 pub const ReverseIterator = struct {
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointReverseIterator,
    data: *const Graphemes,
    /// Codepoint read from `cp_iter` but not returned by `previous`
-    pending: Pending = .{ .none = {} },
+    pending: Pending = .none,
    const Pending = union(enum) {
        none: void,
@@ -218,6 +303,12 @@ pub const ReverseIterator = struct {
        self.buf[0] = self.cp_iter.prev();
    }
+    pub fn peek(self: *Self) ?Grapheme {
+        const cache = .{ self.buf, self.cp_iter, self.pending };
+        defer self.buf, self.cp_iter, self.pending = cache;
+        return self.prev();
+    }
    pub fn prev(self: *Self) ?Grapheme {
        if (self.buf[1] == null) return null;
@@ -255,10 +346,10 @@ pub const ReverseIterator = struct {
        };
        while (self.buf[0] != null) {
-            var state: State = .{};
+            var state: IterState = .{};
-            state.setXpic();
+            state.xpic = true;
-            state.unsetRegional();
+            state.regional = false;
-            state.setIndic();
+            state.indic = true;
            if (graphemeBreak(
                self.buf[0].?.code,
@@ -269,7 +360,7 @@ pub const ReverseIterator = struct {
            self.advance();
-            if (!state.hasIndic()) {
+            if (!state.indic) {
                // BUF: [?Any, Extend | Linker] Consonant
                var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
@@ -296,11 +387,11 @@ pub const ReverseIterator = struct {
                            self.advance();
                            if (self.buf[0]) |cp1| {
-                                state.setIndic();
+                                state.indic = true;
                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
-                                if (!state.hasIndic()) {
+                                if (!state.indic) {
                                    continue :indic;
                                } else {
                                    break :indic;
@@ -321,7 +412,7 @@ pub const ReverseIterator = struct {
                }
            }
-            if (!state.hasXpic()) {
+            if (!state.xpic) {
                // BUF: [?Any, ZWJ] Emoji
                var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
@@ -370,7 +461,7 @@ pub const ReverseIterator = struct {
                }
            }
-            if (state.hasRegional()) {
+            if (state.regional) {
                var ri_count: usize = 0;
                while (self.buf[0] != null and
                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
@@ -404,6 +495,13 @@ pub const ReverseIterator = struct {
    }
 };
+/// Grapheme Iterator state.
+pub const IterState = packed struct(u3) {
+    xpic: bool = false,
+    regional: bool = false,
+    indic: bool = false,
+};
 // Predicates
 fn isBreaker(cp: u21, data: *const Graphemes) bool {
    // Extract relevant properties.
@@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
    return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
 }
-// Grapheme break state.
-pub const State = struct {
-    bits: u3 = 0,
-    // Extended Pictographic (emoji)
-    fn hasXpic(self: State) bool {
-        return self.bits & 1 == 1;
-    }
-    fn setXpic(self: *State) void {
-        self.bits |= 1;
-    }
-    fn unsetXpic(self: *State) void {
-        self.bits &= ~@as(u3, 1);
-    }
-    // Regional Indicatior (flags)
-    fn hasRegional(self: State) bool {
-        return self.bits & 2 == 2;
-    }
-    fn setRegional(self: *State) void {
-        self.bits |= 2;
-    }
-    fn unsetRegional(self: *State) void {
-        self.bits &= ~@as(u3, 2);
-    }
-    // Indic Conjunct
-    fn hasIndic(self: State) bool {
-        return self.bits & 4 == 4;
-    }
-    fn setIndic(self: *State) void {
-        self.bits |= 4;
-    }
-    fn unsetIndic(self: *State) void {
-        self.bits &= ~@as(u3, 4);
-    }
-};
 /// `graphemeBreak` returns true only if a grapheme break point is required
 /// between `cp1` and `cp2`. `state` should start out as 0. If calling
 /// iteratively over a sequence of code points, this function must be called
@@ -459,7 +519,7 @@ pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
    data: *const Graphemes,
-    state: *State,
+    state: *IterState,
 ) bool {
    // Extract relevant properties.
    const cp1_gbp_prop = data.gbp(cp1);
@@ -471,9 +531,9 @@ pub fn graphemeBreak(
    const cp2_is_emoji = data.isEmoji(cp2);
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
+    if (!state.xpic and cp1_is_emoji) state.xpic = true;
    // GB9c: Indic Conjunct Break
-    if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
+    if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
    // GB3: CR x LF
    if (cp1 == '\r' and cp2 == '\n') return false;
@@ -482,11 +542,11 @@ pub fn graphemeBreak(
    if (isBreaker(cp1, data)) return true;
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.hasXpic() and
+    if (state.xpic and
        cp1_gbp_prop == .ZWJ and
        cp2_is_emoji)
    {
-        state.unsetXpic();
+        state.xpic = false;
        return false;
    }
@@ -501,11 +561,11 @@ pub fn graphemeBreak(
    // GB12, GB13: RI x RI
    if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
-        if (state.hasRegional()) {
+        if (state.regional) {
-            state.unsetRegional();
+            state.regional = false;
            return true;
        } else {
-            state.setRegional();
+            state.regional = true;
            return false;
        }
    }
@@ -530,25 +590,25 @@ pub fn graphemeBreak(
    }
    // GB9c: Indic Conjunct Break
-    if (state.hasIndic() and
+    if (state.indic and
        cp1_indic_prop == .Consonant and
        (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
    {
        return false;
    }
-    if (state.hasIndic() and
+    if (state.indic and
        cp1_indic_prop == .Extend and
        cp2_indic_prop == .Linker)
    {
        return false;
    }
-    if (state.hasIndic() and
+    if (state.indic and
        (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
        cp2_indic_prop == .Consonant)
    {
-        state.unsetIndic();
+        state.indic = false;
        return false;
    }
@@ -608,3 +668,17 @@ test "Iterator.peek" {
    try std.testing.expectEqual(null, iter.peek());
    try std.testing.expectEqual(iter.peek(), iter.next());
 }
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const compress = std.compress;
+const unicode = std.unicode;
+const code_point = @import("code_point");
+const CodePoint = code_point.CodePoint;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
diff --git a/src/Words.zig b/src/Words.zig
index 1707881..af82562 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -124,12 +124,12 @@ pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
 }
 /// Returns an iterator after the `word` in `slice`.
-pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator {
+pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
    return forwardFromIndex(words, slice, word.offset + word.len);
 }
 /// Returns a reverse iterator before the `word` in `slice`.
-pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
+pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
    return reverseFromIndex(words, slice, word.offset);
 }
diff --git a/src/code_point.zig b/src/code_point.zig
index 8bd3d5b..16648af 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
    return null;
 }
+/// Return the codepoint at `index`, even if `index` is in the middle
+/// of that codepoint.
+pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
+    var idx = index;
+    while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {}
+    return decodeAtIndex(bytes, idx);
+}
 /// Decode the CodePoint, if any, at `bytes[idx]`.
-pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint {
+pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
-    var off = idx;
+    var off = index;
    return decodeAtCursor(bytes, &off);
 }
@@ -329,6 +337,54 @@ test Iterator {
    try expectEqual(@as(?CodePoint, null), iter.next());
 }
+const code_point = @This();
+// Keep this in sync with the README
+test "Code point iterator" {
+    const str = "Hi 😊";
+    var iter: code_point.Iterator = .init(str);
+    var i: usize = 0;
+    while (iter.next()) |cp| : (i += 1) {
+        // The `code` field is the actual code point scalar as a `u21`.
+        if (i == 0) try expect(cp.code == 'H');
+        if (i == 1) try expect(cp.code == 'i');
+        if (i == 2) try expect(cp.code == ' ');
+        if (i == 3) {
+            try expect(cp.code == '😊');
+            // The `offset` field is the byte offset in the
+            // source string.
+            try expect(cp.offset == 3);
+            try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?);
+            // The `len` field is the length in bytes of the
+            // code point in the source string.
+            try expect(cp.len == 4);
+            // There is also a 'cursor' decode, like so:
+            {
+                var cursor = cp.offset;
+                try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?);
+                // Which advances the cursor variable to the next possible
+                // offset, in this case, `str.len`.  Don't forget to account
+                // for this possibility!
+                try expectEqual(cp.offset + cp.len, cursor);
+            }
+            // There's also this, for when you aren't sure if you have the
+            // correct start for a code point:
+            try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?);
+        }
+        // Reverse iteration is also an option:
+        var r_iter: code_point.ReverseIterator = .init(str);
+        // Both iterators can be peeked:
+        try expectEqual('😊', r_iter.peek().?.code);
+        try expectEqual('😊', r_iter.prev().?.code);
+        // Both kinds of iterators can be reversed:
+        var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator();
+        // This will always return the last codepoint from
+        // the prior iterator, _if_ it yielded one:
+        try expectEqual('😊', fwd_iter.next().?.code);
+    }
+}
 test "overlongs" {
    // None of these should equal `/`, all should be byte-for-byte
    // handled as replacement characters.
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index c463dcc..ae177a9 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -162,20 +162,51 @@ test "Segmentation GraphemeIterator" {
            bytes_index += cp_index;
        }
+        const this_str = all_bytes.items;
        {
-            var iter = graph.iterator(all_bytes.items);
+            var iter = graph.iterator(this_str);
            // Check.
-            for (want.items) |want_gc| {
+            for (want.items, 1..) |want_gc, idx| {
                const got_gc = (iter.next()).?;
                try std.testing.expectEqualStrings(
-                    want_gc.bytes(all_bytes.items),
+                    want_gc.bytes(this_str),
-                    got_gc.bytes(all_bytes.items),
+                    got_gc.bytes(this_str),
                );
+                for (got_gc.offset..got_gc.offset + got_gc.len) |i| {
+                    const this_gc = graph.graphemeAtIndex(this_str, i);
+                    std.testing.expectEqualSlices(
+                        u8,
+                        got_gc.bytes(this_str),
+                        this_gc.bytes(this_str),
+                    ) catch |err| {
+                        debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
+                        return err;
+                    };
+                }
+                var after_iter = graph.iterateAfterGrapheme(this_str, got_gc);
+                if (after_iter.next()) |next_gc| {
+                    if (iter.peek()) |next_peek| {
+                        std.testing.expectEqualSlices(
+                            u8,
+                            next_gc.bytes(this_str),
+                            next_peek.bytes(this_str),
+                        ) catch |err| {
+                            debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx });
+                            return err;
+                        };
+                    } else {
+                        debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx });
+                        try testing.expect(false);
+                    }
+                } else {
+                    try testing.expectEqual(null, iter.peek());
+                }
            }
        }
        {
-            var iter = graph.reverseIterator(all_bytes.items);
+            var iter = graph.reverseIterator(this_str);
            // Check.
            var i: usize = want.items.len;
@@ -190,8 +221,8 @@ test "Segmentation GraphemeIterator" {
                    return error.TestExpectedEqual;
                };
                std.testing.expectEqualStrings(
-                    want_gc.bytes(all_bytes.items),
+                    want_gc.bytes(this_str),
-                    got_gc.bytes(all_bytes.items),
+                    got_gc.bytes(this_str),
                ) catch |err| {
                    std.debug.print(
                        "line {d} grapheme {d}: expected {any} found {any}\n",
@@ -199,6 +230,24 @@ test "Segmentation GraphemeIterator" {
                    );
                    return err;
                };
+                var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc);
+                if (before_iter.prev()) |prev_gc| {
+                    if (iter.peek()) |prev_peek| {
+                        std.testing.expectEqualSlices(
+                            u8,
+                            prev_gc.bytes(this_str),
+                            prev_peek.bytes(this_str),
+                        ) catch |err| {
+                            debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i });
+                            return err;
+                        };
+                    } else {
+                        debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i });
+                        try testing.expect(false);
+                    }
+                } else {
+                    try testing.expectEqual(null, iter.peek());
+                }
            }
        }
    }
@@ -287,7 +336,7 @@ test "Segmentation Word Iterator" {
                } else {
                    try testing.expect(false);
                }
-                var peek_iter = wb.iterateAfter(this_str, got_word);
+                var peek_iter = wb.iterateAfterWord(this_str, got_word);
                const peek_1 = peek_iter.next();
                if (peek_1) |p1| {
                    const peek_2 = iter.peek();
@@ -313,7 +362,7 @@ test "Segmentation Word Iterator" {
                        got_word.bytes(this_str),
                        this_word.bytes(this_str),
                    ) catch |err| {
-                        debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
+                        debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
                        return err;
                    };
                }
@@ -356,7 +405,7 @@ test "Segmentation Word Iterator" {
                } else {
                    try testing.expect(false);
                }
-                var peek_iter = wb.iterateBefore(this_str, got_word);
+                var peek_iter = wb.iterateBeforeWord(this_str, got_word);
                const peek_1 = peek_iter.prev();
                if (peek_1) |p1| {
                    const peek_2 = r_iter.peek();
author	Sam Atman	2025-06-01 14:08:25 -0400
committer	Sam Atman	2025-06-01 14:08:25 -0400
commit	8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 (patch)
tree	4ec54815215a9a808be0ab9a2968159f144ba076 /src
parent	Document "fat_offset" in README (diff)
download	zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.gz zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.xz zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.zip