From 94b1f37474c7444d8129445ae7984f922cb9c283 Mon Sep 17 00:00:00 2001
From: Matteo Romano
Date: Mon, 12 May 2025 12:14:14 +0200
Subject: fix: State.unset* did toggle the bit instead of unsetting it

---
 src/Graphemes.zig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/Graphemes.zig')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..5780ed4 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -258,7 +258,7 @@ pub const State = struct {
         self.bits |= 1;
     }
     fn unsetXpic(self: *State) void {
-        self.bits ^= 1;
+        self.bits &= ~@as(u3, 1);
     }
 
     // Regional Indicatior (flags)
@@ -269,7 +269,7 @@ pub const State = struct {
         self.bits |= 2;
     }
     fn unsetRegional(self: *State) void {
-        self.bits ^= 2;
+        self.bits &= ~@as(u3, 2);
     }
 
     // Indic Conjunct
@@ -280,7 +280,7 @@ pub const State = struct {
         self.bits |= 4;
     }
     fn unsetIndic(self: *State) void {
-        self.bits ^= 4;
+        self.bits &= ~@as(u3, 4);
     }
 };
 
-- 
cgit v1.2.3


From 890370f5479299940f505e1247c408064f789bd5 Mon Sep 17 00:00:00 2001
From: Matteo Romano
Date: Mon, 12 May 2025 12:14:30 +0200
Subject: feat: add reverse grapheme iterator

Closes #53
---
 src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 220 insertions(+)

(limited to 'src/Graphemes.zig')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 5780ed4..3bff18d 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -7,6 +7,7 @@ const unicode = std.unicode;
 
 const CodePoint = @import("code_point").CodePoint;
 const CodePointIterator = @import("code_point").Iterator;
+const CodePointReverseIterator = @import("code_point").ReverseIterator;
 
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
     return Iterator.init(string, graphemes);
 }
 
+pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
+    return ReverseIterator.init(string, graphemes);
+}
+
 /// Indic syllable type.
 pub const Indic = enum {
     none,
@@ -239,6 +244,221 @@ pub const Iterator = struct {
     }
 };
 
+pub const ReverseIterator = struct {
+    buf: [2]?CodePoint = .{ null, null },
+    cp_iter: CodePointReverseIterator,
+    data: *const Graphemes,
+    /// Codepoint read from `cp_iter` but not returned by `previous`
+    pending: Pending = .{ .none = {} },
+
+    const Pending = union(enum) {
+        none: void,
+        /// Count of pending RI codepoints, it is an even number
+        ri_count: usize,
+        /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
+        extend_end: u32,
+    };
+
+    const Self = @This();
+
+    pub fn init(str: []const u8, data: *const Graphemes) Self {
+        var self: Self = .{ .cp_iter = .init(str), .data = data };
+        self.advance();
+        self.advance();
+        return self;
+    }
+
+    fn advance(self: *Self) void {
+        self.buf[1] = self.buf[0];
+        self.buf[0] = self.cp_iter.prev();
+    }
+
+    pub fn prev(self: *Self) ?Grapheme {
+        if (self.buf[1] == null) return null;
+
+        const grapheme_end: u32 = end: {
+            const codepoint = self.buf[1].?;
+
+            switch (self.pending) {
+                // BUF: [?Any, Any]
+                .none => break :end codepoint.offset + codepoint.len,
+                .ri_count => |ri_count| {
+                    std.debug.assert(ri_count > 0);
+                    std.debug.assert(ri_count % 2 == 0);
+
+                    if (ri_count > 2) {
+                        self.pending.ri_count -= 2;
+
+                        // Use the fact that all RI have length 4 in utf8 encoding
+                        // since they are in range 0x1f1e6...0x1f1ff
+                        // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                        return Grapheme{
+                            .len = 8,
+                            .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
+                        };
+                    } else {
+                        self.pending = .{ .none = {} };
+                        break :end codepoint.offset + codepoint.len + 4;
+                    }
+                },
+                // BUF: [?Any, Extend] Extend* ZWJ
+                .extend_end => |extend_end| {
+                    self.pending = .{ .none = {} };
+                    break :end extend_end;
+                },
+            }
+        };
+
+        while (self.buf[0] != null) {
+            var state: State = .{};
+            state.setXpic();
+            state.unsetRegional();
+            state.setIndic();
+
+            if (graphemeBreak(
+                self.buf[0].?.code,
+                self.buf[1].?.code,
+                self.data,
+                &state,
+            )) break;
+
+            self.advance();
+
+            if (!state.hasIndic()) {
+
+                // BUF: [?Any, Extend | Linker] Consonant
+                var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+
+                indic: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = indic_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - indic_offset),
+                            .offset = indic_offset,
+                        };
+                    }
+
+                    const codepoint = self.buf[0].?;
+
+                    switch (self.data.indic(codepoint.code)) {
+                        .Extend, .Linker => {
+                            self.advance();
+                            continue :indic;
+                        },
+                        .Consonant => {
+                            // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
+                            indic_offset = codepoint.offset;
+                            self.advance();
+
+                            if (self.buf[0]) |cp1| {
+                                state.setIndic();
+
+                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
+
+                                if (!state.hasIndic()) {
+                                    continue :indic;
+                                } else {
+                                    break :indic;
+                                }
+                            } else {
+                                break :indic;
+                            }
+                        },
+                        .none => {
+                            // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
+                            self.pending = .{ .extend_end = indic_offset };
+                            return .{
+                                .len = @intCast(grapheme_end - indic_offset),
+                                .offset = indic_offset,
+                            };
+                        },
+                    }
+                }
+            }
+
+            if (!state.hasXpic()) {
+                // BUF: [?Any, ZWJ] Emoji
+                var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+
+                // Look for previous Emoji
+                emoji: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+
+                    const codepoint = self.buf[0].?;
+
+                    if (self.data.gbp(codepoint.code) == .Extend) {
+                        self.advance();
+                        continue :emoji;
+                    }
+
+                    if (self.data.isEmoji(codepoint.code)) {
+                        // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
+                        emoji_offset = codepoint.offset;
+                        self.advance();
+
+                        if (self.buf[0] != null and
+                            // ZWJ = 0x200d
+                            self.buf[0].?.code == 0x200d)
+                        {
+                            // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
+                            // Back at the beginning of the loop, "recursively" look for emoji
+                            self.advance();
+                            continue :emoji;
+                        } else {
+                            // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
+                            break :emoji;
+                        }
+                    } else {
+                        // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                }
+            }
+
+            if (state.hasRegional()) {
+                var ri_count: usize = 0;
+                while (self.buf[0] != null and
+                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
+                {
+                    ri_count += 1;
+                    self.advance();
+                }
+
+                // Use the fact that all RI have length 4 in utf8 encoding
+                // since they are in range 0x1f1e6...0x1f1ff
+                // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                if (ri_count == 0) {
+                    // There are no pending RI codepoints
+                } else if (ri_count % 2 == 0) {
+                    self.pending = .{ .ri_count = ri_count };
+                    return .{ .len = 8, .offset = grapheme_end - 8 };
+                } else {
+                    // Add one to count for the unused RI
+                    self.pending = .{ .ri_count = ri_count + 1 };
+                    return .{ .len = 4, .offset = grapheme_end - 4 };
+                }
+            }
+        }
+
+        const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
+        self.advance();
+        return .{
+            .len = @intCast(grapheme_end - grapheme_start),
+            .offset = grapheme_start,
+        };
+    }
+};
+
 // Predicates
 fn isBreaker(cp: u21, data: *const Graphemes) bool {
     // Extract relevant properties.
-- 
cgit v1.2.3


From 04123c2280088acbe4501bbe4c314ca64ff27dab Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Mon, 12 May 2025 12:57:04 -0400
Subject: Vastly simplify peek()

Idiomatic Zig takes awhile, what can I say (yes I wrote the first one).
---
 src/Graphemes.zig | 63 +++----------------------------------------------------
 1 file changed, 3 insertions(+), 60 deletions(-)

(limited to 'src/Graphemes.zig')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..1ce1ea6 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -99,7 +99,7 @@ pub const Gbp = enum {
 
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
-    len: u8,
+    len: u32,
     offset: u32,
 
     /// `bytes` returns the slice of bytes that correspond to
@@ -173,69 +173,12 @@ pub const Iterator = struct {
         const saved_cp_iter = self.cp_iter;
         const s0 = self.buf[0];
         const s1 = self.buf[1];
-
-        self.advance();
-
-        // If no more
-        if (self.buf[0] == null) {
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return null;
-        }
-        // If last one
-        if (self.buf[1] == null) {
-            const len = self.buf[0].?.len;
-            const offset = self.buf[0].?.offset;
+        defer {
             self.cp_iter = saved_cp_iter;
             self.buf[0] = s0;
             self.buf[1] = s1;
-            return Grapheme{ .len = len, .offset = offset };
         }
-        // If ASCII
-        if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
-            const len = self.buf[0].?.len;
-            const offset = self.buf[0].?.offset;
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return Grapheme{ .len = len, .offset = offset };
-        }
-
-        const gc_start = self.buf[0].?.offset;
-        var gc_len: u8 = self.buf[0].?.len;
-        var state = State{};
-
-        if (graphemeBreak(
-            self.buf[0].?.code,
-            self.buf[1].?.code,
-            self.data,
-            &state,
-        )) {
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return Grapheme{ .len = gc_len, .offset = gc_start };
-        }
-
-        while (true) {
-            self.advance();
-            if (self.buf[0] == null) break;
-
-            gc_len += self.buf[0].?.len;
-
-            if (graphemeBreak(
-                self.buf[0].?.code,
-                if (self.buf[1]) |ncp| ncp.code else 0,
-                self.data,
-                &state,
-            )) break;
-        }
-        self.cp_iter = saved_cp_iter;
-        self.buf[0] = s0;
-        self.buf[1] = s1;
-
-        return Grapheme{ .len = gc_len, .offset = gc_start };
+        return self.next();
     }
 };
 
-- 
cgit v1.2.3


From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Mon, 12 May 2025 15:22:37 -0400
Subject: Begin conformance test

I'm not sure the details of this strategy can actually be made to work.
But, something can.
---
 src/Graphemes.zig | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'src/Graphemes.zig')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1ce1ea6..1f67fc6 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
         try std.testing.expectEqual(@as(usize, 2), i);
     }
 }
+
+test "Iterator.peek" {
+    const peek_seq = "aΔ👨🏻‍🌾→";
+    const data = try Graphemes.init(std.testing.allocator);
+    defer data.deinit(std.testing.allocator);
+
+    var iter = data.iterator(peek_seq);
+    const peek_a = iter.peek().?;
+    const next_a = iter.next().?;
+    try std.testing.expectEqual(peek_a, next_a);
+    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
+    const peek_d1 = iter.peek().?;
+    const peek_d2 = iter.peek().?;
+    try std.testing.expectEqual(peek_d1, peek_d2);
+    const next_d = iter.next().?;
+    try std.testing.expectEqual(peek_d2, next_d);
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+}
-- 
cgit v1.2.3


From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Fri, 23 May 2025 18:46:30 -0400
Subject: Make offset size configurable

Hopefully I can talk users out of taking advantage of this configuration
but I'll have better luck with that if it's available.
---
 src/Graphemes.zig | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'src/Graphemes.zig')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 0338c04..49fdbf3 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -5,9 +5,11 @@ const Allocator = mem.Allocator;
 const compress = std.compress;
 const unicode = std.unicode;
 
-const CodePoint = @import("code_point").CodePoint;
-const CodePointIterator = @import("code_point").Iterator;
-const CodePointReverseIterator = @import("code_point").ReverseIterator;
+const code_point = @import("code_point");
+const CodePoint = code_point.CodePoint;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
 
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -104,8 +106,8 @@ pub const Gbp = enum {
 
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
-    len: u32,
-    offset: u32,
+    len: uoffset,
+    offset: uoffset,
 
     /// `bytes` returns the slice of bytes that correspond to
     /// this grapheme cluster in `src`.
@@ -199,7 +201,7 @@ pub const ReverseIterator = struct {
         /// Count of pending RI codepoints, it is an even number
         ri_count: usize,
         /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
-        extend_end: u32,
+        extend_end: uoffset,
     };
 
     const Self = @This();
@@ -219,7 +221,7 @@ pub const ReverseIterator = struct {
     pub fn prev(self: *Self) ?Grapheme {
         if (self.buf[1] == null) return null;
 
-        const grapheme_end: u32 = end: {
+        const grapheme_end: uoffset = end: {
             const codepoint = self.buf[1].?;
 
             switch (self.pending) {
@@ -270,7 +272,7 @@ pub const ReverseIterator = struct {
             if (!state.hasIndic()) {
 
                 // BUF: [?Any, Extend | Linker] Consonant
-                var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
 
                 indic: while (true) {
                     if (self.buf[0] == null) {
@@ -321,7 +323,7 @@ pub const ReverseIterator = struct {
 
             if (!state.hasXpic()) {
                 // BUF: [?Any, ZWJ] Emoji
-                var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
 
                 // Look for previous Emoji
                 emoji: while (true) {
-- 
cgit v1.2.3


From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Sun, 1 Jun 2025 14:08:25 -0400
Subject: Add graphemeAtIndex + iterate before and after

That completes the set.  I do think it's possible to bum a few more
cycles from the implementation, but, I'm not going to.  It passes
the acceptance suite and that's what it needs to do.
---
 src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 147 insertions(+), 73 deletions(-)

(limited to 'src/Graphemes.zig')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 49fdbf3..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,15 +1,7 @@
-const std = @import("std");
-const builtin = @import("builtin");
-const mem = std.mem;
-const Allocator = mem.Allocator;
-const compress = std.compress;
-const unicode = std.unicode;
-
-const code_point = @import("code_point");
-const CodePoint = code_point.CodePoint;
-const CodePointIterator = code_point.Iterator;
-const CodePointReverseIterator = code_point.ReverseIterator;
-const uoffset = code_point.uoffset;
+//! Graphemes Module
+//!
+//! Code for handling graphemes: fragments of string which should be
+//! treated as one unit.  Like Farmer Bob here: 👨🏻‍🌾
 
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
     return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
 }
 
+/// Returns an iterator over the graphemes in `string`.
 pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
     return Iterator.init(string, graphemes);
 }
 
+/// Returns a reverse iterator over the graphemes in `string`.
 pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
     return ReverseIterator.init(string, graphemes);
 }
@@ -116,6 +110,96 @@ pub const Grapheme = struct {
     }
 };
 
+// NOTE: graphemeAtIndex is, probably, not in an optimal form.  It has the advantage
+// of being composed of other parts, but the constant factor can _probably_ be improved
+// by a bespoke implementation using graphemes.graphemeBreak directly.  There's a limit
+// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
+// reached.  Perhaps you, Dear Reader, might pick up the torch?
+
+/// Returns the `Grapheme` containing `string[index]`; `index` need not be the
+/// start of a codepoint.  Asserts the string is not empty.  Index must be
+/// less than `string.len`.  Always returns a `Grapheme`.
+pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
+    assert(string.len != 0);
+    if (index == 0 or (index > 0 and
+        string[index] < 0x80 and
+        string[index - 1] < 0x80) and
+        (string[index - 1] != '\r' and string[index] != '\n'))
+    {
+        // Fast path: a grapheme break always separates two ASCII code points (except CRLF)
+        var iter = graphemes.iterator(string[index..]);
+        const next = iter.next().?;
+        return Grapheme{
+            .len = next.len,
+            .offset = @as(uoffset, @intCast(index)) + next.offset, // match Grapheme.offset; don't hard-code u32
+        };
+    } // Otherwise it gets hairy.
+    const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
+    if (idx == string.len) {
+        var iter = graphemes.reverseIterator(string);
+        return iter.prev().?;
+    }
+    // We're on a valid codepoint boundary, we go back from here
+    var r_iter = graphemes.reverseIterAtIndex(string, idx);
+    if (r_iter.prev()) |g| {
+        if (g.offset == 0) {
+            var iter = graphemes.iterator(string);
+            while (iter.next()) |g2| {
+                if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
+            }
+        }
+    }
+    // We need to toss one, because otherwise we might not be pending when
+    // we in fact need to be.
+    _ = r_iter.prev();
+    while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
+    var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
+    while (iter.next()) |g| {
+        if (g.offset <= idx and idx < g.offset + g.len) return g;
+    }
+    unreachable;
+}
+
+/// Return a (forward) iterator of `string` after `grapheme`.
+pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
+    return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
+}
+
+/// Return a reverse iterator of `string` before `grapheme`.
+pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
+    // This bit of weirdness is because reverse iterators are "advance last",
+    // while forward iterators are "advance first".  This leaves some room for
+    // further optimization, if anyone dares.
+    var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
+    _ = r_iter.prev();
+    return r_iter;
+}
+
+fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
+    var r_iter: ReverseIterator = undefined;
+    r_iter.data = graphemes;
+    var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+    r_iter.buf[1] = rcp_iter.prev();
+    r_iter.buf[0] = rcp_iter.prev();
+    r_iter.pending = .none;
+    r_iter.cp_iter = rcp_iter;
+    return r_iter;
+}
+
+fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
+    var iter: Iterator = undefined;
+    iter.data = graphemes;
+    iter.buf[0] = first: {
+        if (idx == string.len) break :first null;
+        var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+        break :first r_cp_iter.prev();
+    };
+    var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
+    iter.buf[1] = cp_iter.next();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
+
 /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
 pub const Iterator = struct {
     buf: [2]?CodePoint = .{ null, null },
@@ -150,7 +234,7 @@ pub const Iterator = struct {
 
         const gc_start = self.buf[0].?.offset;
         var gc_len: u8 = self.buf[0].?.len;
-        var state = State{};
+        var state = IterState{};
 
         if (graphemeBreak(
             self.buf[0].?.code,
@@ -189,12 +273,13 @@ pub const Iterator = struct {
     }
 };
 
+/// Iterate a string backward by Grapheme.
 pub const ReverseIterator = struct {
     buf: [2]?CodePoint = .{ null, null },
     cp_iter: CodePointReverseIterator,
     data: *const Graphemes,
     /// Codepoint read from `cp_iter` but not returned by `previous`
-    pending: Pending = .{ .none = {} },
+    pending: Pending = .none,
 
     const Pending = union(enum) {
         none: void,
@@ -218,6 +303,12 @@ pub const ReverseIterator = struct {
         self.buf[0] = self.cp_iter.prev();
     }
 
+    pub fn peek(self: *Self) ?Grapheme {
+        const cache = .{ self.buf, self.cp_iter, self.pending }; // snapshot the state `prev` mutates
+        defer self.buf, self.cp_iter, self.pending = cache; // restore it after `prev` has run
+        return self.prev();
+    }
+
     pub fn prev(self: *Self) ?Grapheme {
         if (self.buf[1] == null) return null;
 
@@ -255,10 +346,10 @@ pub const ReverseIterator = struct {
         };
 
         while (self.buf[0] != null) {
-            var state: State = .{};
-            state.setXpic();
-            state.unsetRegional();
-            state.setIndic();
+            var state: IterState = .{};
+            state.xpic = true;
+            state.regional = false;
+            state.indic = true;
 
             if (graphemeBreak(
                 self.buf[0].?.code,
@@ -269,7 +360,7 @@ pub const ReverseIterator = struct {
 
             self.advance();
 
-            if (!state.hasIndic()) {
+            if (!state.indic) {
 
                 // BUF: [?Any, Extend | Linker] Consonant
                 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
@@ -296,11 +387,11 @@ pub const ReverseIterator = struct {
                             self.advance();
 
                             if (self.buf[0]) |cp1| {
-                                state.setIndic();
+                                state.indic = true;
 
                                 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
 
-                                if (!state.hasIndic()) {
+                                if (!state.indic) {
                                     continue :indic;
                                 } else {
                                     break :indic;
@@ -321,7 +412,7 @@ pub const ReverseIterator = struct {
                 }
             }
 
-            if (!state.hasXpic()) {
+            if (!state.xpic) {
                 // BUF: [?Any, ZWJ] Emoji
                 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
 
@@ -370,7 +461,7 @@ pub const ReverseIterator = struct {
                 }
             }
 
-            if (state.hasRegional()) {
+            if (state.regional) {
                 var ri_count: usize = 0;
                 while (self.buf[0] != null and
                     self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
@@ -404,6 +495,13 @@ pub const ReverseIterator = struct {
     }
 };
 
+/// Grapheme iterator state: one flag per stateful rule in `graphemeBreak`.
+pub const IterState = packed struct(u3) {
+    xpic: bool = false, // GB11: set when cp1 is an emoji, cleared when ZWJ x Emoji joins
+    regional: bool = false, // GB12/GB13: toggled on each RI x RI pair
+    indic: bool = false, // GB9c: set when cp1 is a consonant, cleared when Linker/ZWJ x Consonant joins
+};
+
 // Predicates
 fn isBreaker(cp: u21, data: *const Graphemes) bool {
     // Extract relevant properties.
@@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
     return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
 }
 
-// Grapheme break state.
-pub const State = struct {
-    bits: u3 = 0,
-
-    // Extended Pictographic (emoji)
-    fn hasXpic(self: State) bool {
-        return self.bits & 1 == 1;
-    }
-    fn setXpic(self: *State) void {
-        self.bits |= 1;
-    }
-    fn unsetXpic(self: *State) void {
-        self.bits &= ~@as(u3, 1);
-    }
-
-    // Regional Indicatior (flags)
-    fn hasRegional(self: State) bool {
-        return self.bits & 2 == 2;
-    }
-    fn setRegional(self: *State) void {
-        self.bits |= 2;
-    }
-    fn unsetRegional(self: *State) void {
-        self.bits &= ~@as(u3, 2);
-    }
-
-    // Indic Conjunct
-    fn hasIndic(self: State) bool {
-        return self.bits & 4 == 4;
-    }
-    fn setIndic(self: *State) void {
-        self.bits |= 4;
-    }
-    fn unsetIndic(self: *State) void {
-        self.bits &= ~@as(u3, 4);
-    }
-};
-
 /// `graphemeBreak` returns true only if a grapheme break point is required
 /// between `cp1` and `cp2`. `state` should start out as 0. If calling
 /// iteratively over a sequence of code points, this function must be called
@@ -459,7 +519,7 @@ pub fn graphemeBreak(
     cp1: u21,
     cp2: u21,
     data: *const Graphemes,
-    state: *State,
+    state: *IterState,
 ) bool {
     // Extract relevant properties.
     const cp1_gbp_prop = data.gbp(cp1);
@@ -471,9 +531,9 @@ pub fn graphemeBreak(
     const cp2_is_emoji = data.isEmoji(cp2);
 
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
+    if (!state.xpic and cp1_is_emoji) state.xpic = true;
     // GB9c: Indic Conjunct Break
-    if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
+    if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
 
     // GB3: CR x LF
     if (cp1 == '\r' and cp2 == '\n') return false;
@@ -482,11 +542,11 @@ pub fn graphemeBreak(
     if (isBreaker(cp1, data)) return true;
 
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.hasXpic() and
+    if (state.xpic and
         cp1_gbp_prop == .ZWJ and
         cp2_is_emoji)
     {
-        state.unsetXpic();
+        state.xpic = false;
         return false;
     }
 
@@ -501,11 +561,11 @@ pub fn graphemeBreak(
 
     // GB12, GB13: RI x RI
     if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
-        if (state.hasRegional()) {
-            state.unsetRegional();
+        if (state.regional) {
+            state.regional = false;
             return true;
         } else {
-            state.setRegional();
+            state.regional = true;
             return false;
         }
     }
@@ -530,25 +590,25 @@ pub fn graphemeBreak(
     }
 
     // GB9c: Indic Conjunct Break
-    if (state.hasIndic() and
+    if (state.indic and
         cp1_indic_prop == .Consonant and
         (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
     {
         return false;
     }
 
-    if (state.hasIndic() and
+    if (state.indic and
         cp1_indic_prop == .Extend and
         cp2_indic_prop == .Linker)
     {
         return false;
     }
 
-    if (state.hasIndic() and
+    if (state.indic and
         (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
         cp2_indic_prop == .Consonant)
     {
-        state.unsetIndic();
+        state.indic = false;
         return false;
     }
 
@@ -608,3 +668,17 @@ test "Iterator.peek" {
     try std.testing.expectEqual(null, iter.peek());
     try std.testing.expectEqual(iter.peek(), iter.next());
 }
+
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const compress = std.compress;
+const unicode = std.unicode;
+
+const code_point = @import("code_point");
+const CodePoint = code_point.CodePoint;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
-- 
cgit v1.2.3