Merge branch 'develop-next'HEAD v0.14.1 master

author: Sam Atman 2025-07-08 12:15:32 -0400
committer: Sam Atman 2025-07-08 12:15:32 -0400
commit: 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree: 2607c185fd8053b84d60041fadc35c05a0225d34 /src/Graphemes.zig
parent: Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent: Add Words.zig example to README (diff)
download: zg-master.tar.gz
zg-master.tar.xz
zg-master.zip
1 files changed, 370 insertions, 109 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,12 +1,7 @@
-const std = @import("std");
+//! Graphemes Module
-const builtin = @import("builtin");
+//!
-const mem = std.mem;
+//! Code for handling graphemes: fragments of string which should be
-const Allocator = mem.Allocator;
+//! treated as one unit.  Like Farmer Bob here: 👨🏻‍🌾
-const compress = std.compress;
-const unicode = std.unicode;
-const CodePoint = @import("code_point").CodePoint;
-const CodePointIterator = @import("code_point").Iterator;
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -66,10 +61,16 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
    return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
 }
+/// Returns an iterator over the graphemes in `string`.
 pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    return Iterator.init(string, graphemes);
 }
+/// Returns an iterator that walks the graphemes of `string` from back to front.
+pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
+    return .init(string, graphemes);
+}
 /// Indic syllable type.
 pub const Indic = enum {
    none,
@@ -99,8 +100,8 @@ pub const Gbp = enum {
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
-    len: u8,
+    len: uoffset,
-    offset: u32,
+    offset: uoffset,
    /// `bytes` returns the slice of bytes that correspond to
    /// this grapheme cluster in `src`.
@@ -109,6 +110,96 @@ pub const Grapheme = struct {
    }
 };
+// NOTE: graphemeAtIndex is, probably, not in an optimal form.  It has the advantage
+// of being composed of other parts, but the constant factor can _probably_ be improved
+// by a bespoke implementation using graphemes.graphemeBreak directly.  There's a limit
+// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
+// reached.  Perhaps you, Dear Reader, might pick up the torch?
+/// Returns the `Grapheme` containing the byte at `string[index]`; `index` does
+/// not have to be a valid start of a codepoint.  Asserts the string is not
+/// empty and that `index` is less than `string.len`.  Always returns a `Grapheme`.
+pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
+    assert(string.len != 0);
+    assert(index < string.len);
+    if (index == 0 or (index > 0 and
+        string[index] < 0x80 and
+        string[index - 1] < 0x80) and
+        (string[index - 1] != '\r' and string[index] != '\n'))
+    {
+        // There's always a grapheme break between two ASCII code points (except CRLF)
+        var iter = graphemes.iterator(string[index..]);
+        const next = iter.next().?;
+        return Grapheme{
+            .len = next.len,
+            // `next.offset` is relative to the sub-slice; rebase it onto `string`.
+            // Cast to `uoffset`, the declared type of `Grapheme.offset`, rather
+            // than a hard-coded integer width.
+            .offset = @as(uoffset, @intCast(index)) + next.offset,
+        };
+    } // Otherwise it gets hairy.
+    const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
+    if (idx == string.len) {
+        var iter = graphemes.reverseIterator(string);
+        return iter.prev().?;
+    }
+    // We're on a valid codepoint boundary, we go back from here
+    var r_iter = graphemes.reverseIterAtIndex(string, idx);
+    if (r_iter.prev()) |g| {
+        if (g.offset == 0) {
+            // The reverse step already reached the start of the string, so a
+            // forward scan from the beginning finds the enclosing grapheme.
+            var iter = graphemes.iterator(string);
+            while (iter.next()) |g2| {
+                if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
+            }
+        }
+    }
+    // We need to toss one, because otherwise we might not be pending when
+    // we in fact need to be.
+    _ = r_iter.prev();
+    // Keep stepping back until the reverse iterator has nothing pending.
+    while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
+    // Scan forward from where the reverse iterator stopped until the grapheme
+    // covering `idx` turns up.
+    var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
+    while (iter.next()) |g| {
+        if (g.offset <= idx and idx < g.offset + g.len) return g;
+    }
+    unreachable;
+}
+/// Builds a forward iterator over `string` that starts at the byte just past `grapheme`.
+pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
+    const resume_at = grapheme.offset + grapheme.len;
+    return graphemes.iterAtIndex(string, resume_at);
+}
+/// Builds a reverse iterator over `string` positioned before `grapheme`.
+pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
+    // Reverse iterators are "advance last" whereas forward iterators are
+    // "advance first", so we start on the final byte of `grapheme` and throw
+    // away one step.  Anyone brave enough is welcome to optimize this.
+    const last_byte = grapheme.offset + grapheme.len - 1;
+    var before = graphemes.reverseIterAtIndex(string, last_byte);
+    _ = before.prev();
+    return before;
+}
+// Hand-builds a `ReverseIterator` over `string`: both buffer slots are primed
+// from a reverse codepoint iterator placed at `idx`, and nothing is pending.
+fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
+    var cursor: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+    // Order matters: the first codepoint read fills slot 1, the second slot 0.
+    const newer = cursor.prev();
+    const older = cursor.prev();
+    return .{
+        .buf = .{ older, newer },
+        .cp_iter = cursor,
+        .data = graphemes,
+        .pending = .none,
+    };
+}
+// Hand-builds a forward `Iterator` over `string` starting at byte `idx`.
+// Slot 0 is primed from a reverse codepoint iterator placed at `idx` (or left
+// null when `idx == string.len`); slot 1 is the first codepoint read forward
+// from `idx`.
+fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
+    var forward: CodePointIterator = .{ .bytes = string, .i = idx };
+    var result: Iterator = undefined;
+    result.data = graphemes;
+    if (idx == string.len) {
+        result.buf[0] = null;
+    } else {
+        var backward: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+        result.buf[0] = backward.prev();
+    }
+    result.buf[1] = forward.next();
+    result.cp_iter = forward;
+    return result;
+}
 /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
 pub const Iterator = struct {
    buf: [2]?CodePoint = .{ null, null },
@@ -143,7 +234,7 @@ pub const Iterator = struct {
        const gc_start = self.buf[0].?.offset;
        var gc_len: u8 = self.buf[0].?.len;
-        var state = State{};
+        var state = IterState{};
        if (graphemeBreak(
            self.buf[0].?.code,
@@ -173,72 +264,244 @@ pub const Iterator = struct {
        const saved_cp_iter = self.cp_iter;
        const s0 = self.buf[0];
        const s1 = self.buf[1];
+        defer {
-        self.advance();
-        // If no more
-        if (self.buf[0] == null) {
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return null;
-        }
-        // If last one
-        if (self.buf[1] == null) {
-            const len = self.buf[0].?.len;
-            const offset = self.buf[0].?.offset;
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return Grapheme{ .len = len, .offset = offset };
-        }
-        // If ASCII
-        if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
-            const len = self.buf[0].?.len;
-            const offset = self.buf[0].?.offset;
            self.cp_iter = saved_cp_iter;
            self.buf[0] = s0;
            self.buf[1] = s1;
-            return Grapheme{ .len = len, .offset = offset };
        }
+        return self.next();
+    }
+};
-        const gc_start = self.buf[0].?.offset;
+/// Iterate a string backward by Grapheme.
-        var gc_len: u8 = self.buf[0].?.len;
+pub const ReverseIterator = struct {
-        var state = State{};
+    buf: [2]?CodePoint = .{ null, null },
+    cp_iter: CodePointReverseIterator,
+    data: *const Graphemes,
+    /// Codepoint read from `cp_iter` but not returned by `prev`
+    pending: Pending = .none,
-        if (graphemeBreak(
+    const Pending = union(enum) {
-            self.buf[0].?.code,
+        none: void,
-            self.buf[1].?.code,
+        /// Count of pending RI codepoints, it is an even number
-            self.data,
+        ri_count: usize,
-            &state,
+        /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
-        )) {
+        extend_end: uoffset,
-            self.cp_iter = saved_cp_iter;
+    };
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return Grapheme{ .len = gc_len, .offset = gc_start };
-        }
-        while (true) {
+    const Self = @This();
-            self.advance();
-            if (self.buf[0] == null) break;
-            gc_len += self.buf[0].?.len;
+    pub fn init(str: []const u8, data: *const Graphemes) Self {
+        var self: Self = .{ .cp_iter = .init(str), .data = data };
+        self.advance();
+        self.advance();
+        return self;
+    }
+    fn advance(self: *Self) void {
+        self.buf[1] = self.buf[0];
+        self.buf[0] = self.cp_iter.prev();
+    }
+    pub fn peek(self: *Self) ?Grapheme {
+        const cache = .{ self.buf, self.cp_iter, self.pending };
+        defer self.buf, self.cp_iter, self.pending = cache;
+        return self.prev();
+    }
+    pub fn prev(self: *Self) ?Grapheme {
+        if (self.buf[1] == null) return null;
+        const grapheme_end: uoffset = end: {
+            const codepoint = self.buf[1].?;
+            switch (self.pending) {
+                // BUF: [?Any, Any]
+                .none => break :end codepoint.offset + codepoint.len,
+                .ri_count => |ri_count| {
+                    std.debug.assert(ri_count > 0);
+                    std.debug.assert(ri_count % 2 == 0);
+                    if (ri_count > 2) {
+                        self.pending.ri_count -= 2;
+                        // Use the fact that all RI have length 4 in utf8 encoding
+                        // since they are in range 0x1f1e6...0x1f1ff
+                        // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                        return Grapheme{
+                            .len = 8,
+                            .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
+                        };
+                    } else {
+                        self.pending = .{ .none = {} };
+                        break :end codepoint.offset + codepoint.len + 4;
+                    }
+                },
+                // BUF: [?Any, Extend] Extend* ZWJ
+                .extend_end => |extend_end| {
+                    self.pending = .{ .none = {} };
+                    break :end extend_end;
+                },
+            }
+        };
+        while (self.buf[0] != null) {
+            var state: IterState = .{};
+            state.xpic = true;
+            state.regional = false;
+            state.indic = true;
            if (graphemeBreak(
                self.buf[0].?.code,
-                if (self.buf[1]) |ncp| ncp.code else 0,
+                self.buf[1].?.code,
                self.data,
                &state,
            )) break;
+            self.advance();
+            if (!state.indic) {
+                // BUF: [?Any, Extend | Linker] Consonant
+                var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
+                indic: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = indic_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - indic_offset),
+                            .offset = indic_offset,
+                        };
+                    }
+                    const codepoint = self.buf[0].?;
+                    switch (self.data.indic(codepoint.code)) {
+                        .Extend, .Linker => {
+                            self.advance();
+                            continue :indic;
+                        },
+                        .Consonant => {
+                            // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
+                            indic_offset = codepoint.offset;
+                            self.advance();
+                            if (self.buf[0]) |cp1| {
+                                state.indic = true;
+                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
+                                if (!state.indic) {
+                                    continue :indic;
+                                } else {
+                                    break :indic;
+                                }
+                            } else {
+                                break :indic;
+                            }
+                        },
+                        .none => {
+                            // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
+                            self.pending = .{ .extend_end = indic_offset };
+                            return .{
+                                .len = @intCast(grapheme_end - indic_offset),
+                                .offset = indic_offset,
+                            };
+                        },
+                    }
+                }
+            }
+            if (!state.xpic) {
+                // BUF: [?Any, ZWJ] Emoji
+                var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
+                // Look for previous Emoji
+                emoji: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                    const codepoint = self.buf[0].?;
+                    if (self.data.gbp(codepoint.code) == .Extend) {
+                        self.advance();
+                        continue :emoji;
+                    }
+                    if (self.data.isEmoji(codepoint.code)) {
+                        // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
+                        emoji_offset = codepoint.offset;
+                        self.advance();
+                        if (self.buf[0] != null and
+                            // ZWJ = 0x200d
+                            self.buf[0].?.code == 0x200d)
+                        {
+                            // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
+                            // Back at the beginning of the loop, "recursively" look for emoji
+                            self.advance();
+                            continue :emoji;
+                        } else {
+                            // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
+                            break :emoji;
+                        }
+                    } else {
+                        // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                }
+            }
+            if (state.regional) {
+                var ri_count: usize = 0;
+                while (self.buf[0] != null and
+                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
+                {
+                    ri_count += 1;
+                    self.advance();
+                }
+                // Use the fact that all RI have length 4 in utf8 encoding
+                // since they are in range 0x1f1e6...0x1f1ff
+                // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                if (ri_count == 0) {
+                    // There are no pending RI codepoints
+                } else if (ri_count % 2 == 0) {
+                    self.pending = .{ .ri_count = ri_count };
+                    return .{ .len = 8, .offset = grapheme_end - 8 };
+                } else {
+                    // Add one to count for the unused RI
+                    self.pending = .{ .ri_count = ri_count + 1 };
+                    return .{ .len = 4, .offset = grapheme_end - 4 };
+                }
+            }
        }
-        self.cp_iter = saved_cp_iter;
-        self.buf[0] = s0;
-        self.buf[1] = s1;
-        return Grapheme{ .len = gc_len, .offset = gc_start };
+        const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
+        self.advance();
+        return .{
+            .len = @intCast(grapheme_end - grapheme_start),
+            .offset = grapheme_start,
+        };
    }
 };
+/// Grapheme Iterator state: one flag per stateful rule that `graphemeBreak` reads and updates.
+pub const IterState = packed struct(u3) {
+    xpic: bool = false, // GB11: set once an emoji is seen as `cp1`; cleared when ZWJ x Emoji is joined
+    regional: bool = false, // GB12/GB13: flipped on every RI x RI pair; set means the pair was just joined
+    indic: bool = false, // GB9c: set once a Consonant is seen as `cp1`; cleared when (Linker | ZWJ) x Consonant is joined
+};
 // Predicates
 fn isBreaker(cp: u21, data: *const Graphemes) bool {
    // Extract relevant properties.
@@ -246,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
    return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
 }
-// Grapheme break state.
-pub const State = struct {
-    bits: u3 = 0,
-    // Extended Pictographic (emoji)
-    fn hasXpic(self: State) bool {
-        return self.bits & 1 == 1;
-    }
-    fn setXpic(self: *State) void {
-        self.bits |= 1;
-    }
-    fn unsetXpic(self: *State) void {
-        self.bits ^= 1;
-    }
-    // Regional Indicatior (flags)
-    fn hasRegional(self: State) bool {
-        return self.bits & 2 == 2;
-    }
-    fn setRegional(self: *State) void {
-        self.bits |= 2;
-    }
-    fn unsetRegional(self: *State) void {
-        self.bits ^= 2;
-    }
-    // Indic Conjunct
-    fn hasIndic(self: State) bool {
-        return self.bits & 4 == 4;
-    }
-    fn setIndic(self: *State) void {
-        self.bits |= 4;
-    }
-    fn unsetIndic(self: *State) void {
-        self.bits ^= 4;
-    }
-};
 /// `graphemeBreak` returns true only if a grapheme break point is required
 /// between `cp1` and `cp2`. `state` should start out as 0. If calling
 /// iteratively over a sequence of code points, this function must be called
@@ -294,7 +519,7 @@ pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
    data: *const Graphemes,
-    state: *State,
+    state: *IterState,
 ) bool {
    // Extract relevant properties.
    const cp1_gbp_prop = data.gbp(cp1);
@@ -306,9 +531,9 @@ pub fn graphemeBreak(
    const cp2_is_emoji = data.isEmoji(cp2);
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
+    if (!state.xpic and cp1_is_emoji) state.xpic = true;
    // GB9c: Indic Conjunct Break
-    if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
+    if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
    // GB3: CR x LF
    if (cp1 == '\r' and cp2 == '\n') return false;
@@ -317,11 +542,11 @@ pub fn graphemeBreak(
    if (isBreaker(cp1, data)) return true;
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.hasXpic() and
+    if (state.xpic and
        cp1_gbp_prop == .ZWJ and
        cp2_is_emoji)
    {
-        state.unsetXpic();
+        state.xpic = false;
        return false;
    }
@@ -336,11 +561,11 @@ pub fn graphemeBreak(
    // GB12, GB13: RI x RI
    if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
-        if (state.hasRegional()) {
+        if (state.regional) {
-            state.unsetRegional();
+            state.regional = false;
            return true;
        } else {
-            state.setRegional();
+            state.regional = true;
            return false;
        }
    }
@@ -365,25 +590,25 @@ pub fn graphemeBreak(
    }
    // GB9c: Indic Conjunct Break
-    if (state.hasIndic() and
+    if (state.indic and
        cp1_indic_prop == .Consonant and
        (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
    {
        return false;
    }
-    if (state.hasIndic() and
+    if (state.indic and
        cp1_indic_prop == .Extend and
        cp2_indic_prop == .Linker)
    {
        return false;
    }
-    if (state.hasIndic() and
+    if (state.indic and
        (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
        cp2_indic_prop == .Consonant)
    {
-        state.unsetIndic();
+        state.indic = false;
        return false;
    }
@@ -421,3 +646,39 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
        try std.testing.expectEqual(@as(usize, 2), i);
    }
 }
+// Checks that `Iterator.peek` reports the same grapheme `next` will return,
+// does not consume it, and keeps yielding null once the string is exhausted.
+test "Iterator.peek" {
+    const testing = std.testing;
+    const text = "aΔ👨🏻‍🌾→";
+    const data = try Graphemes.init(testing.allocator);
+    defer data.deinit(testing.allocator);
+    var it = data.iterator(text);
+    // First grapheme: peek must agree with next and cover exactly "a".
+    const peeked_first = it.peek().?;
+    const taken_first = it.next().?;
+    try testing.expectEqual(peeked_first, taken_first);
+    try testing.expectEqualStrings("a", peeked_first.bytes(text));
+    // Second grapheme: peeking twice must be idempotent.
+    const peeked_once = it.peek().?;
+    const peeked_twice = it.peek().?;
+    try testing.expectEqual(peeked_once, peeked_twice);
+    const taken_second = it.next().?;
+    try testing.expectEqual(peeked_twice, taken_second);
+    // Remaining two graphemes: peek followed by next must match.
+    try testing.expectEqual(it.peek(), it.next());
+    try testing.expectEqual(it.peek(), it.next());
+    // Exhausted: peek stays null and still agrees with next.
+    try testing.expectEqual(null, it.peek());
+    try testing.expectEqual(null, it.peek());
+    try testing.expectEqual(it.peek(), it.next());
+}
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const compress = std.compress;
+const unicode = std.unicode;
+const code_point = @import("code_point");
+const CodePoint = code_point.CodePoint;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
author	Sam Atman	2025-07-08 12:15:32 -0400
committer	Sam Atman	2025-07-08 12:15:32 -0400
commit	9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree	2607c185fd8053b84d60041fadc35c05a0225d34 /src/Graphemes.zig
parent	Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent	Add Words.zig example to README (diff)
download	zg-master.tar.gz zg-master.tar.xz zg-master.zip