Merge branch 'develop-next'HEAD v0.14.1 master

author: Sam Atman 2025-07-08 12:15:32 -0400
committer: Sam Atman 2025-07-08 12:15:32 -0400
commit: 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree: 2607c185fd8053b84d60041fadc35c05a0225d34 /src
parent: Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent: Add Words.zig example to README (diff)
download: zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.gz
zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.xz
zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.zip
6 files changed, 1873 insertions, 192 deletions
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index f63b860..ff41b3e 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -300,13 +300,13 @@ fn testAllocations(allocator: Allocator) !void {
    {
        const normalize = try Normalize.init(allocator);
        defer normalize.deinit(allocator);
-        const caser1 = try CaseFolding.initWithNormalize(allocator, normalize);
+        const caser = try CaseFolding.initWithNormalize(allocator, normalize);
-        defer caser1.deinit(allocator);
+        defer caser.deinit(allocator);
    }
    // With normalize owned
    {
-        const caser2 = try CaseFolding.init(allocator);
+        const caser = try CaseFolding.init(allocator);
-        defer caser2.deinit(allocator);
+        defer caser.deinit(allocator);
    }
 }
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,12 +1,7 @@
-const std = @import("std");
+//! Graphemes Module
-const builtin = @import("builtin");
+//!
-const mem = std.mem;
+//! Code for handling graphemes: fragments of string which should be
-const Allocator = mem.Allocator;
+//! treated as one unit.  Like Farmer Bob here: 👨🏻‍🌾
-const compress = std.compress;
-const unicode = std.unicode;
-const CodePoint = @import("code_point").CodePoint;
-const CodePointIterator = @import("code_point").Iterator;
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -66,10 +61,16 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
    return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
 }
+/// Returns an iterator over the graphemes in `string`.
 pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
    return Iterator.init(string, graphemes);
 }
+/// Returns a reverse iterator over the graphemes in `string`.
+pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
+    return ReverseIterator.init(string, graphemes);
+}
 /// Indic syllable type.
 pub const Indic = enum {
    none,
@@ -99,8 +100,8 @@ pub const Gbp = enum {
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
-    len: u8,
+    len: uoffset,
-    offset: u32,
+    offset: uoffset,
    /// `bytes` returns the slice of bytes that correspond to
    /// this grapheme cluster in `src`.
@@ -109,6 +110,96 @@ pub const Grapheme = struct {
    }
 };
+// NOTE: graphemeAtIndex is, probably, not in an optimal form.  It has the advantage
+// of being composed of other parts, but the constant factor can _probably_ be improved
+// by a bespoke implementation using graphemes.graphemeBreak directly.  There's a limit
+// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
+// reached.  Perhaps you, Dear Reader, might pick up the torch?
+/// Returns the `Grapheme` containing the byte at `string[index]`, which does
+/// not have to be a valid start of a codepoint.  Asserts the string is not
+/// empty and that `index` is less than `string.len`.  Always returns a
+/// `Grapheme`.
+pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
+    assert(string.len != 0);
+    assert(index < string.len);
+    // Fast path: `index` is the start of the string, or it sits between two
+    // ASCII bytes.  There's always a grapheme break between two ASCII code
+    // points (except inside CRLF), so a forward iterator can start at `index`.
+    const at_known_break = index == 0 or
+        (string[index] < 0x80 and
+            string[index - 1] < 0x80 and
+            !(string[index - 1] == '\r' and string[index] == '\n'));
+    if (at_known_break) {
+        var iter = graphemes.iterator(string[index..]);
+        const next = iter.next().?;
+        return Grapheme{
+            .len = next.len,
+            // `Grapheme.offset` is a `uoffset`: cast to that type, not a fixed width.
+            .offset = @as(uoffset, @intCast(index)) + next.offset,
+        };
+    } // Otherwise it gets hairy.
+    // Snap `index` back to the start of the codepoint that contains it.
+    const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
+    if (idx == string.len) {
+        var iter = graphemes.reverseIterator(string);
+        return iter.prev().?;
+    }
+    // We're on a valid codepoint boundary, we go back from here
+    var r_iter = graphemes.reverseIterAtIndex(string, idx);
+    if (r_iter.prev()) |g| {
+        // The first step back already reached the start of the string:
+        // scan forward from the beginning for the grapheme covering `idx`.
+        if (g.offset == 0) {
+            var iter = graphemes.iterator(string);
+            while (iter.next()) |g2| {
+                if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
+            }
+        }
+    }
+    // We need to toss one, because otherwise we might not be pending when
+    // we in fact need to be.
+    _ = r_iter.prev();
+    // Keep stepping back until the reverse iterator has nothing pending.
+    while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
+    // Walk forward from where the reverse iterator stopped until we reach
+    // the grapheme that covers `idx`.
+    var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
+    while (iter.next()) |g| {
+        if (g.offset <= idx and idx < g.offset + g.len) return g;
+    }
+    unreachable;
+}
+/// Return a (forward) iterator of `string` after `grapheme`.
+pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
+    return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
+}
+/// Return a reverse iterator of `string` before `grapheme`.
+pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
+    // This bit of weirdness is because reverse iterators are "advance last",
+    // while forward iterators are "advance first".  This leaves some room for
+    // further optimization, if anyone dares.
+    var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
+    _ = r_iter.prev();
+    return r_iter;
+}
+/// Builds a `ReverseIterator` over `string` whose codepoint iterator starts at
+/// byte index `idx`, with both buffer slots pre-filled and nothing pending.
+fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
+    var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+    // Order matters: the first `prev()` fills the later slot, the second the earlier.
+    const later = rcp_iter.prev();
+    const earlier = rcp_iter.prev();
+    return .{
+        .buf = .{ earlier, later },
+        .cp_iter = rcp_iter,
+        .data = graphemes,
+        .pending = .none,
+    };
+}
+fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
+    var iter: Iterator = undefined;
+    iter.data = graphemes;
+    iter.buf[0] = first: {
+        if (idx == string.len) break :first null;
+        var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
+        break :first r_cp_iter.prev();
+    };
+    var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
+    iter.buf[1] = cp_iter.next();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
 /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
 pub const Iterator = struct {
    buf: [2]?CodePoint = .{ null, null },
@@ -143,7 +234,7 @@ pub const Iterator = struct {
        const gc_start = self.buf[0].?.offset;
        var gc_len: u8 = self.buf[0].?.len;
-        var state = State{};
+        var state = IterState{};
        if (graphemeBreak(
            self.buf[0].?.code,
@@ -173,72 +264,244 @@ pub const Iterator = struct {
        const saved_cp_iter = self.cp_iter;
        const s0 = self.buf[0];
        const s1 = self.buf[1];
+        defer {
-        self.advance();
-        // If no more
-        if (self.buf[0] == null) {
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return null;
-        }
-        // If last one
-        if (self.buf[1] == null) {
-            const len = self.buf[0].?.len;
-            const offset = self.buf[0].?.offset;
-            self.cp_iter = saved_cp_iter;
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return Grapheme{ .len = len, .offset = offset };
-        }
-        // If ASCII
-        if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
-            const len = self.buf[0].?.len;
-            const offset = self.buf[0].?.offset;
            self.cp_iter = saved_cp_iter;
            self.buf[0] = s0;
            self.buf[1] = s1;
-            return Grapheme{ .len = len, .offset = offset };
        }
+        return self.next();
+    }
+};
-        const gc_start = self.buf[0].?.offset;
+/// Iterate a string backward by Grapheme.
-        var gc_len: u8 = self.buf[0].?.len;
+pub const ReverseIterator = struct {
-        var state = State{};
+    buf: [2]?CodePoint = .{ null, null },
+    cp_iter: CodePointReverseIterator,
+    data: *const Graphemes,
+    /// Codepoint read from `cp_iter` but not returned by `prev`
+    pending: Pending = .none,
-        if (graphemeBreak(
+    const Pending = union(enum) {
-            self.buf[0].?.code,
+        none: void,
-            self.buf[1].?.code,
+        /// Count of pending RI codepoints, it is an even number
-            self.data,
+        ri_count: usize,
-            &state,
+        /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
-        )) {
+        extend_end: uoffset,
-            self.cp_iter = saved_cp_iter;
+    };
-            self.buf[0] = s0;
-            self.buf[1] = s1;
-            return Grapheme{ .len = gc_len, .offset = gc_start };
-        }
-        while (true) {
+    const Self = @This();
-            self.advance();
-            if (self.buf[0] == null) break;
-            gc_len += self.buf[0].?.len;
+    pub fn init(str: []const u8, data: *const Graphemes) Self {
+        var self: Self = .{ .cp_iter = .init(str), .data = data };
+        self.advance();
+        self.advance();
+        return self;
+    }
+    fn advance(self: *Self) void {
+        self.buf[1] = self.buf[0];
+        self.buf[0] = self.cp_iter.prev();
+    }
+    pub fn peek(self: *Self) ?Grapheme {
+        const cache = .{ self.buf, self.cp_iter, self.pending };
+        defer self.buf, self.cp_iter, self.pending = cache;
+        return self.prev();
+    }
+    pub fn prev(self: *Self) ?Grapheme {
+        if (self.buf[1] == null) return null;
+        const grapheme_end: uoffset = end: {
+            const codepoint = self.buf[1].?;
+            switch (self.pending) {
+                // BUF: [?Any, Any]
+                .none => break :end codepoint.offset + codepoint.len,
+                .ri_count => |ri_count| {
+                    std.debug.assert(ri_count > 0);
+                    std.debug.assert(ri_count % 2 == 0);
+                    if (ri_count > 2) {
+                        self.pending.ri_count -= 2;
+                        // Use the fact that all RI have length 4 in utf8 encoding
+                        // since they are in range 0x1f1e6...0x1f1ff
+                        // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                        return Grapheme{
+                            .len = 8,
+                            .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
+                        };
+                    } else {
+                        self.pending = .{ .none = {} };
+                        break :end codepoint.offset + codepoint.len + 4;
+                    }
+                },
+                // BUF: [?Any, Extend] Extend* ZWJ
+                .extend_end => |extend_end| {
+                    self.pending = .{ .none = {} };
+                    break :end extend_end;
+                },
+            }
+        };
+        while (self.buf[0] != null) {
+            var state: IterState = .{};
+            state.xpic = true;
+            state.regional = false;
+            state.indic = true;
            if (graphemeBreak(
                self.buf[0].?.code,
-                if (self.buf[1]) |ncp| ncp.code else 0,
+                self.buf[1].?.code,
                self.data,
                &state,
            )) break;
+            self.advance();
+            if (!state.indic) {
+                // BUF: [?Any, Extend | Linker] Consonant
+                var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
+                indic: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = indic_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - indic_offset),
+                            .offset = indic_offset,
+                        };
+                    }
+                    const codepoint = self.buf[0].?;
+                    switch (self.data.indic(codepoint.code)) {
+                        .Extend, .Linker => {
+                            self.advance();
+                            continue :indic;
+                        },
+                        .Consonant => {
+                            // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
+                            indic_offset = codepoint.offset;
+                            self.advance();
+                            if (self.buf[0]) |cp1| {
+                                state.indic = true;
+                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
+                                if (!state.indic) {
+                                    continue :indic;
+                                } else {
+                                    break :indic;
+                                }
+                            } else {
+                                break :indic;
+                            }
+                        },
+                        .none => {
+                            // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
+                            self.pending = .{ .extend_end = indic_offset };
+                            return .{
+                                .len = @intCast(grapheme_end - indic_offset),
+                                .offset = indic_offset,
+                            };
+                        },
+                    }
+                }
+            }
+            if (!state.xpic) {
+                // BUF: [?Any, ZWJ] Emoji
+                var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
+                // Look for previous Emoji
+                emoji: while (true) {
+                    if (self.buf[0] == null) {
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                    const codepoint = self.buf[0].?;
+                    if (self.data.gbp(codepoint.code) == .Extend) {
+                        self.advance();
+                        continue :emoji;
+                    }
+                    if (self.data.isEmoji(codepoint.code)) {
+                        // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
+                        emoji_offset = codepoint.offset;
+                        self.advance();
+                        if (self.buf[0] != null and
+                            // ZWJ = 0x200d
+                            self.buf[0].?.code == 0x200d)
+                        {
+                            // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
+                            // Back at the beginning of the loop, "recursively" look for emoji
+                            self.advance();
+                            continue :emoji;
+                        } else {
+                            // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
+                            break :emoji;
+                        }
+                    } else {
+                        // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
+                        self.pending = .{ .extend_end = emoji_offset };
+                        return .{
+                            .len = @intCast(grapheme_end - emoji_offset),
+                            .offset = emoji_offset,
+                        };
+                    }
+                }
+            }
+            if (state.regional) {
+                var ri_count: usize = 0;
+                while (self.buf[0] != null and
+                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
+                {
+                    ri_count += 1;
+                    self.advance();
+                }
+                // Use the fact that all RI have length 4 in utf8 encoding
+                // since they are in range 0x1f1e6...0x1f1ff
+                // https://en.wikipedia.org/wiki/UTF-8#Encoding
+                if (ri_count == 0) {
+                    // There are no pending RI codepoints
+                } else if (ri_count % 2 == 0) {
+                    self.pending = .{ .ri_count = ri_count };
+                    return .{ .len = 8, .offset = grapheme_end - 8 };
+                } else {
+                    // Add one to count for the unused RI
+                    self.pending = .{ .ri_count = ri_count + 1 };
+                    return .{ .len = 4, .offset = grapheme_end - 4 };
+                }
+            }
        }
-        self.cp_iter = saved_cp_iter;
-        self.buf[0] = s0;
-        self.buf[1] = s1;
-        return Grapheme{ .len = gc_len, .offset = gc_start };
+        const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
+        self.advance();
+        return .{
+            .len = @intCast(grapheme_end - grapheme_start),
+            .offset = grapheme_start,
+        };
    }
 };
+/// Grapheme Iterator state.
+pub const IterState = packed struct(u3) {
+    xpic: bool = false,
+    regional: bool = false,
+    indic: bool = false,
+};
 // Predicates
 fn isBreaker(cp: u21, data: *const Graphemes) bool {
    // Extract relevant properties.
@@ -246,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
    return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
 }
-// Grapheme break state.
-pub const State = struct {
-    bits: u3 = 0,
-    // Extended Pictographic (emoji)
-    fn hasXpic(self: State) bool {
-        return self.bits & 1 == 1;
-    }
-    fn setXpic(self: *State) void {
-        self.bits |= 1;
-    }
-    fn unsetXpic(self: *State) void {
-        self.bits ^= 1;
-    }
-    // Regional Indicatior (flags)
-    fn hasRegional(self: State) bool {
-        return self.bits & 2 == 2;
-    }
-    fn setRegional(self: *State) void {
-        self.bits |= 2;
-    }
-    fn unsetRegional(self: *State) void {
-        self.bits ^= 2;
-    }
-    // Indic Conjunct
-    fn hasIndic(self: State) bool {
-        return self.bits & 4 == 4;
-    }
-    fn setIndic(self: *State) void {
-        self.bits |= 4;
-    }
-    fn unsetIndic(self: *State) void {
-        self.bits ^= 4;
-    }
-};
 /// `graphemeBreak` returns true only if a grapheme break point is required
 /// between `cp1` and `cp2`. `state` should start out as 0. If calling
 /// iteratively over a sequence of code points, this function must be called
@@ -294,7 +519,7 @@ pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
    data: *const Graphemes,
-    state: *State,
+    state: *IterState,
 ) bool {
    // Extract relevant properties.
    const cp1_gbp_prop = data.gbp(cp1);
@@ -306,9 +531,9 @@ pub fn graphemeBreak(
    const cp2_is_emoji = data.isEmoji(cp2);
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
+    if (!state.xpic and cp1_is_emoji) state.xpic = true;
    // GB9c: Indic Conjunct Break
-    if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
+    if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
    // GB3: CR x LF
    if (cp1 == '\r' and cp2 == '\n') return false;
@@ -317,11 +542,11 @@ pub fn graphemeBreak(
    if (isBreaker(cp1, data)) return true;
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.hasXpic() and
+    if (state.xpic and
        cp1_gbp_prop == .ZWJ and
        cp2_is_emoji)
    {
-        state.unsetXpic();
+        state.xpic = false;
        return false;
    }
@@ -336,11 +561,11 @@ pub fn graphemeBreak(
    // GB12, GB13: RI x RI
    if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
-        if (state.hasRegional()) {
+        if (state.regional) {
-            state.unsetRegional();
+            state.regional = false;
            return true;
        } else {
-            state.setRegional();
+            state.regional = true;
            return false;
        }
    }
@@ -365,25 +590,25 @@ pub fn graphemeBreak(
    }
    // GB9c: Indic Conjunct Break
-    if (state.hasIndic() and
+    if (state.indic and
        cp1_indic_prop == .Consonant and
        (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
    {
        return false;
    }
-    if (state.hasIndic() and
+    if (state.indic and
        cp1_indic_prop == .Extend and
        cp2_indic_prop == .Linker)
    {
        return false;
    }
-    if (state.hasIndic() and
+    if (state.indic and
        (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
        cp2_indic_prop == .Consonant)
    {
-        state.unsetIndic();
+        state.indic = false;
        return false;
    }
@@ -421,3 +646,39 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
        try std.testing.expectEqual(@as(usize, 2), i);
    }
 }
+test "Iterator.peek" {
+    const peek_seq = "aΔ👨🏻‍🌾→";
+    const data = try Graphemes.init(std.testing.allocator);
+    defer data.deinit(std.testing.allocator);
+    var iter = data.iterator(peek_seq);
+    const peek_a = iter.peek().?;
+    const next_a = iter.next().?;
+    try std.testing.expectEqual(peek_a, next_a);
+    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
+    const peek_d1 = iter.peek().?;
+    const peek_d2 = iter.peek().?;
+    try std.testing.expectEqual(peek_d1, peek_d2);
+    const next_d = iter.next().?;
+    try std.testing.expectEqual(peek_d2, next_d);
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+}
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const compress = std.compress;
+const unicode = std.unicode;
+const code_point = @import("code_point");
+const CodePoint = code_point.CodePoint;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
diff --git a/src/Words.zig b/src/Words.zig
new file mode 100644
index 0000000..617c34d
--- /dev/null
+++ b/src/Words.zig
@@ -0,0 +1,773 @@
+//! Word Breaking Algorithm.
+//!
+//! https://www.unicode.org/reports/tr29/#Word_Boundaries
+//!
+const WordBreakProperty = enum(u5) {
+    none,
+    Double_Quote,
+    Single_Quote,
+    Hebrew_Letter,
+    CR,
+    LF,
+    Newline,
+    Extend,
+    Regional_Indicator,
+    Format,
+    Katakana,
+    ALetter,
+    MidLetter,
+    MidNum,
+    MidNumLet,
+    Numeric,
+    ExtendNumLet,
+    ZWJ,
+    WSegSpace,
+};
+s1: []u16 = undefined,
+s2: []u5 = undefined,
+const Words = @This();
+pub fn init(allocator: Allocator) Allocator.Error!Words {
+    var wb: Words = undefined;
+    try wb.setup(allocator);
+    return wb;
+}
+/// Initializes `wb` in place via `setupImpl`.  Only `error.OutOfMemory` is
+/// propagated; any other error from `setupImpl` is treated as unreachable.
+pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
+    wb.setupImpl(allocator) catch |err| switch (err) {
+        error.OutOfMemory => |oom| return oom,
+        else => unreachable,
+    };
+}
+pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
+    allocator.free(words.s1);
+    allocator.free(words.s2);
+}
+/// Represents a Unicode word span, as an offset into the source string
+/// and the length of the word.
+pub const Word = struct {
+    offset: uoffset,
+    len: uoffset,
+    /// Returns a slice of the word given the source string.
+    pub fn bytes(word: Word, src: []const u8) []const u8 {
+        return src[word.offset..][0..word.len];
+    }
+};
+/// Returns the word break property type for `cp`.
+pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty {
+    return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]);
+}
+/// Convenience wrapper for working with CodePoints: looks up the word break
+/// property of `point.code`.
+fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty {
+    return words.breakProperty(point.code);
+}
+/// Returns the `Word` containing the byte at `string[index]`.  Asserts that
+/// `index` is less than `string.len`, and that the string is not empty.
+/// Always returns a word.  The index does not have to be the start of a
+/// codepoint in the word.
+pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
+    assert(index < string.len and string.len > 0);
+    var iter_back: ReverseIterator = reverseFromIndex(words, string, index);
+    // If the first step back yields nothing, or a word starting at offset 0,
+    // scan forward from the very beginning of the string.
+    const scan_from_start = if (iter_back.prev()) |back| back.offset == 0 else true;
+    if (scan_from_start) {
+        var from_start = words.iterator(string);
+        while (from_start.next()) |word| {
+            if (word.offset <= index and index < word.offset + word.len)
+                return word;
+        }
+    }
+    _ = iter_back.prev();
+    // There's sometimes flags: keep stepping back until they clear or the
+    // reverse iterator runs out.
+    while (iter_back.flags > 0) {
+        if (iter_back.prev() == null) break;
+    }
+    // Walk forward from where the reverse iterator stopped until we reach
+    // the word that covers `index`.
+    var iter_fwd = iter_back.forwardIterator();
+    while (iter_fwd.next()) |word| {
+        if (word.offset <= index and index < word.offset + word.len)
+            return word;
+    }
+    unreachable;
+}
+/// Returns an iterator over words in `slice`.
+pub fn iterator(words: *const Words, slice: []const u8) Iterator {
+    return Iterator.init(words, slice);
+}
+/// Returns a reverse iterator over the words in `slice`.
+pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
+    return ReverseIterator.init(words, slice);
+}
+/// Returns an iterator after the `word` in `slice`.
+pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
+    return forwardFromIndex(words, slice, word.offset + word.len);
+}
+/// Returns a reverse iterator before the `word` in `slice`.
+pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
+    return reverseFromIndex(words, slice, word.offset);
+}
+/// An iterator, forward, over all words in a provided string.
+pub const Iterator = struct {
+    this: ?CodePoint = null,
+    that: ?CodePoint = null,
+    cp_iter: CodepointIterator,
+    wb: *const Words,
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(words: *const Words, str: []const u8) Iterator {
+        var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words };
+        wb_iter.advance();
+        return wb_iter;
+    }
+    /// Returns the next word segment, without advancing.
+    pub fn peek(iter: *Iterator) ?Word {
+        const cache = .{ iter.this, iter.that, iter.cp_iter };
+        defer {
+            iter.this, iter.that, iter.cp_iter = cache;
+        }
+        return iter.next();
+    }
+    /// Returns a reverse iterator from the point this iterator is paused
+    /// at.  Usually, and always when using the API to create iterators,
+    /// calling `prev()` will return the word just seen.
+    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
+        var cp_it = iter.cp_iter.reverseIterator();
+        if (iter.that) |_|
+            _ = cp_it.prev();
+        if (iter.cp_iter.peek()) |_|
+            _ = cp_it.prev();
+        return .{
+            .wb = iter.wb,
+            .before = cp_it.prev(),
+            .after = iter.that,
+            .cp_iter = cp_it,
+        };
+    }
+    /// Returns the next word segment, if any.
+    pub fn next(iter: *Iterator) ?Word {
+        iter.advance();
+        // Done?
+        if (iter.this == null) return null;
+        // Last?
+        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
+        const word_start = iter.this.?.offset;
+        var word_len: uoffset = 0;
+        // State variables.
+        var last_p: WordBreakProperty = .none;
+        var last_last_p: WordBreakProperty = .none;
+        var ri_count: usize = 0;
+        scan: while (true) : (iter.advance()) {
+            const this = iter.this.?;
+            word_len += this.len;
+            if (iter.that) |that| {
+                const this_p = iter.wb.breakProp(this);
+                const that_p = iter.wb.breakProp(that);
+                if (!isIgnorable(this_p)) {
+                    last_last_p = last_p;
+                    last_p = this_p;
+                }
+                // WB3  CR × LF
+                if (this_p == .CR and that_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(this_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(that_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
+                    continue :scan;
+                }
+                // WB3d  WSegSpace × WSegSpace
+                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(that_p)) {
+                    continue :scan;
+                } // Now we use last_p instead of this_p for ignorable's sake
+                if (isAHLetter(last_p)) {
+                    // WB5  AHLetter × AHLetter
+                    if (isAHLetter(that_p)) continue :scan;
+                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                    if (isMidVal(that_p)) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProp(next_cp);
+                            if (isAHLetter(next_p)) {
+                                continue :scan;
+                            }
+                        }
+                    }
+                }
+                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
+                    continue :scan;
+                }
+                if (last_p == .Hebrew_Letter) {
+                    // WB7a  Hebrew_Letter × Single_Quote
+                    if (that_p == .Single_Quote) continue :scan;
+                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                    if (that_p == .Double_Quote) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProp(next_cp);
+                            if (next_p == .Hebrew_Letter) {
+                                continue :scan;
+                            }
+                        }
+                    }
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
+                    continue :scan;
+                // WB8  Numeric × Numeric
+                if (last_p == .Numeric and that_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
+                    continue :scan;
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (last_p == .Numeric and isMidNum(that_p)) {
+                    const next_val = iter.peekPast();
+                    if (next_val) |next_cp| {
+                        const next_p = iter.wb.breakProp(next_cp);
+                        if (next_p == .Numeric) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB13  Katakana × Katakana
+                if (last_p == .Katakana and that_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
+                const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
+                if (maybe_flag) {
+                    ri_count += 1;
+                    if (ri_count % 2 == 1) continue :scan;
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            } else { // iter.that == null
+                break :scan;
+            }
+        }
+        return Word{ .len = word_len, .offset = word_start };
+    }
+    /// Write a debug rendering of the iterator's `this` and `that`
+    /// code points to `writer`.
+    pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        const window = .{ iter.this, iter.that };
+        return writer.print("Iterator {{ .this = {any}, .that = {any} }}", window);
+    }
+    /// Slide the two-code-point window forward: `that` becomes `this`,
+    /// and the next code point from `cp_iter` becomes `that`.
+    fn advance(iter: *Iterator) void {
+        const upcoming = iter.cp_iter.next();
+        iter.this = iter.that;
+        iter.that = upcoming;
+    }
+    /// Return the next code point whose break property is not ignorable
+    /// (Extend | Format | ZWJ), or null if none remains.  The iterator's
+    /// own `cp_iter` is not moved: the scan runs on a detached copy.
+    fn peekPast(iter: *Iterator) ?CodePoint {
+        var scout = iter.cp_iter;
+        while (scout.next()) |candidate| {
+            if (!isIgnorable(iter.wb.breakProp(candidate))) return candidate;
+        }
+        return null;
+    }
+};
+/// An iterator, backward, over all words in a provided string.
+pub const ReverseIterator = struct {
+    /// Code point pulled from `cp_iter` one step before `before`; in `prev`
+    /// it is the right-hand side of the boundary being tested.
+    after: ?CodePoint = null,
+    /// Code point most recently pulled from `cp_iter` (see `advance`); in
+    /// `prev` it is the left-hand side of the boundary being tested.
+    before: ?CodePoint = null,
+    /// Backward cursor over the code points of the string.
+    cp_iter: ReverseCodepointIterator,
+    /// Source of word-break properties, queried through `breakProp`.
+    wb: *const Words,
+    /// Cached Regional_Indicator run length for WB15/WB16: refilled from
+    /// `countFlags` when zero, decremented as `prev` works through the run.
+    flags: usize = 0,
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(words: *const Words, str: []const u8) ReverseIterator {
+        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words };
+        // Prime the window with one step so `before` holds the last code point.
+        wb_iter.advance();
+        return wb_iter;
+    }
+    /// Returns the previous word segment, if any, without advancing.
+    pub fn peek(iter: *ReverseIterator) ?Word {
+        // Snapshot every field `prev` can modify, and restore it on exit.
+        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
+        defer {
+            iter.before, iter.after, iter.cp_iter, iter.flags = cache;
+        }
+        return iter.prev();
+    }
+    /// Return a forward iterator from where this iterator paused.  Usually,
+    /// and always when using the API to create iterators, calling `next()`
+    /// will return the word just seen.
+    pub fn forwardIterator(iter: *ReverseIterator) Iterator {
+        var cp_it = iter.cp_iter.forwardIterator();
+        // When a `before` code point is held, step the forward cursor over it.
+        if (iter.before) |_|
+            _ = cp_it.next();
+        return .{
+            .wb = iter.wb,
+            .this = cp_it.next(),
+            .that = iter.after,
+            .cp_iter = cp_it,
+        };
+    }
+    /// Return the previous word, if any.
+    pub fn prev(iter: *ReverseIterator) ?Word {
+        iter.advance();
+        // Done?
+        if (iter.after == null) return null;
+        // Last?
+        if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
+        // The word ends just past `after`; its start is found by scanning backward.
+        const word_end = iter.after.?.offset + iter.after.?.len;
+        var word_len: uoffset = 0;
+        // State variables.
+        // `last_p` / `last_last_p` track the two most recent non-ignorable
+        // properties seen on the `after` side.
+        var last_p: WordBreakProperty = .none;
+        var last_last_p: WordBreakProperty = .none;
+        scan: while (true) : (iter.advance()) {
+            const after = iter.after.?;
+            word_len += after.len;
+            if (iter.before) |before| {
+                var sneak = sneaky(iter); // 'sneaks' past ignorables
+                const after_p = iter.wb.breakProp(after);
+                var before_p = iter.wb.breakProp(before);
+                if (!isIgnorable(after_p)) {
+                    last_last_p = last_p;
+                    last_p = after_p;
+                }
+                // WB3  CR × LF
+                if (before_p == .CR and after_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(before_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(after_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
+                    continue :scan;
+                }
+                // WB3d  WSegSpace × WSegSpace
+                if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(before_p)) {
+                    // Replace `before_p` with the property of the nearest
+                    // non-ignorable code point further back, if there is one.
+                    const maybe_before = sneak.prev();
+                    if (maybe_before) |valid_before| {
+                        before_p = iter.wb.breakProp(valid_before);
+                    } else if (!isIgnorable(after_p)) {
+                        // We're done
+                        break :scan;
+                    }
+                }
+                if (isIgnorable(after_p)) continue :scan;
+                // WB5  AHLetter × AHLetter
+                if (isAHLetter(last_p) and isAHLetter(before_p)) {
+                    continue :scan;
+                }
+                // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
+                    continue :scan;
+                }
+                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (isMidVal(before_p) and isAHLetter(last_p)) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (isAHLetter(prev_p)) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB7a  Hebrew_Letter × Single_Quote
+                if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
+                // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
+                    continue :scan;
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (prev_p == .Hebrew_Letter) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB8  Numeric × Numeric
+                if (before_p == .Numeric and last_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (isMidNum(before_p) and last_p == .Numeric) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (prev_p == .Numeric) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
+                    continue :scan;
+                }
+                // WB13  Katakana × Katakana
+                if (before_p == .Katakana and last_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
+                // NOTE:
+                // So here we simply have to know whether a run of flags is even or odd.
+                // The whole run.  To avoid quadratic behavior (and long flag runs are
+                // actually a thing in the wild), we have to count them once, store that
+                // on the iterator, and decrement each time we see two, possibly breaking
+                // once extra at the beginning. They break up one per flag, once we hit
+                // zero, that's all the flags.  If we see another flag we do it again.
+                if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
+                    defer {
+                        if (iter.flags > 0) iter.flags -= 1;
+                    }
+                    if (iter.flags == 0) {
+                        iter.flags = sneak.countFlags();
+                    }
+                    if (iter.flags % 2 == 0) {
+                        continue :scan;
+                    }
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            }
+            // No `before` code point: start of string reached.
+            break :scan;
+        }
+        return Word{ .len = word_len, .offset = word_end - word_len };
+    }
+    /// Write a debug rendering of `before`, `after` and `flags` to `writer`.
+    pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        try writer.print(
+            "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
+            .{ iter.before, iter.after, iter.flags },
+        );
+    }
+    /// Return the nearest preceding non-ignorable code point, or null if
+    /// none remains.  `cp_iter` is saved first and restored on exit.
+    /// NOTE(review): `prev` uses `SneakIterator` rather than this helper;
+    /// confirm whether it still has callers.
+    fn peekPast(iter: *ReverseIterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.prev();
+        }
+        return null;
+    }
+    /// Slide the window one code point backward: `before` becomes `after`,
+    /// and the next code point from `cp_iter.prev()` becomes `before`.
+    fn advance(iter: *ReverseIterator) void {
+        iter.after = iter.before;
+        iter.before = iter.cp_iter.prev();
+    }
+};
+//| Implementation Details
+/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
+///
+/// An index that lands on a UTF-8 continuation byte is first moved forward
+/// to the next lead byte; if that reaches the end of `string`, the result
+/// is whatever `words.reverseIterator(string)` returns.
+fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
+    var idx: uoffset = @intCast(index);
+    // Skip continuation bytes (0x80...0xBF) until a lead byte or the end.
+    while (idx < string.len) : (idx += 1) {
+        const probe_byte = string[idx];
+        if (probe_byte < 0x80 or probe_byte > 0xBf) break;
+    }
+    if (idx == string.len) return words.reverseIterator(string);
+    // Consider "abc| def" with the cursor as |: `after` becomes ' ',
+    // `before` becomes `c`, and the next `cp_iter.prev()` yields `b`.
+    var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
+    const after = cp_iter.prev();
+    const before = cp_iter.prev();
+    return .{
+        .after = after,
+        .before = before,
+        .cp_iter = cp_iter,
+        .wb = words,
+        .flags = 0,
+    };
+}
+/// Initialize an Iterator positioned around the provided index.
+///
+/// An index equal to `string.len` gives an iterator whose `this` and
+/// `that` are both null.  Otherwise the index is first moved back off any
+/// UTF-8 continuation bytes; if that lands on offset 0, the result is
+/// whatever `words.iterator(string)` returns.
+fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
+    var idx: uoffset = @intCast(index);
+    if (idx == string.len) {
+        return .{
+            .cp_iter = .{ .bytes = string, .i = idx },
+            .this = null,
+            .that = null,
+            .wb = words,
+        };
+    }
+    // Back up to the lead byte of the code point containing `index`.
+    while (idx > 0) : (idx -= 1) {
+        const probe_byte = string[idx];
+        if (probe_byte < 0x80 or probe_byte > 0xBf) break;
+    }
+    if (idx == 0) return words.iterator(string);
+    // Consider "abc |def" with the cursor as |.  `this` must be ' ' and
+    // `that` must be `d`, so decoding has to start one code point earlier:
+    // step back one byte, then back to that code point's lead byte.
+    idx -= 1;
+    while (idx > 0) : (idx -= 1) {
+        const probe_byte = string[idx];
+        if (probe_byte < 0x80 or probe_byte > 0xBf) break;
+    }
+    // "abc| def"
+    var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
+    const this = cp_iter.next();
+    const that = cp_iter.next();
+    return .{
+        .cp_iter = cp_iter,
+        .this = this,
+        .that = that,
+        .wb = words,
+    };
+}
+/// Build a `SneakIterator` holding a copy of the reverse iterator's code
+/// point cursor, so it can look further back without moving `iter`.
+fn sneaky(iter: *const ReverseIterator) SneakIterator {
+    const detached: SneakIterator = .{
+        .wb = iter.wb,
+        .cp_iter = iter.cp_iter,
+    };
+    return detached;
+}
+/// Helper created by `sneaky` to look backward past ignorable
+/// (Extend | Format | ZWJ) code points.
+const SneakIterator = struct {
+    cp_iter: ReverseCodepointIterator,
+    wb: *const Words,
+    /// Return the nearest preceding non-ignorable code point, or null if
+    /// none remains.  `cp_iter` is left where it was.
+    fn peek(iter: *SneakIterator) ?CodePoint {
+        var scout = iter.cp_iter;
+        while (scout.prev()) |candidate| {
+            if (!isIgnorable(iter.wb.breakProp(candidate))) return candidate;
+        }
+        return null;
+    }
+    /// Count the Regional_Indicator code points immediately preceding the
+    /// cursor, skipping ignorables and stopping at the first code point
+    /// with any other property.  `cp_iter` is left where it was.
+    fn countFlags(iter: *SneakIterator) usize {
+        var scout = iter.cp_iter;
+        var flags: usize = 0;
+        while (scout.prev()) |cp| {
+            const prop = iter.wb.breakProp(cp);
+            if (isIgnorable(prop)) continue;
+            if (prop != .Regional_Indicator) break;
+            flags += 1;
+        }
+        return flags;
+    }
+    /// Step backward to the nearest preceding non-ignorable code point and
+    /// return it; the cursor stays moved past everything it skipped.
+    fn prev(iter: *SneakIterator) ?CodePoint {
+        while (true) {
+            const cp = iter.cp_iter.prev() orelse return null;
+            if (!isIgnorable(iter.wb.breakProp(cp))) return cp;
+        }
+    }
+};
+/// Populate `wb.s1` and `wb.s2` by inflating the embedded "wbp" data.
+///
+/// The stream holds a `u16` stage-1 length followed by that many `u16`
+/// entries, then a `u16` stage-2 length followed by that many bytes, all
+/// read using the target CPU's byte order.  Both tables are allocated
+/// with `allocator`; if an error occurs part-way, whatever was already
+/// allocated is freed before the error is returned.
+inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
+    const decompressor = compress.flate.inflate.decompressor;
+    const in_bytes = @embedFile("wbp");
+    var in_fbs = std.io.fixedBufferStream(in_bytes);
+    var in_decomp = decompressor(.raw, in_fbs.reader());
+    var reader = in_decomp.reader();
+    const endian = builtin.cpu.arch.endian();
+    // Stage 1 table.
+    const stage_1_len: u16 = try reader.readInt(u16, endian);
+    wb.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(wb.s1);
+    for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
+    // Stage 2 table: stored one byte per entry, narrowed to `u5` on load.
+    const stage_2_len: u16 = try reader.readInt(u16, endian);
+    wb.s2 = try allocator.alloc(u5, stage_2_len);
+    errdefer allocator.free(wb.s2);
+    for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
+}
+//| Predicates
+/// True for CR, LF and Newline, the properties tested by WB3a/WB3b.
+inline fn isNewline(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .CR, .LF, .Newline => true,
+        else => false,
+    };
+}
+/// True for Format, Extend and ZWJ, the properties WB4 skips over.
+inline fn isIgnorable(wbp: WordBreakProperty) bool {
+    return wbp == .Format or wbp == .Extend or wbp == .ZWJ;
+}
+/// True for ALetter and Hebrew_Letter (the "AHLetter" class in the rules).
+inline fn isAHLetter(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .ALetter, .Hebrew_Letter => true,
+        else => false,
+    };
+}
+/// True for MidLetter, MidNumLet and Single_Quote, i.e. the
+/// "(MidLetter | MidNumLetQ)" class used by WB6/WB7.
+inline fn isMidVal(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .MidLetter, .MidNumLet, .Single_Quote => true,
+        else => false,
+    };
+}
+/// True for MidNum, MidNumLet and Single_Quote, i.e. the
+/// "(MidNum | MidNumLetQ)" class used by WB11/WB12.
+inline fn isMidNum(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .MidNum, .MidNumLet, .Single_Quote => true,
+        else => false,
+    };
+}
+/// True for ALetter, Hebrew_Letter, Katakana, Numeric and ExtendNumLet,
+/// the properties tested on either side of WB13a/WB13b.
+inline fn isExtensible(wbp: WordBreakProperty) bool {
+    return wbp == .ALetter or
+        wbp == .Hebrew_Letter or
+        wbp == .Katakana or
+        wbp == .Numeric or
+        wbp == .ExtendNumLet;
+}
+// Spot-check `breakProperty` against one known code point per property.
+test "Word Break Properties" {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    try testing.expectEqual(.CR, wb.breakProperty('\r'));
+    try testing.expectEqual(.LF, wb.breakProperty('\n'));
+    try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
+    try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
+}
+// The embedded Extended_Pictographic set must match both a four-byte
+// emoji and a three-byte BMP code point.
+test "ext_pict" {
+    try testing.expect(ext_pict.isMatch("👇"));
+    try testing.expect(ext_pict.isMatch("\u{2701}"));
+}
+// Forward iteration over mixed Latin / Greek / Katakana text, plus a
+// `wordAtIndex` lookup at every byte offset inside the Greek word.
+test "Words" {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    const word_str = "Metonym   Μετωνύμιο メトニム";
+    var w_iter = wb.iterator(word_str);
+    try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
+    // Spaces are "words" too!
+    try testing.expectEqualStrings("   ", w_iter.next().?.bytes(word_str));
+    const in_greek = w_iter.next().?;
+    // Every byte offset inside the word, including mid-code-point offsets,
+    // must resolve to the whole word.
+    for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
+        const at_index = wb.wordAtIndex(word_str, i).bytes(word_str);
+        try testing.expectEqualStrings("Μετωνύμιο", at_index);
+    }
+    // Skip the single space between the Greek and Katakana words.
+    _ = w_iter.next();
+    try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str));
+}
+// `wordAtIndex` lookups at several byte offsets in an ASCII string.
+test wordAtIndex {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    const t_string = "first second third";
+    const second = wb.wordAtIndex(t_string, 8);
+    try testing.expectEqualStrings("second", second.bytes(t_string));
+    const third = wb.wordAtIndex(t_string, 14);
+    try testing.expectEqualStrings("third", third.bytes(t_string));
+    {
+        const first = wb.wordAtIndex(t_string, 3);
+        try testing.expectEqualStrings("first", first.bytes(t_string));
+    }
+    {
+        // Index 0: the very start of the string.
+        const first = wb.wordAtIndex(t_string, 0);
+        try testing.expectEqualStrings("first", first.bytes(t_string));
+    }
+    // NOTE(review): this repeats the index-14 lookup made for `third` above;
+    // confirm whether a different index (e.g. the final byte) was intended.
+    const last = wb.wordAtIndex(t_string, 14);
+    try testing.expectEqualStrings("third", last.bytes(t_string));
+}
+const testr = "don't a:ka fin!";
+// Round-trip check: a word produced by one iterator must be reproduced by
+// an iterator of the opposite direction created at that point.
+test "reversal" {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    // Forward, then reverse from the forward iterator's position.
+    {
+        var fwd = wb.iterator(testr);
+        while (fwd.next()) |this| {
+            var back = fwd.reverseIterator();
+            const that = back.prev() orelse return error.TestUnexpectedResult;
+            try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
+        }
+    }
+    // Backward, then forward from the reverse iterator's position.
+    {
+        var back = wb.reverseIterator(testr);
+        while (back.prev()) |this| {
+            var fwd = back.forwardIterator();
+            const that = fwd.next() orelse return error.TestUnexpectedResult;
+            try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
+        }
+    }
+}
+/// Body for `checkAllAllocationFailures`: build a `Words` with `allocator`
+/// and release it again.
+fn testAllocations(allocator: Allocator) !void {
+    const wb = try Words.init(allocator);
+    defer wb.deinit(allocator);
+}
+// Run `testAllocations` through `checkAllAllocationFailures`, with
+// `testing.allocator` as the backing allocator.
+test "allocation safety" {
+    try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
+}
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const assert = std.debug.assert;
+const testing = std.testing;
+const uoffset = code_point.uoffset;
+const code_point = @import("code_point");
+const CodepointIterator = code_point.Iterator;
+const ReverseCodepointIterator = code_point.ReverseIterator;
+const CodePoint = code_point.CodePoint;
+const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index fe7ad6e..7a638af 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,18 +4,33 @@
 //! Represents invalid data according to the Replacement of Maximal
 //! Subparts algorithm.
+pub const uoffset = if (@import("config").fat_offset) u64 else u32;
 /// `CodePoint` represents a Unicode code point by its code,
 /// length, and offset in the source bytes.
 pub const CodePoint = struct {
    code: u21,
    len: u3,
-    offset: u32,
+    offset: uoffset,
+    /// Return the slice of this codepoint, given the original string.
+    pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
+        return str[cp.offset..][0..cp.len];
+    }
+    pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        try writer.print("CodePoint '{u}' .{{ ", .{cp.code});
+        try writer.print(
+            ".code = 0x{x}, .offset = {d}, .len = {d} }}",
+            .{ cp.code, cp.offset, cp.len },
+        );
+    }
 };
 /// This function is deprecated and will be removed in a later release.
 /// Use `decodeAtIndex` or `decodeAtCursor`.
-pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
+pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
-    var off: u32 = 0;
+    var off: uoffset = 0;
    var maybe_code = decodeAtCursor(bytes, &off);
    if (maybe_code) |*code| {
        code.offset = offset;
@@ -24,15 +39,23 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
    return null;
 }
+/// Return the codepoint at `index`, even if `index` is in the middle
+/// of that codepoint.  Returns `null` when `index` is at or past the
+/// end of `bytes`.
+pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
+    // Guard first: the scan below reads `bytes[idx]`, which would be out
+    // of bounds for an index at or beyond the end of the string.
+    if (index >= bytes.len) return null;
+    var idx = index;
+    // Walk back over continuation bytes (0x80...0xBF) to the lead byte.
+    while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {}
+    return decodeAtIndex(bytes, idx);
+}
 /// Decode the CodePoint, if any, at `bytes[idx]`.
-pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint {
+pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
-    var off = idx;
+    var off = index;
    return decodeAtCursor(bytes, &off);
 }
 /// Decode the CodePoint, if any, at `bytes[cursor.*]`.  After, the
 /// cursor will point at the next potential codepoint index.
-pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
+pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
    // EOS
    if (cursor.* >= bytes.len) return null;
@@ -98,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
    }
    if (st == RUNE_REJECT or cursor.* == bytes.len) {
        @branchHint(.cold);
+        // This, and the branch below, detect truncation, the
+        // only invalid state handled differently by the Maximal
+        // Subparts algorithm.
        if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
            cursor.* -= 2; // +1
            return .{
@@ -148,7 +174,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
 /// `Iterator` iterates a string one `CodePoint` at-a-time.
 pub const Iterator = struct {
    bytes: []const u8,
-    i: u32 = 0,
+    i: uoffset = 0,
    pub fn init(bytes: []const u8) Iterator {
        return .{ .bytes = bytes, .i = 0 };
@@ -158,10 +184,19 @@ pub const Iterator = struct {
        return decodeAtCursor(self.bytes, &self.i);
    }
-    pub fn peek(self: *Iterator) ?CodePoint {
+    pub fn peek(iter: *Iterator) ?CodePoint {
-        const saved_i = self.i;
+        const saved_i = iter.i;
-        defer self.i = saved_i;
+        defer iter.i = saved_i;
-        return self.next();
+        return iter.next();
+    }
+    /// Create a backward iterator at this point.  It will repeat
+    /// the last CodePoint seen.
+    pub fn reverseIterator(iter: *const Iterator) ReverseIterator {
+        if (iter.i == iter.bytes.len) {
+            return .init(iter.bytes);
+        }
+        return .{ .i = iter.i, .bytes = iter.bytes };
    }
 };
@@ -233,6 +268,55 @@ const class_mask: [12]u8 = .{
    0,
 };
+/// `ReverseIterator` iterates a string one `CodePoint` at-a-time, from
+/// the end toward the start.
+pub const ReverseIterator = struct {
+    bytes: []const u8,
+    /// Byte index where the next backward scan begins; null once the
+    /// code point at offset 0 has been returned.
+    i: ?uoffset,
+    /// Create an iterator positioned on the last byte of `str`.
+    pub fn init(str: []const u8) ReverseIterator {
+        const last_idx: uoffset = if (str.len == 0) 0 else @intCast(str.len - 1);
+        return .{ .bytes = str, .i = last_idx };
+    }
+    /// Return the code point ending at the cursor and move the cursor to
+    /// the byte before it, or return null when nothing is left.
+    pub fn prev(iter: *ReverseIterator) ?CodePoint {
+        var cp_start = iter.i orelse return null;
+        // Walk back over continuation bytes to the lead byte.
+        while (cp_start > 0 and followbyte(iter.bytes[cp_start])) cp_start -= 1;
+        iter.i = if (cp_start > 0) cp_start - 1 else null;
+        return decode(iter.bytes[cp_start..], cp_start);
+    }
+    /// Return what `prev` would return, without moving the cursor.
+    pub fn peek(iter: *ReverseIterator) ?CodePoint {
+        var lookahead = iter.*;
+        return lookahead.prev();
+    }
+    /// Create a forward iterator at this point.  It will repeat the
+    /// last CodePoint seen.
+    pub fn forwardIterator(iter: *const ReverseIterator) Iterator {
+        const resume_at = iter.i orelse return .{ .i = 0, .bytes = iter.bytes };
+        var fwd: Iterator = .{ .bytes = iter.bytes, .i = resume_at };
+        _ = fwd.next();
+        return fwd;
+    }
+};
+/// True when `b` is a UTF-8 continuation byte (0x80 through 0xBF).
+inline fn followbyte(b: u8) bool {
+    return switch (b) {
+        0x80...0xbf => true,
+        else => false,
+    };
+}
 test "decode" {
    const bytes = "🌩️";
    const res = decode(bytes, 0);
@@ -246,7 +330,7 @@ test "decode" {
    }
 }
-test "peek" {
+test Iterator {
    var iter = Iterator{ .bytes = "Hi" };
    try expectEqual(@as(u21, 'H'), iter.next().?.code);
@@ -256,6 +340,54 @@ test "peek" {
    try expectEqual(@as(?CodePoint, null), iter.next());
 }
+const code_point = @This();
+// Keep this in sync with the README
+test "Code point iterator" {
+    const str = "Hi 😊";
+    var iter: code_point.Iterator = .init(str);
+    var i: usize = 0;
+    while (iter.next()) |cp| : (i += 1) {
+        // The `code` field is the actual code point scalar as a `u21`.
+        if (i == 0) try expect(cp.code == 'H');
+        if (i == 1) try expect(cp.code == 'i');
+        if (i == 2) try expect(cp.code == ' ');
+        if (i == 3) {
+            try expect(cp.code == '😊');
+            // The `offset` field is the byte offset in the
+            // source string.
+            try expect(cp.offset == 3);
+            try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?);
+            // The `len` field is the length in bytes of the
+            // code point in the source string.
+            try expect(cp.len == 4);
+            // There is also a 'cursor' decode, like so:
+            {
+                var cursor = cp.offset;
+                try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?);
+                // Which advances the cursor variable to the next possible
+                // offset, in this case, `str.len`.  Don't forget to account
+                // for this possibility!
+                try expectEqual(cp.offset + cp.len, cursor);
+            }
+            // There's also this, for when you aren't sure if you have the
+            // correct start for a code point:
+            try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?);
+        }
+        // NOTE(review): the section below is still inside the `while` body,
+        // so it runs once per code point; confirm that is intended.
+        // Reverse iteration is also an option:
+        var r_iter: code_point.ReverseIterator = .init(str);
+        // Both iterators can be peeked:
+        try expectEqual('😊', r_iter.peek().?.code);
+        try expectEqual('😊', r_iter.prev().?.code);
+        // Both kinds of iterators can be reversed:
+        var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator();
+        // This will always return the last codepoint from
+        // the prior iterator, _if_ it yielded one:
+        try expectEqual('😊', fwd_iter.next().?.code);
+    }
+}
 test "overlongs" {
    // None of these should equal `/`, all should be byte-for-byte
    // handled as replacement characters.
@@ -346,6 +478,50 @@ test "truncation" {
    }
 }
+// Exercises `prev`, `peek` and direction switching on `ReverseIterator`.
+test ReverseIterator {
+    // ASCII input: walk to the start, then keep asking past it.
+    {
+        var r_iter: ReverseIterator = .init("ABC");
+        try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code);
+        try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(?CodePoint, null), r_iter.peek());
+        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
+        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
+    }
+    // Mixed one- to four-byte code points; repeated `peek` must not move.
+    {
+        var r_iter: ReverseIterator = .init("∅δq🦾ă");
+        try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code);
+        try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code);
+        try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code);
+        try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(?CodePoint, null), r_iter.peek());
+        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
+        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
+    }
+    // Switching direction repeats the code point last yielded.
+    {
+        var r_iter: ReverseIterator = .init("123");
+        try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code);
+        var iter = r_iter.forwardIterator();
+        try testing.expectEqual(@as(u21, '1'), iter.next().?.code);
+        try testing.expectEqual(@as(u21, '2'), iter.next().?.code);
+        try testing.expectEqual(@as(u21, '3'), iter.next().?.code);
+        r_iter = iter.reverseIterator();
+        try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code);
+        try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
+        // Flip twice without stepping: both directions then yield '2'.
+        iter = r_iter.forwardIterator();
+        r_iter = iter.reverseIterator();
+        try testing.expectEqual(@as(u21, '2'), iter.next().?.code);
+        try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
+    }
+}
 const std = @import("std");
 const testing = std.testing;
 const expect = testing.expect;
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..80ce4bf
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
+//! Minimal RuneSet implementation
+//!
+//! This is copied from the full RuneSet module, so that `zg` doesn't
+//! depend on it.  There's one spot in the WordBreak algorithm which
+//! needs to identify the emoji Extended_Pictographic property, which
+//! is not otherwise used in ZG.  The RuneSet is 89 words, while the
+//! trie lookup used throughout ZG would be much larger.
+//!
+//! The RuneSet is borrowed from Runicode, which encodes Unicode things
+//! in RuneSet form.  This will need updating for each version of Unicode.
+pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } };
+// Meaningful names for the T1 slots: the first four words of a
+// `RuneSet.body`, as indexed by `RuneSet.isMatch`.
+// Mask tested against a first byte of kind `.low`.
+const LOW = 0;
+// Mask tested against a first byte of kind `.hi`.
+const HI = 1;
+// Mask tested against a first byte of kind `.lead`; its population count
+// also fixes where the third-byte masks begin.
+const LEAD = 2;
+// Word holding the base index from which fourth-byte masks are located.
+const T4_OFF = 3;
+/// Minimum viable RuneSet: a read-only view over a statically created
+/// table of `u64` words, offering strictly boolean matching.
+pub const RuneSet = struct {
+    body: []const u64,
+    // Reports whether the rune at the start of `str` belongs to the set.
+    // `str` is assumed to begin with one valid UTF-8 sequence, which holds
+    // in particular for bytes derived from a CodePoint.
+    pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
+        const words = runeset.body;
+        const first = codeunit(str[0]);
+        // One-byte runes are settled by a single header mask.
+        switch (first.kind) {
+            .low => return toMask(words[LOW]).isIn(first),
+            .hi => return toMask(words[HI]).isIn(first),
+            .follow => unreachable,
+            .lead => {},
+        }
+        const n_bytes = first.nMultiBytes().?;
+        const lead_mask = toMask(words[LEAD]);
+        // The rank of the lead byte within the LEAD mask selects the word
+        // holding its second-byte mask.
+        const lead_rank = lead_mask.lowerThan(first) orelse return false;
+        const second = codeunit(str[1]);
+        const second_loc = 4 + lead_rank;
+        const second_mask = toMask(words[second_loc]);
+        const second_rank = second_mask.higherThan(second) orelse return false;
+        if (n_bytes == 2) return true;
+        const third_base = 4 + @popCount(words[LEAD]);
+        const third = codeunit(str[2]);
+        // Slicing is safe: the second-byte span has at least one word.
+        const third_off = second_rank + popCountSlice(words[second_loc + 1 .. third_base]);
+        const third_loc = third_base + third_off;
+        const third_mask = toMask(words[third_loc]);
+        const third_rank = third_mask.lowerThan(third) orelse return false;
+        if (n_bytes == 3) return true;
+        const fourth_off = third_rank + popCountSlice(words[third_base..third_loc]);
+        const fourth_loc = words[T4_OFF] + fourth_off;
+        const fourth = codeunit(str[3]);
+        return toMask(words[fourth_loc]).isIn(fourth);
+    }
+};
+/// Kinds of most significant bits in UTF-8
+///
+/// The tag occupies the top two bits of a byte when viewed through
+/// `CodeUnit`, so the declaration order below is significant.
+const RuneKind = enum(u2) {
+    /// Top bits `0b00`: bytes 0x00...0x3F.
+    low,
+    /// Top bits `0b01`: bytes 0x40...0x7F.
+    hi,
+    /// Top bits `0b10`: bytes 0x80...0xBF.
+    follow,
+    /// Top bits `0b11`: bytes 0xC0...0xFF.
+    lead,
+};
+/// One UTF-8 code unit as a packed `u8`: `body` is the low six bits,
+/// `kind` the high two.
+const CodeUnit = packed struct(u8) {
+    body: u6,
+    kind: RuneKind,
+    /// Single-bit mask with bit `body` set, used to check presence.
+    pub inline fn inMask(self: *const CodeUnit) u64 {
+        const one: u64 = 1;
+        return one << self.body;
+    }
+    // TODO consider an nMultiBytesFast, for the cases where we
+    // know that invalid lead bytes are never present (such as in set
+    // operations, where we may assume that, and will assert that, the
+    // LEAD mask contains no such bytes).
+    /// Length in bytes of the multi-byte rune this code unit starts.
+    ///
+    /// The caller guarantees `cu.kind == .lead`.
+    ///
+    /// Returns null for an invalid lead byte.
+    pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
+        const b = self.body;
+        // Bodies 0 and 1 only occur in overlong encodings, which
+        // RuneSet supports, so they count as two-byte leads.
+        if (b < 32) return 2;
+        if (b < 48) return 3;
+        if (b < 56) return 4;
+        // Bodies 56...63 (bytes 0xF8...0xFF) never start a sequence.
+        return null;
+    }
+    /// Number of bytes in the sequence that this code unit begins.
+    /// Returns `null` for a follow byte or an invalid lead byte.
+    pub inline fn nBytes(self: *const CodeUnit) ?u8 {
+        return switch (self.kind) {
+            .low, .hi => 1,
+            .lead => self.nMultiBytes(),
+            .follow => null,
+        };
+    }
+    /// Mask keeping only the bits strictly below `cu.body`.
+    pub inline fn hiMask(self: *const CodeUnit) u64 {
+        return self.inMask() - 1;
+    }
+    /// Mask keeping only the bits strictly above `cu.body`.
+    pub inline fn lowMask(self: *const CodeUnit) u64 {
+        // Everything that is neither below `body` nor `body` itself;
+        // this is zero when `body` is 63.
+        return ~(self.hiMask() | self.inMask());
+    }
+    /// View the `CodeUnit` as its backing `u8`.
+    pub inline fn byte(self: *const CodeUnit) u8 {
+        const raw: u8 = @bitCast(self.*);
+        return raw;
+    }
+};
+/// Reinterpret a raw byte as a `CodeUnit`.
+inline fn codeunit(b: u8) CodeUnit {
+    const cu: CodeUnit = @bitCast(b);
+    return cu;
+}
+/// Wrap a raw table word as a `Mask`.
+inline fn toMask(w: u64) Mask {
+    return .{ .m = w };
+}
+/// Bitmask for runesets
+///
+/// This is a purpose-built bitset: the operations needed here overlap
+/// with IntegerBitSet only for trivial one-liners, and nondestructive
+/// versions of the basic operations are required, which the
+/// IntegerBitSet interface does not provide.
+///
+/// A Mask does not record which kind of byte it applies to, since it
+/// is stored as an ordinary u64.  User code must ensure that any
+/// CodeUnit tested against a Mask is of the appropriate kind, and
+/// otherwise valid for the test performed.
+///
+const Mask = struct {
+    m: u64,
+    pub fn toMask(w: u64) Mask {
+        return .{ .m = w };
+    }
+    /// Test whether the bit selected by `cu.body` is set in the mask.
+    pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
+        return (self.m & cu.inMask()) != 0;
+    }
+    /// Count the set bits below `cu.body`, provided `cu` is in the
+    /// mask.  Otherwise return null.
+    pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (!self.isIn(cu)) return null;
+        return @popCount(self.m & cu.hiMask());
+    }
+    /// Count the set bits above `cu.body`, provided `cu` is in the
+    /// mask.  Otherwise return null.
+    pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (!self.isIn(cu)) return null;
+        return @popCount(self.m & cu.lowMask());
+    }
+};
+/// Total number of set bits across every word in `region`.
+inline fn popCountSlice(region: []const u64) usize {
+    var total: usize = 0;
+    var i: usize = 0;
+    while (i < region.len) : (i += 1) {
+        total += @popCount(region[i]);
+    }
+    return total;
+}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 2249007..ae177a9 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,43 +1,4 @@
-const std = @import("std");
+const dbg_print = false;
-const fmt = std.fmt;
-const fs = std.fs;
-const io = std.io;
-const heap = std.heap;
-const mem = std.mem;
-const testing = std.testing;
-const unicode = std.unicode;
-const grapheme = @import("Graphemes");
-const Grapheme = @import("Graphemes").Grapheme;
-const Graphemes = @import("Graphemes");
-const GraphemeIterator = @import("Graphemes").Iterator;
-const Normalize = @import("Normalize");
-comptime {
-    testing.refAllDecls(grapheme);
-}
-test "Iterator.peek" {
-    const peek_seq = "aΔ👨🏻‍🌾→";
-    const data = try Graphemes.init(std.testing.allocator);
-    defer data.deinit(std.testing.allocator);
-    var iter = data.iterator(peek_seq);
-    const peek_a = iter.peek().?;
-    const next_a = iter.next().?;
-    try std.testing.expectEqual(peek_a, next_a);
-    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
-    const peek_d1 = iter.peek().?;
-    const peek_d2 = iter.peek().?;
-    try std.testing.expectEqual(peek_d1, peek_d2);
-    const next_d = iter.next().?;
-    try std.testing.expectEqual(peek_d2, next_d);
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-}
 test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
@@ -50,16 +11,14 @@ test "Unicode normalization tests" {
    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
-    const input_stream = buf_reader.reader();
+    var input_stream = buf_reader.reader();
-    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;
    var cp_buf: [4]u8 = undefined;
-    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
+    var line_iter: IterRead = .{ .read = &input_stream };
-        line_no += 1;
-        // Skip comments or empty lines.
+    while (try line_iter.next(&buf)) |line| {
-        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
        // Iterate over fields.
        var fields = mem.splitScalar(u8, line, ';');
        var field_index: usize = 0;
@@ -80,7 +39,7 @@ test "Unicode normalization tests" {
                input = try i_buf.toOwnedSlice();
            } else if (field_index == 1) {
-                //debug.print("\n*** {s} ***\n", .{line});
+                if (dbg_print) debug.print("\n*** {s} ***\n", .{line});
                // NFC, time to test.
                var w_buf = std.ArrayList(u8).init(allocator);
                defer w_buf.deinit();
@@ -162,20 +121,17 @@ test "Segmentation GraphemeIterator" {
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();
-    const data = try Graphemes.init(allocator);
+    const graph = try Graphemes.init(allocator);
-    defer data.deinit(allocator);
+    defer graph.deinit(allocator);
    var buf: [4096]u8 = undefined;
-    var line_no: usize = 1;
+    var line_iter: IterRead = .{ .read = &input_stream };
-    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
-        // Skip comments or empty lines.
-        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
+    while (try line_iter.next(&buf)) |raw| {
        // Clean up.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
-        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
+        if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
-            line = line[0..octo];
+            line = line[0..final];
        }
        // Iterate over fields.
        var want = std.ArrayList(Grapheme).init(allocator);
@@ -185,12 +141,12 @@ test "Segmentation GraphemeIterator" {
        defer all_bytes.deinit();
        var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
-        var bytes_index: u32 = 0;
+        var bytes_index: uoffset = 0;
        while (graphemes.next()) |field| {
            var code_points = std.mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
-            var cp_index: u32 = 0;
+            var cp_index: uoffset = 0;
            var gc_len: u8 = 0;
            while (code_points.next()) |code_point| {
@@ -206,16 +162,324 @@ test "Segmentation GraphemeIterator" {
            bytes_index += cp_index;
        }
-        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
+        const this_str = all_bytes.items;
-        var iter = data.iterator(all_bytes.items);
+        {
+            var iter = graph.iterator(this_str);
+            // Check.
+            for (want.items, 1..) |want_gc, idx| {
+                const got_gc = (iter.next()).?;
+                try std.testing.expectEqualStrings(
+                    want_gc.bytes(this_str),
+                    got_gc.bytes(this_str),
+                );
+                for (got_gc.offset..got_gc.offset + got_gc.len) |i| {
+                    const this_gc = graph.graphemeAtIndex(this_str, i);
+                    std.testing.expectEqualSlices(
+                        u8,
+                        got_gc.bytes(this_str),
+                        this_gc.bytes(this_str),
+                    ) catch |err| {
+                        debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
+                        return err;
+                    };
+                }
+                var after_iter = graph.iterateAfterGrapheme(this_str, got_gc);
+                if (after_iter.next()) |next_gc| {
+                    if (iter.peek()) |next_peek| {
+                        std.testing.expectEqualSlices(
+                            u8,
+                            next_gc.bytes(this_str),
+                            next_peek.bytes(this_str),
+                        ) catch |err| {
+                            debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx });
+                            return err;
+                        };
+                    } else {
+                        debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx });
+                        try testing.expect(false);
+                    }
+                } else {
+                    try testing.expectEqual(null, iter.peek());
+                }
+            }
+        }
+        {
+            var iter = graph.reverseIterator(this_str);
+            // Check.
+            var i: usize = want.items.len;
+            while (i > 0) {
+                i -= 1;
+                const want_gc = want.items[i];
+                const got_gc = iter.prev() orelse {
+                    std.debug.print(
+                        "line {d} grapheme {d}: expected {any} found null\n",
+                        .{ line_iter.line, i, want_gc },
+                    );
+                    return error.TestExpectedEqual;
+                };
+                std.testing.expectEqualStrings(
+                    want_gc.bytes(this_str),
+                    got_gc.bytes(this_str),
+                ) catch |err| {
+                    std.debug.print(
+                        "line {d} grapheme {d}: expected {any} found {any}\n",
+                        .{ line_iter.line, i, want_gc, got_gc },
+                    );
+                    return err;
+                };
+                var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc);
+                if (before_iter.prev()) |prev_gc| {
+                    if (iter.peek()) |prev_peek| {
+                        std.testing.expectEqualSlices(
+                            u8,
+                            prev_gc.bytes(this_str),
+                            prev_peek.bytes(this_str),
+                        ) catch |err| {
+                            debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i });
+                            return err;
+                        };
+                    } else {
+                        debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i });
+                        try testing.expect(false);
+                    }
+                } else {
+                    try testing.expectEqual(null, iter.peek());
+                }
+            }
+        }
+    }
+}
+test "Segmentation Word Iterator" {
+    const allocator = std.testing.allocator;
+    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
+    defer file.close();
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var input_stream = buf_reader.reader();
+    const wb = try Words.init(allocator);
+    defer wb.deinit(allocator);
+    var buf: [4096]u8 = undefined;
+    var line_iter: IterRead = .{ .read = &input_stream };
+    while (try line_iter.next(&buf)) |raw| {
+        // Clean up.
+        var line = std.mem.trimLeft(u8, raw, "÷ ");
+        if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
+            line = line[0..final];
+        }
+        // Iterate over fields.
+        var want = std.ArrayList(Word).init(allocator);
+        defer want.deinit();
+        var all_bytes = std.ArrayList(u8).init(allocator);
+        defer all_bytes.deinit();
+        var words = std.mem.splitSequence(u8, line, " ÷ ");
+        var bytes_index: uoffset = 0;
+        while (words.next()) |field| {
+            var code_points = std.mem.splitScalar(u8, field, ' ');
+            var cp_buf: [4]u8 = undefined;
+            var cp_index: uoffset = 0;
+            var gc_len: u8 = 0;
-        // Check.
+            while (code_points.next()) |code_point| {
-        for (want.items) |want_gc| {
+                if (std.mem.eql(u8, code_point, "×")) continue;
-            const got_gc = (iter.next()).?;
+                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
-            try std.testing.expectEqualStrings(
+                const len = try unicode.utf8Encode(cp, &cp_buf);
-                want_gc.bytes(all_bytes.items),
+                try all_bytes.appendSlice(cp_buf[0..len]);
-                got_gc.bytes(all_bytes.items),
+                cp_index += len;
-            );
+                gc_len += len;
+            }
+            try want.append(Word{ .len = gc_len, .offset = bytes_index });
+            bytes_index += cp_index;
+        }
+        const this_str = all_bytes.items;
+        {
+            var iter = wb.iterator(this_str);
+            var peeked: ?Word = iter.peek();
+            // Check.
+            for (want.items, 1..) |want_word, idx| {
+                const got_word = (iter.next()).?;
+                std.testing.expectEqualStrings(
+                    want_word.bytes(this_str),
+                    got_word.bytes(this_str),
+                ) catch |err| {
+                    debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx });
+                    return err;
+                };
+                std.testing.expectEqualStrings(
+                    peeked.?.bytes(this_str),
+                    got_word.bytes(this_str),
+                ) catch |err| {
+                    debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx });
+                    return err;
+                };
+                var r_iter = iter.reverseIterator();
+                const if_r_word = r_iter.prev();
+                if (if_r_word) |r_word| {
+                    std.testing.expectEqualStrings(
+                        want_word.bytes(this_str),
+                        r_word.bytes(this_str),
+                    ) catch |err| {
+                        debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
+                        return err;
+                    };
+                } else {
+                    try testing.expect(false);
+                }
+                var peek_iter = wb.iterateAfterWord(this_str, got_word);
+                const peek_1 = peek_iter.next();
+                if (peek_1) |p1| {
+                    const peek_2 = iter.peek();
+                    if (peek_2) |p2| {
+                        std.testing.expectEqualSlices(
+                            u8,
+                            p1.bytes(this_str),
+                            p2.bytes(this_str),
+                        ) catch |err| {
+                            debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
+                            return err;
+                        };
+                    } else {
+                        try testing.expect(false);
+                    }
+                } else {
+                    try testing.expectEqual(null, iter.peek());
+                }
+                for (got_word.offset..got_word.offset + got_word.len) |i| {
+                    const this_word = wb.wordAtIndex(this_str, i);
+                    std.testing.expectEqualSlices(
+                        u8,
+                        got_word.bytes(this_str),
+                        this_word.bytes(this_str),
+                    ) catch |err| {
+                        debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
+                        return err;
+                    };
+                }
+                peeked = iter.peek();
+            }
+        }
+        {
+            var r_iter = wb.reverseIterator(this_str);
+            var peeked: ?Word = r_iter.peek();
+            var idx = want.items.len - 1;
+            while (true) : (idx -= 1) {
+                const want_word = want.items[idx];
+                const got_word = r_iter.prev().?;
+                std.testing.expectEqualSlices(
+                    u8,
+                    want_word.bytes(this_str),
+                    got_word.bytes(this_str),
+                ) catch |err| {
+                    debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
+                    return err;
+                };
+                std.testing.expectEqualStrings(
+                    peeked.?.bytes(this_str),
+                    got_word.bytes(this_str),
+                ) catch |err| {
+                    debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 });
+                    return err;
+                };
+                var f_iter = r_iter.forwardIterator();
+                const if_f_word = f_iter.next();
+                if (if_f_word) |f_word| {
+                    std.testing.expectEqualStrings(
+                        want_word.bytes(this_str),
+                        f_word.bytes(this_str),
+                    ) catch |err| {
+                        debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
+                        return err;
+                    };
+                } else {
+                    try testing.expect(false);
+                }
+                var peek_iter = wb.iterateBeforeWord(this_str, got_word);
+                const peek_1 = peek_iter.prev();
+                if (peek_1) |p1| {
+                    const peek_2 = r_iter.peek();
+                    if (peek_2) |p2| {
+                        std.testing.expectEqualSlices(
+                            u8,
+                            p1.bytes(this_str),
+                            p2.bytes(this_str),
+                        ) catch |err| {
+                            debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
+                            return err;
+                        };
+                    } else {
+                        try testing.expect(false);
+                    }
+                } else {
+                    try testing.expectEqual(null, r_iter.peek());
+                }
+                for (got_word.offset..got_word.offset + got_word.len) |i| {
+                    const this_word = wb.wordAtIndex(this_str, i);
+                    std.testing.expectEqualSlices(
+                        u8,
+                        got_word.bytes(this_str),
+                        this_word.bytes(this_str),
+                    ) catch |err| {
+                        debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
+                        return err;
+                    };
+                }
+                peeked = r_iter.peek();
+                if (idx == 0) break;
+            }
        }
    }
 }
+/// Line reader for the test data files.
+///
+/// Each read consumes input up to the next '#' and then discards the
+/// remainder of that line, so every line is expected to contain a '#'.
+const IterRead = struct {
+    read: *Reader,
+    // Number of reads attempted so far, one per consumed line.
+    line: usize = 0,
+    /// Return the text preceding '#' on the next line where that text is
+    /// non-empty and does not start with '@', or `null` at end of input.
+    pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 {
+        while (true) {
+            // Counted on every way out of this pass: skip, return, EOF or error.
+            defer iter.line += 1;
+            const text = (try iter.read.readUntilDelimiterOrEof(buf, '#')) orelse return null;
+            try iter.read.skipUntilDelimiterOrEof('\n');
+            // comment, next line
+            if (text.len == 0 or text[0] == '@') continue;
+            return text;
+        }
+    }
+};
+const std = @import("std");
+const fmt = std.fmt;
+const fs = std.fs;
+const io = std.io;
+const Reader = io.BufferedReader(4096, fs.File.Reader).Reader;
+const heap = std.heap;
+const mem = std.mem;
+const debug = std.debug;
+const testing = std.testing;
+const unicode = std.unicode;
+const uoffset = @FieldType(Word, "offset");
+const Grapheme = @import("Graphemes").Grapheme;
+const Graphemes = @import("Graphemes");
+const GraphemeIterator = @import("Graphemes").Iterator;
+const Normalize = @import("Normalize");
+const Words = @import("Words");
+const Word = Words.Word;
author	Sam Atman	2025-07-08 12:15:32 -0400
committer	Sam Atman	2025-07-08 12:15:32 -0400
commit	9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree	2607c185fd8053b84d60041fadc35c05a0225d34 /src
parent	Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent	Add Words.zig example to README (diff)
download	zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.gz zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.xz zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.zip