From 21d55b730e25733c98fa58de42d256adf5f80d88 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Fri, 16 May 2025 12:06:32 -0400
Subject: Move WordBreak to Words

---
 src/WordBreak.zig | 720 ------------------------------------------------------
 src/Words.zig     | 720 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 720 insertions(+), 720 deletions(-)
 delete mode 100644 src/WordBreak.zig
 create mode 100644 src/Words.zig

(limited to 'src')

diff --git a/src/WordBreak.zig b/src/WordBreak.zig
deleted file mode 100644
index 6a532f5..0000000
--- a/src/WordBreak.zig
+++ /dev/null
@@ -1,720 +0,0 @@
-//! Word Breaking Algorithm.
-
-const WordBreakProperty = enum(u5) {
-    none,
-    Double_Quote,
-    Single_Quote,
-    Hebrew_Letter,
-    CR,
-    LF,
-    Newline,
-    Extend,
-    Regional_Indicator,
-    Format,
-    Katakana,
-    ALetter,
-    MidLetter,
-    MidNum,
-    MidNumLet,
-    Numeric,
-    ExtendNumLet,
-    ZWJ,
-    WSegSpace,
-};
-
-s1: []u16 = undefined,
-s2: []u5 = undefined,
-
-const WordBreak = @This();
-
-pub fn init(allocator: Allocator) Allocator.Error!WordBreak {
-    var wb: WordBreak = undefined;
-    try wb.setup(allocator);
-    return wb;
-}
-
-pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
-    wb.setupImpl(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-}
-
-pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
-    allocator.free(wordbreak.s1);
-    allocator.free(wordbreak.s2);
-}
-
-/// Represents a Unicode word span, as an offset into the source string
-/// and the length of the word.
-pub const Word = struct {
-    offset: u32,
-    len: u32,
-
-    /// Returns a slice of the word given the source string.
-    pub fn bytes(self: Word, src: []const u8) []const u8 {
-        return src[self.offset..][0..self.len];
-    }
-};
-
-/// Returns the word break property type for `cp`.
-pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
-    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
-}
-
-/// Convenience function for working with CodePoints
-fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
-    return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
-}
-
-/// Returns the Word at the given index.  Asserts that the index is less than
-/// `string.len`, and that the string is not empty. Always returns a word.
-/// The index does not have to be the start of a codepoint in the word.
-pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
-    assert(index < string.len and string.len > 0);
-    var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
-    const first_back = iter_back.prev();
-    if (first_back) |back| {
-        if (back.offset == 0) {
-            var iter_fwd = wordbreak.iterator(string);
-            while (iter_fwd.next()) |word| {
-                if (word.offset <= index and index < word.offset + word.len)
-                    return word;
-            }
-        }
-    } else {
-        var iter_fwd = wordbreak.iterator(string);
-        while (iter_fwd.next()) |word| {
-            if (word.offset <= index and index < word.offset + word.len)
-                return word;
-        }
-    }
-    const second_back = iter_back.prev();
-    if (second_back) |back| if (back.offset == 0) {
-        var iter_fwd = wordbreak.iterator(string);
-        while (iter_fwd.next()) |word| {
-            if (word.offset <= index and index < word.offset + word.len)
-                return word;
-        }
-    };
-    // There's sometimes flags:
-    if (iter_back.flags > 0) {
-        while (iter_back.flags > 0) {
-            if (iter_back.prev()) |_| {
-                continue;
-            } else {
-                break;
-            }
-        }
-    }
-    var iter_fwd = iter_back.forwardIterator();
-    while (iter_fwd.next()) |word| {
-        if (word.offset <= index and index < word.offset + word.len)
-            return word;
-    }
-    unreachable;
-}
-
-/// Returns an iterator over words in `slice`.
-pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
-    return Iterator.init(wordbreak, slice);
-}
-
-/// Returns a reverse iterator over the words in `slice`.
-pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {
-    return ReverseIterator.init(wordbreak, slice);
-}
-
-/// An iterator, forward, over all words in a provided string.
-pub const Iterator = struct {
-    this: ?CodePoint = null,
-    that: ?CodePoint = null,
-    cp_iter: CodepointIterator,
-    wb: *const WordBreak,
-
-    /// Assumes `str` is valid UTF-8.
-    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
-        var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
-        wb_iter.advance();
-        return wb_iter;
-    }
-
-    /// Returns the next word segment, without advancing.
-    pub fn peek(iter: *Iterator) ?Word {
-        const cache = .{ iter.this, iter.that, iter.cp_iter };
-        defer {
-            iter.this, iter.that, iter.cp_iter = cache;
-        }
-        return iter.next();
-    }
-
-    /// Returns a reverse iterator from the point this iterator is paused
-    /// at.  Usually, and always when using the API to create iterators,
-    /// calling `prev()` will return the word just seen.
-    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
-        var cp_it = iter.cp_iter.reverseIterator();
-        if (iter.that) |_|
-            _ = cp_it.prev();
-        if (iter.cp_iter.peek()) |_|
-            _ = cp_it.prev();
-        return .{
-            .wb = iter.wb,
-            .before = cp_it.prev(),
-            .after = iter.that,
-            .cp_iter = cp_it,
-        };
-    }
-
-    /// Returns the next word segment, if any.
-    pub fn next(iter: *Iterator) ?Word {
-        iter.advance();
-
-        // Done?
-        if (iter.this == null) return null;
-        // Last?
-        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
-
-        const word_start = iter.this.?.offset;
-        var word_len: u32 = 0;
-
-        // State variables.
-        var last_p: WordBreakProperty = .none;
-        var last_last_p: WordBreakProperty = .none;
-        var ri_count: usize = 0;
-
-        scan: while (true) : (iter.advance()) {
-            const this = iter.this.?;
-            word_len += this.len;
-            if (iter.that) |that| {
-                const this_p = iter.wb.breakProp(this);
-                const that_p = iter.wb.breakProp(that);
-                if (!isIgnorable(this_p)) {
-                    last_last_p = last_p;
-                    last_p = this_p;
-                }
-                // WB3  CR × LF
-                if (this_p == .CR and that_p == .LF) continue :scan;
-                // WB3a  (Newline | CR | LF) ÷
-                if (isNewline(this_p)) break :scan;
-                // WB3b  ÷ (Newline | CR | LF)
-                if (isNewline(that_p)) break :scan;
-                // WB3c  ZWJ × \p{Extended_Pictographic}
-                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
-                    continue :scan;
-                }
-                // WB3d  WSegSpace × WSegSpace
-                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
-                // WB4  X (Extend | Format | ZWJ)* → X
-                if (isIgnorable(that_p)) {
-                    continue :scan;
-                } // Now we use last_p instead of this_p for ignorable's sake
-                if (isAHLetter(last_p)) {
-                    // WB5  AHLetter × AHLetter
-                    if (isAHLetter(that_p)) continue :scan;
-                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
-                    if (isMidVal(that_p)) {
-                        const next_val = iter.peekPast();
-                        if (next_val) |next_cp| {
-                            const next_p = iter.wb.breakProp(next_cp);
-                            if (isAHLetter(next_p)) {
-                                continue :scan;
-                            }
-                        }
-                    }
-                }
-                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
-                if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
-                    continue :scan;
-                }
-                if (last_p == .Hebrew_Letter) {
-                    // WB7a  Hebrew_Letter × Single_Quote
-                    if (that_p == .Single_Quote) continue :scan;
-                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
-                    if (that_p == .Double_Quote) {
-                        const next_val = iter.peekPast();
-                        if (next_val) |next_cp| {
-                            const next_p = iter.wb.breakProp(next_cp);
-                            if (next_p == .Hebrew_Letter) {
-                                continue :scan;
-                            }
-                        }
-                    }
-                }
-                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
-                if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
-                    continue :scan;
-                // WB8  Numeric × Numeric
-                if (last_p == .Numeric and that_p == .Numeric) continue :scan;
-                // WB9  AHLetter × Numeric
-                if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
-                // WB10  Numeric ×  AHLetter
-                if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
-                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
-                if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
-                    continue :scan;
-                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
-                if (last_p == .Numeric and isMidNum(that_p)) {
-                    const next_val = iter.peekPast();
-                    if (next_val) |next_cp| {
-                        const next_p = iter.wb.breakProp(next_cp);
-                        if (next_p == .Numeric) {
-                            continue :scan;
-                        }
-                    }
-                }
-                // WB13  Katakana × Katakana
-                if (last_p == .Katakana and that_p == .Katakana) continue :scan;
-                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-                if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
-                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
-                if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
-                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
-                const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
-                if (maybe_flag) {
-                    ri_count += 1;
-                    if (ri_count % 2 == 1) continue :scan;
-                }
-                // WB999  Any ÷ Any
-                break :scan;
-            } else { // iter.that == null
-                break :scan;
-            }
-        }
-
-        return Word{ .len = word_len, .offset = word_start };
-    }
-
-    pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
-        try writer.print(
-            "Iterator {{ .this = {any}, .that = {any} }}",
-            .{ iter.this, iter.that },
-        );
-    }
-
-    fn advance(iter: *Iterator) void {
-        iter.this = iter.that;
-        iter.that = iter.cp_iter.next();
-    }
-
-    fn peekPast(iter: *Iterator) ?CodePoint {
-        const save_cp = iter.cp_iter;
-        defer iter.cp_iter = save_cp;
-        while (iter.cp_iter.peek()) |peeked| {
-            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
-            _ = iter.cp_iter.next();
-        }
-        return null;
-    }
-};
-
-/// An iterator, backward, over all words in a provided string.
-pub const ReverseIterator = struct {
-    after: ?CodePoint = null,
-    before: ?CodePoint = null,
-    cp_iter: ReverseCodepointIterator,
-    wb: *const WordBreak,
-    flags: usize = 0,
-
-    /// Assumes `str` is valid UTF-8.
-    pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
-        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
-        wb_iter.advance();
-        return wb_iter;
-    }
-
-    /// Returns the previous word segment, if any, without advancing.
-    pub fn peek(iter: *ReverseIterator) ?Word {
-        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
-        defer {
-            iter.before, iter.after, iter.cp_iter, iter.flags = cache;
-        }
-        return iter.prev();
-    }
-
-    /// Return a forward iterator from where this iterator paused.  Usually,
-    /// and always when using the API to create iterators, calling `next()`
-    /// will return the word just seen.
-    pub fn forwardIterator(iter: *ReverseIterator) Iterator {
-        var cp_it = iter.cp_iter.forwardIterator();
-        if (iter.before) |_|
-            _ = cp_it.next();
-        return .{
-            .wb = iter.wb,
-            .this = cp_it.next(),
-            .that = iter.after,
-            .cp_iter = cp_it,
-        };
-    }
-
-    /// Return the previous word, if any.
-    pub fn prev(iter: *ReverseIterator) ?Word {
-        iter.advance();
-
-        // Done?
-        if (iter.after == null) return null;
-        // Last?
-        if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
-
-        const word_end = iter.after.?.offset + iter.after.?.len;
-        var word_len: u32 = 0;
-
-        // State variables.
-        var last_p: WordBreakProperty = .none;
-        var last_last_p: WordBreakProperty = .none;
-
-        scan: while (true) : (iter.advance()) {
-            const after = iter.after.?;
-            word_len += after.len;
-            if (iter.before) |before| {
-                var sneak = sneaky(iter); // 'sneaks' past ignorables
-                const after_p = iter.wb.breakProp(after);
-                var before_p = iter.wb.breakProp(before);
-                if (!isIgnorable(after_p)) {
-                    last_last_p = last_p;
-                    last_p = after_p;
-                }
-                // WB3  CR × LF
-                if (before_p == .CR and after_p == .LF) continue :scan;
-                // WB3a  (Newline | CR | LF) ÷
-                if (isNewline(before_p)) break :scan;
-                // WB3b  ÷ (Newline | CR | LF)
-                if (isNewline(after_p)) break :scan;
-                // WB3c  ZWJ × \p{Extended_Pictographic}
-                if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
-                    continue :scan;
-                }
-                // WB3d  WSegSpace × WSegSpace
-                if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
-                // WB4  X (Extend | Format | ZWJ)* → X
-                if (isIgnorable(before_p)) {
-                    const maybe_before = sneak.prev();
-                    if (maybe_before) |valid_before| {
-                        before_p = iter.wb.breakProp(valid_before);
-                    } else if (!isIgnorable(after_p)) {
-                        // We're done
-                        break :scan;
-                    }
-                }
-                if (isIgnorable(after_p)) continue :scan;
-                // WB5  AHLetter × AHLetter
-                if (isAHLetter(last_p) and isAHLetter(before_p)) {
-                    continue :scan;
-                }
-                // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
-                if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
-                    continue :scan;
-                }
-                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
-                if (isMidVal(before_p) and isAHLetter(last_p)) {
-                    const prev_val = sneak.peek();
-                    if (prev_val) |prev_cp| {
-                        const prev_p = iter.wb.breakProp(prev_cp);
-                        if (isAHLetter(prev_p)) {
-                            continue :scan;
-                        }
-                    }
-                }
-                // WB7a  Hebrew_Letter × Single_Quote
-                if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
-                // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
-                if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
-                    continue :scan;
-                }
-                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
-                if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
-                    const prev_val = sneak.peek();
-                    if (prev_val) |prev_cp| {
-                        const prev_p = iter.wb.breakProp(prev_cp);
-                        if (prev_p == .Hebrew_Letter) {
-                            continue :scan;
-                        }
-                    }
-                }
-                // WB8  Numeric × Numeric
-                if (before_p == .Numeric and last_p == .Numeric) continue :scan;
-                // WB9  AHLetter × Numeric
-                if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
-                // WB10  Numeric ×  AHLetter
-                if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
-                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
-                if (isMidNum(before_p) and last_p == .Numeric) {
-                    const prev_val = sneak.peek();
-                    if (prev_val) |prev_cp| {
-                        const prev_p = iter.wb.breakProp(prev_cp);
-                        if (prev_p == .Numeric) {
-                            continue :scan;
-                        }
-                    }
-                }
-                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
-                if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
-                    continue :scan;
-                }
-                // WB13  Katakana × Katakana
-                if (before_p == .Katakana and last_p == .Katakana) continue :scan;
-                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-                if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
-                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
-                if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
-                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
-                // NOTE:
-                // So here we simply have to know whether a run of flags is even or odd.
-                // The whole run.  To avoid quadratic behavior (and long flag runs are
-                // actually a thing in the wild), we have to count them once, store that
-                // on the iterator, and decrement each time we see two, possibly breaking
-                // once extra at the beginning. They break up one per flag, once we hit
-                // zero, that's all the flags.  If we see another flag we do it again.
-                if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
-                    defer {
-                        if (iter.flags > 0) iter.flags -= 1;
-                    }
-                    if (iter.flags == 0) {
-                        iter.flags = sneak.countFlags();
-                    }
-                    if (iter.flags % 2 == 0) {
-                        continue :scan;
-                    }
-                }
-                // WB999  Any ÷ Any
-                break :scan;
-            }
-            break :scan;
-        }
-        return Word{ .len = word_len, .offset = word_end - word_len };
-    }
-
-    pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
-        try writer.print(
-            "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
-            .{ iter.before, iter.after, iter.flags },
-        );
-    }
-
-    fn peekPast(iter: *ReverseIterator) ?CodePoint {
-        const save_cp = iter.cp_iter;
-        defer iter.cp_iter = save_cp;
-        while (iter.cp_iter.peek()) |peeked| {
-            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
-            _ = iter.cp_iter.prev();
-        }
-        return null;
-    }
-
-    fn advance(iter: *ReverseIterator) void {
-        iter.after = iter.before;
-        iter.before = iter.cp_iter.prev();
-    }
-};
-
-//| Implementation Details
-
-/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
-fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
-    var idx: u32 = @intCast(index);
-    // Find the next lead byte:
-    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
-    if (idx == string.len) return wb.reverseIterator(string);
-    var iter: ReverseIterator = undefined;
-    iter.wb = wb;
-    iter.flags = 0;
-    // We need to populate the CodePoints, and the codepoint iterator.
-    // Consider "abc| def" with the cursor as |.
-    // We need `before` to be `c` and `after` to be ' ',
-    // and `cp_iter.prev()` to be `b`.
-    var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
-    iter.after = cp_iter.prev();
-    iter.before = cp_iter.prev();
-    iter.cp_iter = cp_iter;
-    return iter;
-}
-
-fn sneaky(iter: *const ReverseIterator) SneakIterator {
-    return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
-}
-
-const SneakIterator = struct {
-    cp_iter: ReverseCodepointIterator,
-    wb: *const WordBreak,
-
-    fn peek(iter: *SneakIterator) ?CodePoint {
-        const save_cp = iter.cp_iter;
-        defer iter.cp_iter = save_cp;
-        while (iter.cp_iter.peek()) |peeked| {
-            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
-            _ = iter.cp_iter.prev();
-        }
-        return null;
-    }
-
-    fn countFlags(iter: *SneakIterator) usize {
-        var flags: usize = 0;
-        const save_cp = iter.cp_iter;
-        defer iter.cp_iter = save_cp;
-        while (iter.cp_iter.prev()) |cp| {
-            const prop = iter.wb.breakProp(cp);
-            if (isIgnorable(prop)) continue;
-            if (prop == .Regional_Indicator) {
-                flags += 1;
-            } else break;
-        }
-        return flags;
-    }
-
-    fn prev(iter: *SneakIterator) ?CodePoint {
-        while (iter.cp_iter.prev()) |peeked| {
-            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
-        }
-        return null;
-    }
-};
-
-inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
-    const decompressor = compress.flate.inflate.decompressor;
-    const in_bytes = @embedFile("wbp");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var in_decomp = decompressor(.raw, in_fbs.reader());
-    var reader = in_decomp.reader();
-
-    const endian = builtin.cpu.arch.endian();
-
-    const stage_1_len: u16 = try reader.readInt(u16, endian);
-    wb.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(wb.s1);
-    for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
-
-    const stage_2_len: u16 = try reader.readInt(u16, endian);
-    wb.s2 = try allocator.alloc(u5, stage_2_len);
-    errdefer allocator.free(wb.s2);
-    for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
-    var count_0: usize = 0;
-    for (wb.s2) |nyb| {
-        if (nyb == 0) count_0 += 1;
-    }
-}
-
-//| Predicates
-
-inline fn isNewline(wbp: WordBreakProperty) bool {
-    return wbp == .CR or wbp == .LF or wbp == .Newline;
-}
-
-inline fn isIgnorable(wbp: WordBreakProperty) bool {
-    return switch (wbp) {
-        .Format, .Extend, .ZWJ => true,
-        else => false,
-    };
-}
-
-inline fn isAHLetter(wbp: WordBreakProperty) bool {
-    return wbp == .ALetter or wbp == .Hebrew_Letter;
-}
-
-inline fn isMidVal(wbp: WordBreakProperty) bool {
-    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
-}
-
-inline fn isMidNum(wbp: WordBreakProperty) bool {
-    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
-}
-
-inline fn isExtensible(wbp: WordBreakProperty) bool {
-    return switch (wbp) {
-        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
-        else => false,
-    };
-}
-
-test "Word Break Properties" {
-    const wb = try WordBreak.init(testing.allocator);
-    defer wb.deinit(testing.allocator);
-    try testing.expectEqual(.CR, wb.breakProperty('\r'));
-    try testing.expectEqual(.LF, wb.breakProperty('\n'));
-    try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
-    try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
-}
-
-test "ext_pict" {
-    try testing.expect(ext_pict.isMatch("👇"));
-    try testing.expect(ext_pict.isMatch("\u{2701}"));
-}
-
-test wordAtIndex {
-    const wb = try WordBreak.init(testing.allocator);
-    defer wb.deinit(testing.allocator);
-    const t_string = "first second third";
-    const second = wb.wordAtIndex(t_string, 8);
-    try testing.expectEqualStrings("second", second.bytes(t_string));
-    const third = wb.wordAtIndex(t_string, 14);
-    try testing.expectEqualStrings("third", third.bytes(t_string));
-    {
-        const first = wb.wordAtIndex(t_string, 3);
-        try testing.expectEqualStrings("first", first.bytes(t_string));
-    }
-    {
-        const first = wb.wordAtIndex(t_string, 0);
-        try testing.expectEqualStrings("first", first.bytes(t_string));
-    }
-    const last = wb.wordAtIndex(t_string, 14);
-    try testing.expectEqualStrings("third", last.bytes(t_string));
-}
-
-const testr = "don't a:ka fin!";
-
-test "reversal" {
-    const wb = try WordBreak.init(testing.allocator);
-    defer wb.deinit(testing.allocator);
-    {
-        var fwd = wb.iterator(testr);
-        var this_word: ?Word = fwd.next();
-
-        while (this_word) |this| : (this_word = fwd.next()) {
-            var back = fwd.reverseIterator();
-            const that_word = back.prev();
-            if (that_word) |that| {
-                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
-            } else {
-                try testing.expect(false);
-            }
-        }
-    }
-    {
-        var back = wb.reverseIterator(testr);
-        var this_word: ?Word = back.prev();
-
-        while (this_word) |this| : (this_word = back.prev()) {
-            var fwd = back.forwardIterator();
-            const that_word = fwd.next();
-            if (that_word) |that| {
-                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
-            } else {
-                try testing.expect(false);
-            }
-        }
-    }
-}
-
-fn testAllocations(allocator: Allocator) !void {
-    const wb = try WordBreak.init(allocator);
-    wb.deinit(allocator);
-}
-
-test "allocation safety" {
-    try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
-}
-
-const std = @import("std");
-const builtin = @import("builtin");
-const compress = std.compress;
-const mem = std.mem;
-const Allocator = mem.Allocator;
-const assert = std.debug.assert;
-const testing = std.testing;
-
-const code_point = @import("code_point");
-const CodepointIterator = code_point.Iterator;
-const ReverseCodepointIterator = code_point.ReverseIterator;
-const CodePoint = code_point.CodePoint;
-
-const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/Words.zig b/src/Words.zig
new file mode 100644
index 0000000..6a532f5
--- /dev/null
+++ b/src/Words.zig
@@ -0,0 +1,720 @@
+//! Word Breaking Algorithm.
+
+const WordBreakProperty = enum(u5) {
+    none,
+    Double_Quote,
+    Single_Quote,
+    Hebrew_Letter,
+    CR,
+    LF,
+    Newline,
+    Extend,
+    Regional_Indicator,
+    Format,
+    Katakana,
+    ALetter,
+    MidLetter,
+    MidNum,
+    MidNumLet,
+    Numeric,
+    ExtendNumLet,
+    ZWJ,
+    WSegSpace,
+};
+
+s1: []u16 = undefined,
+s2: []u5 = undefined,
+
+const WordBreak = @This();
+
+pub fn init(allocator: Allocator) Allocator.Error!WordBreak {
+    var wb: WordBreak = undefined;
+    try wb.setup(allocator);
+    return wb;
+}
+
+pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
+    wb.setupImpl(allocator) catch |err| {
+        switch (err) {
+            error.OutOfMemory => |e| return e,
+            else => unreachable,
+        }
+    };
+}
+
+pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
+    allocator.free(wordbreak.s1);
+    allocator.free(wordbreak.s2);
+}
+
+/// Represents a Unicode word span, as an offset into the source string
+/// and the length of the word.
+pub const Word = struct {
+    offset: u32,
+    len: u32,
+
+    /// Returns a slice of the word given the source string.
+    pub fn bytes(self: Word, src: []const u8) []const u8 {
+        return src[self.offset..][0..self.len];
+    }
+};
+
+/// Returns the word break property type for `cp`.
+pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
+    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
+}
+
+/// Convenience function for working with CodePoints
+fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
+    return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
+}
+
+/// Returns the Word at the given index.  Asserts that the index is less than
+/// `string.len`, and that the string is not empty. Always returns a word.
+/// The index does not have to be the start of a codepoint in the word.
+pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
+    assert(index < string.len and string.len > 0);
+    var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
+    const first_back = iter_back.prev();
+    if (first_back) |back| {
+        if (back.offset == 0) {
+            var iter_fwd = wordbreak.iterator(string);
+            while (iter_fwd.next()) |word| {
+                if (word.offset <= index and index < word.offset + word.len)
+                    return word;
+            }
+        }
+    } else {
+        var iter_fwd = wordbreak.iterator(string);
+        while (iter_fwd.next()) |word| {
+            if (word.offset <= index and index < word.offset + word.len)
+                return word;
+        }
+    }
+    const second_back = iter_back.prev();
+    if (second_back) |back| if (back.offset == 0) {
+        var iter_fwd = wordbreak.iterator(string);
+        while (iter_fwd.next()) |word| {
+            if (word.offset <= index and index < word.offset + word.len)
+                return word;
+        }
+    };
+    // There's sometimes flags:
+    if (iter_back.flags > 0) {
+        while (iter_back.flags > 0) {
+            if (iter_back.prev()) |_| {
+                continue;
+            } else {
+                break;
+            }
+        }
+    }
+    var iter_fwd = iter_back.forwardIterator();
+    while (iter_fwd.next()) |word| {
+        if (word.offset <= index and index < word.offset + word.len)
+            return word;
+    }
+    unreachable;
+}
+
+/// Returns an iterator over words in `slice`.
+pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
+    return Iterator.init(wordbreak, slice);
+}
+
+/// Returns a reverse iterator over the words in `slice`.
+pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {
+    return ReverseIterator.init(wordbreak, slice);
+}
+
+/// An iterator, forward, over all words in a provided string.
+pub const Iterator = struct {
+    this: ?CodePoint = null,
+    that: ?CodePoint = null,
+    cp_iter: CodepointIterator,
+    wb: *const WordBreak,
+
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
+        var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
+        wb_iter.advance();
+        return wb_iter;
+    }
+
+    /// Returns the next word segment, without advancing.
+    pub fn peek(iter: *Iterator) ?Word {
+        const cache = .{ iter.this, iter.that, iter.cp_iter };
+        defer {
+            iter.this, iter.that, iter.cp_iter = cache;
+        }
+        return iter.next();
+    }
+
+    /// Returns a reverse iterator from the point this iterator is paused
+    /// at.  Usually, and always when using the API to create iterators,
+    /// calling `prev()` will return the word just seen.
+    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
+        var cp_it = iter.cp_iter.reverseIterator();
+        if (iter.that) |_|
+            _ = cp_it.prev();
+        if (iter.cp_iter.peek()) |_|
+            _ = cp_it.prev();
+        return .{
+            .wb = iter.wb,
+            .before = cp_it.prev(),
+            .after = iter.that,
+            .cp_iter = cp_it,
+        };
+    }
+
+    /// Returns the next word segment, if any.
+    pub fn next(iter: *Iterator) ?Word {
+        iter.advance();
+
+        // Done?
+        if (iter.this == null) return null;
+        // Last?
+        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
+
+        const word_start = iter.this.?.offset;
+        var word_len: u32 = 0;
+
+        // State variables.
+        var last_p: WordBreakProperty = .none;
+        var last_last_p: WordBreakProperty = .none;
+        var ri_count: usize = 0;
+
+        scan: while (true) : (iter.advance()) {
+            const this = iter.this.?;
+            word_len += this.len;
+            if (iter.that) |that| {
+                const this_p = iter.wb.breakProp(this);
+                const that_p = iter.wb.breakProp(that);
+                if (!isIgnorable(this_p)) {
+                    last_last_p = last_p;
+                    last_p = this_p;
+                }
+                // WB3  CR × LF
+                if (this_p == .CR and that_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(this_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(that_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
+                    continue :scan;
+                }
+                // WB3d  WSegSpace × WSegSpace
+                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(that_p)) {
+                    continue :scan;
+                } // Now we use last_p instead of this_p for ignorable's sake
+                if (isAHLetter(last_p)) {
+                    // WB5  AHLetter × AHLetter
+                    if (isAHLetter(that_p)) continue :scan;
+                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                    if (isMidVal(that_p)) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProp(next_cp);
+                            if (isAHLetter(next_p)) {
+                                continue :scan;
+                            }
+                        }
+                    }
+                }
+                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
+                    continue :scan;
+                }
+                if (last_p == .Hebrew_Letter) {
+                    // WB7a  Hebrew_Letter × Single_Quote
+                    if (that_p == .Single_Quote) continue :scan;
+                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                    if (that_p == .Double_Quote) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProp(next_cp);
+                            if (next_p == .Hebrew_Letter) {
+                                continue :scan;
+                            }
+                        }
+                    }
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
+                    continue :scan;
+                // WB8  Numeric × Numeric
+                if (last_p == .Numeric and that_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
+                    continue :scan;
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (last_p == .Numeric and isMidNum(that_p)) {
+                    const next_val = iter.peekPast();
+                    if (next_val) |next_cp| {
+                        const next_p = iter.wb.breakProp(next_cp);
+                        if (next_p == .Numeric) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB13  Katakana × Katakana
+                if (last_p == .Katakana and that_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
+                const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
+                if (maybe_flag) {
+                    ri_count += 1;
+                    if (ri_count % 2 == 1) continue :scan;
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            } else { // iter.that == null
+                break :scan;
+            }
+        }
+
+        return Word{ .len = word_len, .offset = word_start };
+    }
+
+    pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        try writer.print(
+            "Iterator {{ .this = {any}, .that = {any} }}",
+            .{ iter.this, iter.that },
+        );
+    }
+
+    fn advance(iter: *Iterator) void {
+        iter.this = iter.that;
+        iter.that = iter.cp_iter.next();
+    }
+
+    fn peekPast(iter: *Iterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.next();
+        }
+        return null;
+    }
+};
+
+/// An iterator, backward, over all words in a provided string.
+pub const ReverseIterator = struct {
+    after: ?CodePoint = null,
+    before: ?CodePoint = null,
+    cp_iter: ReverseCodepointIterator,
+    wb: *const WordBreak,
+    flags: usize = 0,
+
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
+        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
+        wb_iter.advance();
+        return wb_iter;
+    }
+
+    /// Returns the previous word segment, if any, without advancing.
+    pub fn peek(iter: *ReverseIterator) ?Word {
+        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
+        defer {
+            iter.before, iter.after, iter.cp_iter, iter.flags = cache;
+        }
+        return iter.prev();
+    }
+
+    /// Return a forward iterator from where this iterator paused.  Usually,
+    /// and always when using the API to create iterators, calling `next()`
+    /// will return the word just seen.
+    pub fn forwardIterator(iter: *ReverseIterator) Iterator {
+        var cp_it = iter.cp_iter.forwardIterator();
+        if (iter.before) |_|
+            _ = cp_it.next();
+        return .{
+            .wb = iter.wb,
+            .this = cp_it.next(),
+            .that = iter.after,
+            .cp_iter = cp_it,
+        };
+    }
+
+    /// Return the previous word, if any.
+    pub fn prev(iter: *ReverseIterator) ?Word {
+        iter.advance();
+
+        // Done?
+        if (iter.after == null) return null;
+        // Last?
+        if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
+
+        const word_end = iter.after.?.offset + iter.after.?.len;
+        var word_len: u32 = 0;
+
+        // State variables.
+        var last_p: WordBreakProperty = .none;
+        var last_last_p: WordBreakProperty = .none;
+
+        scan: while (true) : (iter.advance()) {
+            const after = iter.after.?;
+            word_len += after.len;
+            if (iter.before) |before| {
+                var sneak = sneaky(iter); // 'sneaks' past ignorables
+                const after_p = iter.wb.breakProp(after);
+                var before_p = iter.wb.breakProp(before);
+                if (!isIgnorable(after_p)) {
+                    last_last_p = last_p;
+                    last_p = after_p;
+                }
+                // WB3  CR × LF
+                if (before_p == .CR and after_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(before_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(after_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
+                    continue :scan;
+                }
+                // WB3d  WSegSpace × WSegSpace
+                if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(before_p)) {
+                    const maybe_before = sneak.prev();
+                    if (maybe_before) |valid_before| {
+                        before_p = iter.wb.breakProp(valid_before);
+                    } else if (!isIgnorable(after_p)) {
+                        // We're done
+                        break :scan;
+                    }
+                }
+                if (isIgnorable(after_p)) continue :scan;
+                // WB5  AHLetter × AHLetter
+                if (isAHLetter(last_p) and isAHLetter(before_p)) {
+                    continue :scan;
+                }
+                // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
+                    continue :scan;
+                }
+                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (isMidVal(before_p) and isAHLetter(last_p)) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (isAHLetter(prev_p)) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB7a  Hebrew_Letter × Single_Quote
+                if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
+                // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
+                    continue :scan;
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (prev_p == .Hebrew_Letter) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB8  Numeric × Numeric
+                if (before_p == .Numeric and last_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (isMidNum(before_p) and last_p == .Numeric) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (prev_p == .Numeric) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
+                    continue :scan;
+                }
+                // WB13  Katakana × Katakana
+                if (before_p == .Katakana and last_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
+                // NOTE:
+                // So here we simply have to know whether a run of flags is even or odd.
+                // The whole run.  To avoid quadratic behavior (and long flag runs are
+                // actually a thing in the wild), we have to count them once, store that
+                // on the iterator, and decrement each time we see two, possibly breaking
+                // once extra at the beginning. They break up one per flag, once we hit
+                // zero, that's all the flags.  If we see another flag we do it again.
+                if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
+                    defer {
+                        if (iter.flags > 0) iter.flags -= 1;
+                    }
+                    if (iter.flags == 0) {
+                        iter.flags = sneak.countFlags();
+                    }
+                    if (iter.flags % 2 == 0) {
+                        continue :scan;
+                    }
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            }
+            break :scan;
+        }
+        return Word{ .len = word_len, .offset = word_end - word_len };
+    }
+
+    pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        try writer.print(
+            "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
+            .{ iter.before, iter.after, iter.flags },
+        );
+    }
+
+    fn peekPast(iter: *ReverseIterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.prev();
+        }
+        return null;
+    }
+
+    fn advance(iter: *ReverseIterator) void {
+        iter.after = iter.before;
+        iter.before = iter.cp_iter.prev();
+    }
+};
+
+//| Implementation Details
+
+/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
+fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
+    var idx: u32 = @intCast(index);
+    // Find the next lead byte:
+    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
+    if (idx == string.len) return wb.reverseIterator(string);
+    var iter: ReverseIterator = undefined;
+    iter.wb = wb;
+    iter.flags = 0;
+    // We need to populate the CodePoints, and the codepoint iterator.
+    // Consider "abc| def" with the cursor as |.
+    // We need `before` to be `c` and `after` to be ' ',
+    // and `cp_iter.prev()` to be `b`.
+    var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
+    iter.after = cp_iter.prev();
+    iter.before = cp_iter.prev();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
+
+fn sneaky(iter: *const ReverseIterator) SneakIterator {
+    return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
+}
+
+const SneakIterator = struct {
+    cp_iter: ReverseCodepointIterator,
+    wb: *const WordBreak,
+
+    fn peek(iter: *SneakIterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.prev();
+        }
+        return null;
+    }
+
+    fn countFlags(iter: *SneakIterator) usize {
+        var flags: usize = 0;
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.prev()) |cp| {
+            const prop = iter.wb.breakProp(cp);
+            if (isIgnorable(prop)) continue;
+            if (prop == .Regional_Indicator) {
+                flags += 1;
+            } else break;
+        }
+        return flags;
+    }
+
+    fn prev(iter: *SneakIterator) ?CodePoint {
+        while (iter.cp_iter.prev()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+        }
+        return null;
+    }
+};
+
+inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
+    const decompressor = compress.flate.inflate.decompressor;
+    const in_bytes = @embedFile("wbp");
+    var in_fbs = std.io.fixedBufferStream(in_bytes);
+    var in_decomp = decompressor(.raw, in_fbs.reader());
+    var reader = in_decomp.reader();
+
+    const endian = builtin.cpu.arch.endian();
+
+    const stage_1_len: u16 = try reader.readInt(u16, endian);
+    wb.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(wb.s1);
+    for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
+
+    const stage_2_len: u16 = try reader.readInt(u16, endian);
+    wb.s2 = try allocator.alloc(u5, stage_2_len);
+    errdefer allocator.free(wb.s2);
+    for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
+    var count_0: usize = 0;
+    for (wb.s2) |nyb| {
+        if (nyb == 0) count_0 += 1;
+    }
+}
+
+//| Predicates
+
+inline fn isNewline(wbp: WordBreakProperty) bool {
+    return wbp == .CR or wbp == .LF or wbp == .Newline;
+}
+
+inline fn isIgnorable(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .Format, .Extend, .ZWJ => true,
+        else => false,
+    };
+}
+
+inline fn isAHLetter(wbp: WordBreakProperty) bool {
+    return wbp == .ALetter or wbp == .Hebrew_Letter;
+}
+
+inline fn isMidVal(wbp: WordBreakProperty) bool {
+    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+
+inline fn isMidNum(wbp: WordBreakProperty) bool {
+    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+
+inline fn isExtensible(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
+        else => false,
+    };
+}
+
+test "Word Break Properties" {
+    const wb = try WordBreak.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    try testing.expectEqual(.CR, wb.breakProperty('\r'));
+    try testing.expectEqual(.LF, wb.breakProperty('\n'));
+    try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
+    try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
+}
+
+test "ext_pict" {
+    try testing.expect(ext_pict.isMatch("👇"));
+    try testing.expect(ext_pict.isMatch("\u{2701}"));
+}
+
+test wordAtIndex {
+    const wb = try WordBreak.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    const t_string = "first second third";
+    const second = wb.wordAtIndex(t_string, 8);
+    try testing.expectEqualStrings("second", second.bytes(t_string));
+    const third = wb.wordAtIndex(t_string, 14);
+    try testing.expectEqualStrings("third", third.bytes(t_string));
+    {
+        const first = wb.wordAtIndex(t_string, 3);
+        try testing.expectEqualStrings("first", first.bytes(t_string));
+    }
+    {
+        const first = wb.wordAtIndex(t_string, 0);
+        try testing.expectEqualStrings("first", first.bytes(t_string));
+    }
+    const last = wb.wordAtIndex(t_string, 14);
+    try testing.expectEqualStrings("third", last.bytes(t_string));
+}
+
+const testr = "don't a:ka fin!";
+
+test "reversal" {
+    const wb = try WordBreak.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    {
+        var fwd = wb.iterator(testr);
+        var this_word: ?Word = fwd.next();
+
+        while (this_word) |this| : (this_word = fwd.next()) {
+            var back = fwd.reverseIterator();
+            const that_word = back.prev();
+            if (that_word) |that| {
+                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
+            } else {
+                try testing.expect(false);
+            }
+        }
+    }
+    {
+        var back = wb.reverseIterator(testr);
+        var this_word: ?Word = back.prev();
+
+        while (this_word) |this| : (this_word = back.prev()) {
+            var fwd = back.forwardIterator();
+            const that_word = fwd.next();
+            if (that_word) |that| {
+                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
+            } else {
+                try testing.expect(false);
+            }
+        }
+    }
+}
+
+fn testAllocations(allocator: Allocator) !void {
+    const wb = try WordBreak.init(allocator);
+    wb.deinit(allocator);
+}
+
+test "allocation safety" {
+    try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
+}
+
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const assert = std.debug.assert;
+const testing = std.testing;
+
+const code_point = @import("code_point");
+const CodepointIterator = code_point.Iterator;
+const ReverseCodepointIterator = code_point.ReverseIterator;
+const CodePoint = code_point.CodePoint;
+
+const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
-- 
cgit v1.2.3