Merge branch 'develop-next'HEAD v0.14.1 master

author: Sam Atman 2025-07-08 12:15:32 -0400
committer: Sam Atman 2025-07-08 12:15:32 -0400
commit: 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree: 2607c185fd8053b84d60041fadc35c05a0225d34 /src/Words.zig
parent: Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent: Add Words.zig example to README (diff)
download: zg-master.tar.gz
zg-master.tar.xz
zg-master.zip
1 files changed, 773 insertions, 0 deletions
diff --git a/src/Words.zig b/src/Words.zig
new file mode 100644
index 0000000..617c34d
--- /dev/null
+++ b/src/Words.zig
@@ -0,0 +1,773 @@
+//! Word Breaking Algorithm.
+//!
+//! https://www.unicode.org/reports/tr29/#Word_Boundaries
+//!
+const WordBreakProperty = enum(u5) {
+    none,
+    Double_Quote,
+    Single_Quote,
+    Hebrew_Letter,
+    CR,
+    LF,
+    Newline,
+    Extend,
+    Regional_Indicator,
+    Format,
+    Katakana,
+    ALetter,
+    MidLetter,
+    MidNum,
+    MidNumLet,
+    Numeric,
+    ExtendNumLet,
+    ZWJ,
+    WSegSpace,
+};
+s1: []u16 = undefined,
+s2: []u5 = undefined,
+const Words = @This();
+pub fn init(allocator: Allocator) Allocator.Error!Words {
+    var wb: Words = undefined;
+    try wb.setup(allocator);
+    return wb;
+}
+pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
+    wb.setupImpl(allocator) catch |err| {
+        switch (err) {
+            error.OutOfMemory => |e| return e,
+            else => unreachable,
+        }
+    };
+}
+pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
+    allocator.free(words.s1);
+    allocator.free(words.s2);
+}
+/// Represents a Unicode word span, as an offset into the source string
+/// and the length of the word.
+pub const Word = struct {
+    offset: uoffset,
+    len: uoffset,
+    /// Returns a slice of the word given the source string.
+    pub fn bytes(word: Word, src: []const u8) []const u8 {
+        return src[word.offset..][0..word.len];
+    }
+};
+/// Returns the word break property type for `cp`.
+pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty {
+    return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]);
+}
+/// Convenience function for working with CodePoints
+fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty {
+    return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]);
+}
+/// Returns the Word at the given index.  Asserts that the index is less than
+/// `string.len`, and that the string is not empty. Always returns a word.
+/// The index does not have to be the start of a codepoint in the word.
+pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
+    assert(index < string.len and string.len > 0);
+    var iter_back: ReverseIterator = reverseFromIndex(words, string, index);
+    const first_back = iter_back.prev();
+    if (first_back) |back| {
+        if (back.offset == 0) {
+            var iter_fwd = words.iterator(string);
+            while (iter_fwd.next()) |word| {
+                if (word.offset <= index and index < word.offset + word.len)
+                    return word;
+            }
+        }
+    } else {
+        var iter_fwd = words.iterator(string);
+        while (iter_fwd.next()) |word| {
+            if (word.offset <= index and index < word.offset + word.len)
+                return word;
+        }
+    }
+    _ = iter_back.prev();
+    // There's sometimes flags:
+    if (iter_back.flags > 0) {
+        while (iter_back.flags > 0) {
+            if (iter_back.prev()) |_| {
+                continue;
+            } else {
+                break;
+            }
+        }
+    }
+    var iter_fwd = iter_back.forwardIterator();
+    while (iter_fwd.next()) |word| {
+        if (word.offset <= index and index < word.offset + word.len)
+            return word;
+    }
+    unreachable;
+}
+/// Returns an iterator over words in `slice`.
+pub fn iterator(words: *const Words, slice: []const u8) Iterator {
+    return Iterator.init(words, slice);
+}
+/// Returns a reverse iterator over the words in `slice`.
+pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
+    return ReverseIterator.init(words, slice);
+}
+/// Returns an iterator after the `word` in `slice`.
+pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
+    return forwardFromIndex(words, slice, word.offset + word.len);
+}
+/// Returns a reverse iterator before the `word` in `slice`.
+pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
+    return reverseFromIndex(words, slice, word.offset);
+}
+/// An iterator, forward, over all words in a provided string.
+pub const Iterator = struct {
+    this: ?CodePoint = null,
+    that: ?CodePoint = null,
+    cp_iter: CodepointIterator,
+    wb: *const Words,
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(words: *const Words, str: []const u8) Iterator {
+        var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words };
+        wb_iter.advance();
+        return wb_iter;
+    }
+    /// Returns the next word segment, without advancing.
+    pub fn peek(iter: *Iterator) ?Word {
+        const cache = .{ iter.this, iter.that, iter.cp_iter };
+        defer {
+            iter.this, iter.that, iter.cp_iter = cache;
+        }
+        return iter.next();
+    }
+    /// Returns a reverse iterator from the point this iterator is paused
+    /// at.  Usually, and always when using the API to create iterators,
+    /// calling `prev()` will return the word just seen.
+    pub fn reverseIterator(iter: *Iterator) ReverseIterator {
+        var cp_it = iter.cp_iter.reverseIterator();
+        if (iter.that) |_|
+            _ = cp_it.prev();
+        if (iter.cp_iter.peek()) |_|
+            _ = cp_it.prev();
+        return .{
+            .wb = iter.wb,
+            .before = cp_it.prev(),
+            .after = iter.that,
+            .cp_iter = cp_it,
+        };
+    }
+    /// Returns the next word segment, if any.
+    pub fn next(iter: *Iterator) ?Word {
+        iter.advance();
+        // Done?
+        if (iter.this == null) return null;
+        // Last?
+        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
+        const word_start = iter.this.?.offset;
+        var word_len: uoffset = 0;
+        // State variables.
+        var last_p: WordBreakProperty = .none;
+        var last_last_p: WordBreakProperty = .none;
+        var ri_count: usize = 0;
+        scan: while (true) : (iter.advance()) {
+            const this = iter.this.?;
+            word_len += this.len;
+            if (iter.that) |that| {
+                const this_p = iter.wb.breakProp(this);
+                const that_p = iter.wb.breakProp(that);
+                if (!isIgnorable(this_p)) {
+                    last_last_p = last_p;
+                    last_p = this_p;
+                }
+                // WB3  CR × LF
+                if (this_p == .CR and that_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(this_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(that_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
+                    continue :scan;
+                }
+                // WB3d  WSegSpace × WSegSpace
+                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(that_p)) {
+                    continue :scan;
+                } // Now we use last_p instead of this_p for ignorable's sake
+                if (isAHLetter(last_p)) {
+                    // WB5  AHLetter × AHLetter
+                    if (isAHLetter(that_p)) continue :scan;
+                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                    if (isMidVal(that_p)) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProp(next_cp);
+                            if (isAHLetter(next_p)) {
+                                continue :scan;
+                            }
+                        }
+                    }
+                }
+                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
+                    continue :scan;
+                }
+                if (last_p == .Hebrew_Letter) {
+                    // WB7a  Hebrew_Letter × Single_Quote
+                    if (that_p == .Single_Quote) continue :scan;
+                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                    if (that_p == .Double_Quote) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProp(next_cp);
+                            if (next_p == .Hebrew_Letter) {
+                                continue :scan;
+                            }
+                        }
+                    }
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
+                    continue :scan;
+                // WB8  Numeric × Numeric
+                if (last_p == .Numeric and that_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
+                    continue :scan;
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (last_p == .Numeric and isMidNum(that_p)) {
+                    const next_val = iter.peekPast();
+                    if (next_val) |next_cp| {
+                        const next_p = iter.wb.breakProp(next_cp);
+                        if (next_p == .Numeric) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB13  Katakana × Katakana
+                if (last_p == .Katakana and that_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
+                const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
+                if (maybe_flag) {
+                    ri_count += 1;
+                    if (ri_count % 2 == 1) continue :scan;
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            } else { // iter.that == null
+                break :scan;
+            }
+        }
+        return Word{ .len = word_len, .offset = word_start };
+    }
+    pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        try writer.print(
+            "Iterator {{ .this = {any}, .that = {any} }}",
+            .{ iter.this, iter.that },
+        );
+    }
+    fn advance(iter: *Iterator) void {
+        iter.this = iter.that;
+        iter.that = iter.cp_iter.next();
+    }
+    fn peekPast(iter: *Iterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.next();
+        }
+        return null;
+    }
+};
+/// An iterator, backward, over all words in a provided string.
+pub const ReverseIterator = struct {
+    after: ?CodePoint = null,
+    before: ?CodePoint = null,
+    cp_iter: ReverseCodepointIterator,
+    wb: *const Words,
+    flags: usize = 0,
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(words: *const Words, str: []const u8) ReverseIterator {
+        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words };
+        wb_iter.advance();
+        return wb_iter;
+    }
+    /// Returns the previous word segment, if any, without advancing.
+    pub fn peek(iter: *ReverseIterator) ?Word {
+        const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
+        defer {
+            iter.before, iter.after, iter.cp_iter, iter.flags = cache;
+        }
+        return iter.prev();
+    }
+    /// Return a forward iterator from where this iterator paused.  Usually,
+    /// and always when using the API to create iterators, calling `next()`
+    /// will return the word just seen.
+    pub fn forwardIterator(iter: *ReverseIterator) Iterator {
+        var cp_it = iter.cp_iter.forwardIterator();
+        if (iter.before) |_|
+            _ = cp_it.next();
+        return .{
+            .wb = iter.wb,
+            .this = cp_it.next(),
+            .that = iter.after,
+            .cp_iter = cp_it,
+        };
+    }
+    /// Return the previous word, if any.
+    pub fn prev(iter: *ReverseIterator) ?Word {
+        iter.advance();
+        // Done?
+        if (iter.after == null) return null;
+        // Last?
+        if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
+        const word_end = iter.after.?.offset + iter.after.?.len;
+        var word_len: uoffset = 0;
+        // State variables.
+        var last_p: WordBreakProperty = .none;
+        var last_last_p: WordBreakProperty = .none;
+        scan: while (true) : (iter.advance()) {
+            const after = iter.after.?;
+            word_len += after.len;
+            if (iter.before) |before| {
+                var sneak = sneaky(iter); // 'sneaks' past ignorables
+                const after_p = iter.wb.breakProp(after);
+                var before_p = iter.wb.breakProp(before);
+                if (!isIgnorable(after_p)) {
+                    last_last_p = last_p;
+                    last_p = after_p;
+                }
+                // WB3  CR × LF
+                if (before_p == .CR and after_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(before_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(after_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
+                    continue :scan;
+                }
+                // WB3d  WSegSpace × WSegSpace
+                if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(before_p)) {
+                    const maybe_before = sneak.prev();
+                    if (maybe_before) |valid_before| {
+                        before_p = iter.wb.breakProp(valid_before);
+                    } else if (!isIgnorable(after_p)) {
+                        // We're done
+                        break :scan;
+                    }
+                }
+                if (isIgnorable(after_p)) continue :scan;
+                // WB5  AHLetter × AHLetter
+                if (isAHLetter(last_p) and isAHLetter(before_p)) {
+                    continue :scan;
+                }
+                // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
+                    continue :scan;
+                }
+                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (isMidVal(before_p) and isAHLetter(last_p)) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (isAHLetter(prev_p)) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB7a  Hebrew_Letter × Single_Quote
+                if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
+                // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
+                    continue :scan;
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (prev_p == .Hebrew_Letter) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB8  Numeric × Numeric
+                if (before_p == .Numeric and last_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (isMidNum(before_p) and last_p == .Numeric) {
+                    const prev_val = sneak.peek();
+                    if (prev_val) |prev_cp| {
+                        const prev_p = iter.wb.breakProp(prev_cp);
+                        if (prev_p == .Numeric) {
+                            continue :scan;
+                        }
+                    }
+                }
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
+                    continue :scan;
+                }
+                // WB13  Katakana × Katakana
+                if (before_p == .Katakana and last_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
+                // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
+                // NOTE:
+                // So here we simply have to know whether a run of flags is even or odd.
+                // The whole run.  To avoid quadratic behavior (and long flag runs are
+                // actually a thing in the wild), we have to count them once, store that
+                // on the iterator, and decrement each time we see two, possibly breaking
+                // once extra at the beginning. They break up one per flag, once we hit
+                // zero, that's all the flags.  If we see another flag we do it again.
+                if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
+                    defer {
+                        if (iter.flags > 0) iter.flags -= 1;
+                    }
+                    if (iter.flags == 0) {
+                        iter.flags = sneak.countFlags();
+                    }
+                    if (iter.flags % 2 == 0) {
+                        continue :scan;
+                    }
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            }
+            break :scan;
+        }
+        return Word{ .len = word_len, .offset = word_end - word_len };
+    }
+    pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        try writer.print(
+            "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
+            .{ iter.before, iter.after, iter.flags },
+        );
+    }
+    fn peekPast(iter: *ReverseIterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.prev();
+        }
+        return null;
+    }
+    fn advance(iter: *ReverseIterator) void {
+        iter.after = iter.before;
+        iter.before = iter.cp_iter.prev();
+    }
+};
+//| Implementation Details
+/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
+fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
+    var idx: uoffset = @intCast(index);
+    // Find the next lead byte:
+    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
+    if (idx == string.len) return words.reverseIterator(string);
+    var iter: ReverseIterator = undefined;
+    iter.wb = words;
+    iter.flags = 0;
+    // We need to populate the CodePoints, and the codepoint iterator.
+    // Consider "abc| def" with the cursor as |.
+    // We need `before` to be `c` and `after` to be ' ',
+    // and `cp_iter.prev()` to be `b`.
+    var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
+    iter.after = cp_iter.prev();
+    iter.before = cp_iter.prev();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
+fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
+    var idx: uoffset = @intCast(index);
+    if (idx == string.len) {
+        return .{
+            .cp_iter = .{ .bytes = string, .i = idx },
+            .this = null,
+            .that = null,
+            .wb = words,
+        };
+    }
+    while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
+    if (idx == 0) return words.iterator(string);
+    var iter: Iterator = undefined;
+    iter.wb = words;
+    // We need to populate the CodePoints, and the codepoint iterator.
+    // Consider "abc |def" with the cursor as |.
+    // We need `this` to be ` ` and `that` to be 'd',
+    // and `cp_iter.next()` to be `d`.
+    idx -= 1;
+    while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
+    // "abc| def"
+    var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
+    iter.this = cp_iter.next();
+    iter.that = cp_iter.next();
+    iter.cp_iter = cp_iter;
+    return iter;
+}
+fn sneaky(iter: *const ReverseIterator) SneakIterator {
+    return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
+}
+const SneakIterator = struct {
+    cp_iter: ReverseCodepointIterator,
+    wb: *const Words,
+    fn peek(iter: *SneakIterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+            _ = iter.cp_iter.prev();
+        }
+        return null;
+    }
+    fn countFlags(iter: *SneakIterator) usize {
+        var flags: usize = 0;
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.prev()) |cp| {
+            const prop = iter.wb.breakProp(cp);
+            if (isIgnorable(prop)) continue;
+            if (prop == .Regional_Indicator) {
+                flags += 1;
+            } else break;
+        }
+        return flags;
+    }
+    fn prev(iter: *SneakIterator) ?CodePoint {
+        while (iter.cp_iter.prev()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
+        }
+        return null;
+    }
+};
+inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
+    const decompressor = compress.flate.inflate.decompressor;
+    const in_bytes = @embedFile("wbp");
+    var in_fbs = std.io.fixedBufferStream(in_bytes);
+    var in_decomp = decompressor(.raw, in_fbs.reader());
+    var reader = in_decomp.reader();
+    const endian = builtin.cpu.arch.endian();
+    const stage_1_len: u16 = try reader.readInt(u16, endian);
+    wb.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(wb.s1);
+    for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
+    const stage_2_len: u16 = try reader.readInt(u16, endian);
+    wb.s2 = try allocator.alloc(u5, stage_2_len);
+    errdefer allocator.free(wb.s2);
+    for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
+    var count_0: usize = 0;
+    for (wb.s2) |nyb| {
+        if (nyb == 0) count_0 += 1;
+    }
+}
+//| Predicates
+inline fn isNewline(wbp: WordBreakProperty) bool {
+    return wbp == .CR or wbp == .LF or wbp == .Newline;
+}
+inline fn isIgnorable(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .Format, .Extend, .ZWJ => true,
+        else => false,
+    };
+}
+inline fn isAHLetter(wbp: WordBreakProperty) bool {
+    return wbp == .ALetter or wbp == .Hebrew_Letter;
+}
+inline fn isMidVal(wbp: WordBreakProperty) bool {
+    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+inline fn isMidNum(wbp: WordBreakProperty) bool {
+    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+inline fn isExtensible(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
+        else => false,
+    };
+}
+test "Word Break Properties" {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    try testing.expectEqual(.CR, wb.breakProperty('\r'));
+    try testing.expectEqual(.LF, wb.breakProperty('\n'));
+    try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
+    try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
+}
+test "ext_pict" {
+    try testing.expect(ext_pict.isMatch("👇"));
+    try testing.expect(ext_pict.isMatch("\u{2701}"));
+}
+test "Words" {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    const word_str = "Metonym   Μετωνύμιο メトニム";
+    var w_iter = wb.iterator(word_str);
+    try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
+    // Spaces are "words" too!
+    try testing.expectEqualStrings("   ", w_iter.next().?.bytes(word_str));
+    const in_greek = w_iter.next().?;
+    for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
+        const at_index = wb.wordAtIndex(word_str, i).bytes(word_str);
+        try testing.expectEqualStrings("Μετωνύμιο", at_index);
+    }
+    _ = w_iter.next();
+    try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str));
+}
+test wordAtIndex {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    const t_string = "first second third";
+    const second = wb.wordAtIndex(t_string, 8);
+    try testing.expectEqualStrings("second", second.bytes(t_string));
+    const third = wb.wordAtIndex(t_string, 14);
+    try testing.expectEqualStrings("third", third.bytes(t_string));
+    {
+        const first = wb.wordAtIndex(t_string, 3);
+        try testing.expectEqualStrings("first", first.bytes(t_string));
+    }
+    {
+        const first = wb.wordAtIndex(t_string, 0);
+        try testing.expectEqualStrings("first", first.bytes(t_string));
+    }
+    const last = wb.wordAtIndex(t_string, 14);
+    try testing.expectEqualStrings("third", last.bytes(t_string));
+}
+const testr = "don't a:ka fin!";
+test "reversal" {
+    const wb = try Words.init(testing.allocator);
+    defer wb.deinit(testing.allocator);
+    {
+        var fwd = wb.iterator(testr);
+        var this_word: ?Word = fwd.next();
+        while (this_word) |this| : (this_word = fwd.next()) {
+            var back = fwd.reverseIterator();
+            const that_word = back.prev();
+            if (that_word) |that| {
+                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
+            } else {
+                try testing.expect(false);
+            }
+        }
+    }
+    {
+        var back = wb.reverseIterator(testr);
+        var this_word: ?Word = back.prev();
+        while (this_word) |this| : (this_word = back.prev()) {
+            var fwd = back.forwardIterator();
+            const that_word = fwd.next();
+            if (that_word) |that| {
+                try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
+            } else {
+                try testing.expect(false);
+            }
+        }
+    }
+}
+fn testAllocations(allocator: Allocator) !void {
+    const wb = try Words.init(allocator);
+    wb.deinit(allocator);
+}
+test "allocation safety" {
+    try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
+}
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+const Allocator = mem.Allocator;
+const assert = std.debug.assert;
+const testing = std.testing;
+const uoffset = code_point.uoffset;
+const code_point = @import("code_point");
+const CodepointIterator = code_point.Iterator;
+const ReverseCodepointIterator = code_point.ReverseIterator;
+const CodePoint = code_point.CodePoint;
+const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
author	Sam Atman	2025-07-08 12:15:32 -0400
committer	Sam Atman	2025-07-08 12:15:32 -0400
commit	9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree	2607c185fd8053b84d60041fadc35c05a0225d34 /src/Words.zig
parent	Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent	Add Words.zig example to README (diff)
download	zg-master.tar.gz zg-master.tar.xz zg-master.zip