From 21d55b730e25733c98fa58de42d256adf5f80d88 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:32 -0400 Subject: Move WordBreak to Words --- src/Words.zig | 720 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 720 insertions(+) create mode 100644 src/Words.zig (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig new file mode 100644 index 0000000..6a532f5 --- /dev/null +++ b/src/Words.zig @@ -0,0 +1,720 @@ +//! Word Breaking Algorithm. + +const WordBreakProperty = enum(u5) { + none, + Double_Quote, + Single_Quote, + Hebrew_Letter, + CR, + LF, + Newline, + Extend, + Regional_Indicator, + Format, + Katakana, + ALetter, + MidLetter, + MidNum, + MidNumLet, + Numeric, + ExtendNumLet, + ZWJ, + WSegSpace, +}; + +s1: []u16 = undefined, +s2: []u5 = undefined, + +const WordBreak = @This(); + +pub fn init(allocator: Allocator) Allocator.Error!WordBreak { + var wb: WordBreak = undefined; + try wb.setup(allocator); + return wb; +} + +pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { + wb.setupImpl(allocator) catch |err| { + switch (err) { + error.OutOfMemory => |e| return e, + else => unreachable, + } + }; +} + +pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { + allocator.free(wordbreak.s1); + allocator.free(wordbreak.s2); +} + +/// Represents a Unicode word span, as an offset into the source string +/// and the length of the word. +pub const Word = struct { + offset: u32, + len: u32, + + /// Returns a slice of the word given the source string. + pub fn bytes(self: Word, src: []const u8) []const u8 { + return src[self.offset..][0..self.len]; + } +}; + +/// Returns the word break property type for `cp`. +pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { + return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +} + +/// Convenience function for working with CodePoints +fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { + return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +} + +/// Returns the Word at the given index. Asserts that the index is less than +/// `string.len`, and that the string is not empty. Always returns a word. +/// The index does not have to be the start of a codepoint in the word. +pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { + assert(index < string.len and string.len > 0); + var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + const first_back = iter_back.prev(); + if (first_back) |back| { + if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + } else { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + const second_back = iter_back.prev(); + if (second_back) |back| if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + }; + // There's sometimes flags: + if (iter_back.flags > 0) { + while (iter_back.flags > 0) { + if (iter_back.prev()) |_| { + continue; + } else { + break; + } + } + } + var iter_fwd = iter_back.forwardIterator(); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + unreachable; +} + +/// Returns an iterator over words in `slice`. +pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { + return Iterator.init(wordbreak, slice); +} + +/// Returns a reverse iterator over the words in `slice`. +pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { + return ReverseIterator.init(wordbreak, slice); +} + +/// An iterator, forward, over all words in a provided string. +pub const Iterator = struct { + this: ?CodePoint = null, + that: ?CodePoint = null, + cp_iter: CodepointIterator, + wb: *const WordBreak, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the next word segment, without advancing. + pub fn peek(iter: *Iterator) ?Word { + const cache = .{ iter.this, iter.that, iter.cp_iter }; + defer { + iter.this, iter.that, iter.cp_iter = cache; + } + return iter.next(); + } + + /// Returns a reverse iterator from the point this iterator is paused + /// at. Usually, and always when using the API to create iterators, + /// calling `prev()` will return the word just seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + var cp_it = iter.cp_iter.reverseIterator(); + if (iter.that) |_| + _ = cp_it.prev(); + if (iter.cp_iter.peek()) |_| + _ = cp_it.prev(); + return .{ + .wb = iter.wb, + .before = cp_it.prev(), + .after = iter.that, + .cp_iter = cp_it, + }; + } + + /// Returns the next word segment, if any. + pub fn next(iter: *Iterator) ?Word { + iter.advance(); + + // Done? + if (iter.this == null) return null; + // Last? + if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; + + const word_start = iter.this.?.offset; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + var ri_count: usize = 0; + + scan: while (true) : (iter.advance()) { + const this = iter.this.?; + word_len += this.len; + if (iter.that) |that| { + const this_p = iter.wb.breakProp(this); + const that_p = iter.wb.breakProp(that); + if (!isIgnorable(this_p)) { + last_last_p = last_p; + last_p = this_p; + } + // WB3 CR × LF + if (this_p == .CR and that_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(this_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(that_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(that_p)) { + continue :scan; + } // Now we use last_p instead of this_p for ignorable's sake + if (isAHLetter(last_p)) { + // WB5 AHLetter × AHLetter + if (isAHLetter(that_p)) continue :scan; + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isMidVal(that_p)) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (isAHLetter(next_p)) { + continue :scan; + } + } + } + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { + continue :scan; + } + if (last_p == .Hebrew_Letter) { + // WB7a Hebrew_Letter × Single_Quote + if (that_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (that_p == .Double_Quote) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (next_p == .Hebrew_Letter) { + continue :scan; + } + } + } + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) + continue :scan; + // WB8 Numeric × Numeric + if (last_p == .Numeric and that_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) + continue :scan; + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (last_p == .Numeric and isMidNum(that_p)) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (next_p == .Numeric) { + continue :scan; + } + } + } + // WB13 Katakana × Katakana + if (last_p == .Katakana and that_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; + if (maybe_flag) { + ri_count += 1; + if (ri_count % 2 == 1) continue :scan; + } + // WB999 Any ÷ Any + break :scan; + } else { // iter.that == null + break :scan; + } + } + + return Word{ .len = word_len, .offset = word_start }; + } + + pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "Iterator {{ .this = {any}, .that = {any} }}", + .{ iter.this, iter.that }, + ); + } + + fn advance(iter: *Iterator) void { + iter.this = iter.that; + iter.that = iter.cp_iter.next(); + } + + fn peekPast(iter: *Iterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.next(); + } + return null; + } +}; + +/// An iterator, backward, over all words in a provided string. +pub const ReverseIterator = struct { + after: ?CodePoint = null, + before: ?CodePoint = null, + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + flags: usize = 0, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the previous word segment, if any, without advancing. + pub fn peek(iter: *ReverseIterator) ?Word { + const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; + defer { + iter.before, iter.after, iter.cp_iter, iter.flags = cache; + } + return iter.prev(); + } + + /// Return a forward iterator from where this iterator paused. Usually, + /// and always when using the API to create iterators, calling `next()` + /// will return the word just seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + var cp_it = iter.cp_iter.forwardIterator(); + if (iter.before) |_| + _ = cp_it.next(); + return .{ + .wb = iter.wb, + .this = cp_it.next(), + .that = iter.after, + .cp_iter = cp_it, + }; + } + + /// Return the previous word, if any. + pub fn prev(iter: *ReverseIterator) ?Word { + iter.advance(); + + // Done? + if (iter.after == null) return null; + // Last? + if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; + + const word_end = iter.after.?.offset + iter.after.?.len; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + + scan: while (true) : (iter.advance()) { + const after = iter.after.?; + word_len += after.len; + if (iter.before) |before| { + var sneak = sneaky(iter); // 'sneaks' past ignorables + const after_p = iter.wb.breakProp(after); + var before_p = iter.wb.breakProp(before); + if (!isIgnorable(after_p)) { + last_last_p = last_p; + last_p = after_p; + } + // WB3 CR × LF + if (before_p == .CR and after_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(before_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(after_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(before_p)) { + const maybe_before = sneak.prev(); + if (maybe_before) |valid_before| { + before_p = iter.wb.breakProp(valid_before); + } else if (!isIgnorable(after_p)) { + // We're done + break :scan; + } + } + if (isIgnorable(after_p)) continue :scan; + // WB5 AHLetter × AHLetter + if (isAHLetter(last_p) and isAHLetter(before_p)) { + continue :scan; + } + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { + continue :scan; + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isMidVal(before_p) and isAHLetter(last_p)) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (isAHLetter(prev_p)) { + continue :scan; + } + } + } + // WB7a Hebrew_Letter × Single_Quote + if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { + continue :scan; + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Hebrew_Letter) { + continue :scan; + } + } + } + // WB8 Numeric × Numeric + if (before_p == .Numeric and last_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (isMidNum(before_p) and last_p == .Numeric) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Numeric) { + continue :scan; + } + } + } + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { + continue :scan; + } + // WB13 Katakana × Katakana + if (before_p == .Katakana and last_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + // NOTE: + // So here we simply have to know whether a run of flags is even or odd. + // The whole run. To avoid quadratic behavior (and long flag runs are + // actually a thing in the wild), we have to count them once, store that + // on the iterator, and decrement each time we see two, possibly breaking + // once extra at the beginning. They break up one per flag, once we hit + // zero, that's all the flags. If we see another flag we do it again. + if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { + defer { + if (iter.flags > 0) iter.flags -= 1; + } + if (iter.flags == 0) { + iter.flags = sneak.countFlags(); + } + if (iter.flags % 2 == 0) { + continue :scan; + } + } + // WB999 Any ÷ Any + break :scan; + } + break :scan; + } + return Word{ .len = word_len, .offset = word_end - word_len }; + } + + pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", + .{ iter.before, iter.after, iter.flags }, + ); + } + + fn peekPast(iter: *ReverseIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn advance(iter: *ReverseIterator) void { + iter.after = iter.before; + iter.before = iter.cp_iter.prev(); + } +}; + +//| Implementation Details + +/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. +fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { + var idx: u32 = @intCast(index); + // Find the next lead byte: + while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} + if (idx == string.len) return wb.reverseIterator(string); + var iter: ReverseIterator = undefined; + iter.wb = wb; + iter.flags = 0; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc| def" with the cursor as |. + // We need `before` to be `c` and `after` to be ' ', + // and `cp_iter.prev()` to be `b`. + var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; + iter.after = cp_iter.prev(); + iter.before = cp_iter.prev(); + iter.cp_iter = cp_iter; + return iter; +} + +fn sneaky(iter: *const ReverseIterator) SneakIterator { + return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; +} + +const SneakIterator = struct { + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + + fn peek(iter: *SneakIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn countFlags(iter: *SneakIterator) usize { + var flags: usize = 0; + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.prev()) |cp| { + const prop = iter.wb.breakProp(cp); + if (isIgnorable(prop)) continue; + if (prop == .Regional_Indicator) { + flags += 1; + } else break; + } + return flags; + } + + fn prev(iter: *SneakIterator) ?CodePoint { + while (iter.cp_iter.prev()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + } + return null; + } +}; + +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + +//| Predicates + +inline fn isNewline(wbp: WordBreakProperty) bool { + return wbp == .CR or wbp == .LF or wbp == .Newline; +} + +inline fn isIgnorable(wbp: WordBreakProperty) bool { + return switch (wbp) { + .Format, .Extend, .ZWJ => true, + else => false, + }; +} + +inline fn isAHLetter(wbp: WordBreakProperty) bool { + return wbp == .ALetter or wbp == .Hebrew_Letter; +} + +inline fn isMidVal(wbp: WordBreakProperty) bool { + return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isMidNum(wbp: WordBreakProperty) bool { + return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isExtensible(wbp: WordBreakProperty) bool { + return switch (wbp) { + .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, + else => false, + }; +} + +test "Word Break Properties" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + try testing.expectEqual(.CR, wb.breakProperty('\r')); + try testing.expectEqual(.LF, wb.breakProperty('\n')); + try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); + try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); +} + +test "ext_pict" { + try testing.expect(ext_pict.isMatch("👇")); + try testing.expect(ext_pict.isMatch("\u{2701}")); +} + +test wordAtIndex { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + const t_string = "first second third"; + const second = wb.wordAtIndex(t_string, 8); + try testing.expectEqualStrings("second", second.bytes(t_string)); + const third = wb.wordAtIndex(t_string, 14); + try testing.expectEqualStrings("third", third.bytes(t_string)); + { + const first = wb.wordAtIndex(t_string, 3); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + { + const first = wb.wordAtIndex(t_string, 0); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + const last = wb.wordAtIndex(t_string, 14); + try testing.expectEqualStrings("third", last.bytes(t_string)); +} + +const testr = "don't a:ka fin!"; + +test "reversal" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + { + var fwd = wb.iterator(testr); + var this_word: ?Word = fwd.next(); + + while (this_word) |this| : (this_word = fwd.next()) { + var back = fwd.reverseIterator(); + const that_word = back.prev(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } + { + var back = wb.reverseIterator(testr); + var this_word: ?Word = back.prev(); + + while (this_word) |this| : (this_word = back.prev()) { + var fwd = back.forwardIterator(); + const that_word = fwd.next(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } +} + +fn testAllocations(allocator: Allocator) !void { + const wb = try WordBreak.init(allocator); + wb.deinit(allocator); +} + +test "allocation safety" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); +} + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const Allocator = mem.Allocator; +const assert = std.debug.assert; +const testing = std.testing; + +const code_point = @import("code_point"); +const CodepointIterator = code_point.Iterator; +const ReverseCodepointIterator = code_point.ReverseIterator; +const CodePoint = code_point.CodePoint; + +const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; -- cgit v1.2.3 From aa20bebade8eeb3ca75199dc252feb3edb203fb1 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:36 -0400 Subject: Words module In keeping with the new nomenclature, we're calling the module "Words", not "WordBreak". The latter is Unicode jargon, the module provides word iterators. Words are the figure, word breaks are the ground. --- src/Words.zig | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index 6a532f5..565a2fb 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) { s1: []u16 = undefined, s2: []u5 = undefined, -const WordBreak = @This(); +const Words = @This(); -pub fn init(allocator: Allocator) Allocator.Error!WordBreak { - var wb: WordBreak = undefined; +pub fn init(allocator: Allocator) Allocator.Error!Words { + var wb: Words = undefined; try wb.setup(allocator); return wb; } -pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { +pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { wb.setupImpl(allocator) catch |err| { switch (err) { error.OutOfMemory => |e| return e, @@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { }; } -pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { +pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { allocator.free(wordbreak.s1); allocator.free(wordbreak.s2); } @@ -60,19 +60,19 @@ pub const Word = struct { }; /// Returns the word break property type for `cp`. -pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { +pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } /// Convenience function for working with CodePoints -fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { +fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); } /// Returns the Word at the given index. Asserts that the index is less than /// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { +pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); const first_back = iter_back.prev(); @@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize } /// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { +pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } /// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { +pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { return ReverseIterator.init(wordbreak, slice); } @@ -132,10 +132,10 @@ pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, cp_iter: CodepointIterator, - wb: *const WordBreak, + wb: *const Words, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + pub fn init(wb: *const Words, str: []const u8) Iterator { var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; @@ -314,11 +314,11 @@ pub const ReverseIterator = struct { after: ?CodePoint = null, before: ?CodePoint = null, cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, + wb: *const Words, flags: usize = 0, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + pub fn init(wb: *const Words, str: []const u8) ReverseIterator { var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; @@ -511,7 +511,7 @@ pub const ReverseIterator = struct { //| Implementation Details /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { +fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} @@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator { const SneakIterator = struct { cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, + wb: *const Words, fn peek(iter: *SneakIterator) ?CodePoint { const save_cp = iter.cp_iter; @@ -570,7 +570,7 @@ const SneakIterator = struct { } }; -inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { +inline fn setupImpl(wb: *Words, allocator: Allocator) !void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("wbp"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool { } test "Word Break Properties" { - const wb = try WordBreak.init(testing.allocator); + const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); try testing.expectEqual(.CR, wb.breakProperty('\r')); try testing.expectEqual(.LF, wb.breakProperty('\n')); @@ -641,7 +641,7 @@ test "ext_pict" { } test wordAtIndex { - const wb = try WordBreak.init(testing.allocator); + const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); const t_string = "first second third"; const second = wb.wordAtIndex(t_string, 8); @@ -663,7 +663,7 @@ test wordAtIndex { const testr = "don't a:ka fin!"; test "reversal" { - const wb = try WordBreak.init(testing.allocator); + const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); { var fwd = wb.iterator(testr); @@ -696,7 +696,7 @@ test "reversal" { } fn testAllocations(allocator: Allocator) !void { - const wb = try WordBreak.init(allocator); + const wb = try Words.init(allocator); wb.deinit(allocator); } -- cgit v1.2.3 From ef27c51b8e46f3909a27fd137429b717797f1fd9 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 16:48:55 -0400 Subject: Add iterateBefore and iterateAfter These create reverse or forward iterators before or after a Word. So this way, the user can get the word at an index, then iterate forward or back from that word. Also: Fixes #59 Which was fixed awhile back, but I don't feel like doing repo surgery to tag the fix where it happened. We have blame for that kind of thing. --- src/Words.zig | 98 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 32 deletions(-) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index 565a2fb..1d10b2a 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -1,4 +1,7 @@ //! Word Breaking Algorithm. +//! +//! https://www.unicode.org/reports/tr29/#Word_Boundaries +//! const WordBreakProperty = enum(u5) { none, @@ -42,9 +45,9 @@ pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { }; } -pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { - allocator.free(wordbreak.s1); - allocator.free(wordbreak.s2); +pub fn deinit(words: *const Words, allocator: mem.Allocator) void { + allocator.free(words.s1); + allocator.free(words.s2); } /// Represents a Unicode word span, as an offset into the source string @@ -54,51 +57,44 @@ pub const Word = struct { len: u32, /// Returns a slice of the word given the source string. - pub fn bytes(self: Word, src: []const u8) []const u8 { - return src[self.offset..][0..self.len]; + pub fn bytes(word: Word, src: []const u8) []const u8 { + return src[word.offset..][0..word.len]; } }; /// Returns the word break property type for `cp`. -pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { - return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { + return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); } /// Convenience function for working with CodePoints -fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { - return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { + return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); } /// Returns the Word at the given index. Asserts that the index is less than /// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { +pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); - var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + var iter_back: ReverseIterator = reverseFromIndex(words, string, index); const first_back = iter_back.prev(); if (first_back) |back| { if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); + var iter_fwd = words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } } else { - var iter_fwd = wordbreak.iterator(string); + var iter_fwd = words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } - const second_back = iter_back.prev(); - if (second_back) |back| if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - }; + _ = iter_back.prev(); // There's sometimes flags: if (iter_back.flags > 0) { while (iter_back.flags > 0) { @@ -118,13 +114,23 @@ pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Wo } /// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { - return Iterator.init(wordbreak, slice); +pub fn iterator(words: *const Words, slice: []const u8) Iterator { + return Iterator.init(words, slice); } /// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { - return ReverseIterator.init(wordbreak, slice); +pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { + return ReverseIterator.init(words, slice); +} + +/// Returns an iterator after the `word` in `slice`. +pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { + return forwardFromIndex(words, slice, word.offset + word.len); +} + +/// Returns a reverse iterator before the `word` in `slice`. +pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { + return reverseFromIndex(words, slice, word.offset); } /// An iterator, forward, over all words in a provided string. @@ -135,8 +141,8 @@ pub const Iterator = struct { wb: *const Words, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const Words, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(words: *const Words, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; wb_iter.advance(); return wb_iter; } @@ -318,8 +324,8 @@ pub const ReverseIterator = struct { flags: usize = 0, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const Words, str: []const u8) ReverseIterator { - var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(words: *const Words, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; wb_iter.advance(); return wb_iter; } @@ -511,13 +517,13 @@ pub const ReverseIterator = struct { //| Implementation Details /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { +fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} - if (idx == string.len) return wb.reverseIterator(string); + if (idx == string.len) return words.reverseIterator(string); var iter: ReverseIterator = undefined; - iter.wb = wb; + iter.wb = words; iter.flags = 0; // We need to populate the CodePoints, and the codepoint iterator. // Consider "abc| def" with the cursor as |. @@ -530,6 +536,34 @@ fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterat return iter; } +fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { + var idx: u32 = @intCast(index); + if (idx == string.len) { + return .{ + .cp_iter = .{ .bytes = string, .i = idx }, + .this = null, + .that = null, + .wb = words, + }; + } + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + if (idx == 0) return words.iterator(string); + var iter: Iterator = undefined; + iter.wb = words; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc |def" with the cursor as |. + // We need `this` to be ` ` and `that` to be 'd', + // and `cp_iter.next()` to be `d`. + idx -= 1; + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + // "abc| def" + var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx }; + iter.this = cp_iter.next(); + iter.that = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + fn sneaky(iter: *const ReverseIterator) SneakIterator { return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; } -- cgit v1.2.3 From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- src/Words.zig | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index 1d10b2a..1707881 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void { /// Represents a Unicode word span, as an offset into the source string /// and the length of the word. pub const Word = struct { - offset: u32, - len: u32, + offset: uoffset, + len: uoffset, /// Returns a slice of the word given the source string. pub fn bytes(word: Word, src: []const u8) []const u8 { @@ -183,7 +183,7 @@ pub const Iterator = struct { if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; const word_start = iter.this.?.offset; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -364,7 +364,7 @@ pub const ReverseIterator = struct { if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; const word_end = iter.after.?.offset + iter.after.?.len; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -518,7 +518,7 @@ pub const ReverseIterator = struct { /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} if (idx == string.len) return words.reverseIterator(string); @@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever } fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); if (idx == string.len) { return .{ .cp_iter = .{ .bytes = string, .i = idx }, @@ -746,6 +746,8 @@ const Allocator = mem.Allocator; const assert = std.debug.assert; const testing = std.testing; +const uoffset = code_point.uoffset; + const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; const ReverseCodepointIterator = code_point.ReverseIterator; -- cgit v1.2.3 From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/Words.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index 1707881..af82562 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -124,12 +124,12 @@ pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { } /// Returns an iterator after the `word` in `slice`. -pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { +pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { return forwardFromIndex(words, slice, word.offset + word.len); } /// Returns a reverse iterator before the `word` in `slice`. -pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { +pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { return reverseFromIndex(words, slice, word.offset); } -- cgit v1.2.3 From e3082e64b3ab8a8aa0777d63be69eb8b6d50a654 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 8 Jul 2025 12:12:20 -0400 Subject: Add Words.zig example to README --- src/Words.zig | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index af82562..617c34d 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -674,6 +674,23 @@ test "ext_pict" { try testing.expect(ext_pict.isMatch("\u{2701}")); } +test "Words" { + const wb = try Words.init(testing.allocator); + defer wb.deinit(testing.allocator); + const word_str = "Metonym Μετωνύμιο メトニム"; + var w_iter = wb.iterator(word_str); + try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); + // Spaces are "words" too! + try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); + const in_greek = w_iter.next().?; + for (in_greek.offset..in_greek.offset + in_greek.len) |i| { + const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + try testing.expectEqualStrings("Μετωνύμιο", at_index); + } + _ = w_iter.next(); + try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); +} + test wordAtIndex { const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); -- cgit v1.2.3