From 21d55b730e25733c98fa58de42d256adf5f80d88 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:06:32 -0400 Subject: Move WordBreak to Words --- src/WordBreak.zig | 720 ------------------------------------------------------ src/Words.zig | 720 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 720 insertions(+), 720 deletions(-) delete mode 100644 src/WordBreak.zig create mode 100644 src/Words.zig (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig deleted file mode 100644 index 6a532f5..0000000 --- a/src/WordBreak.zig +++ /dev/null @@ -1,720 +0,0 @@ -//! Word Breaking Algorithm. - -const WordBreakProperty = enum(u5) { - none, - Double_Quote, - Single_Quote, - Hebrew_Letter, - CR, - LF, - Newline, - Extend, - Regional_Indicator, - Format, - Katakana, - ALetter, - MidLetter, - MidNum, - MidNumLet, - Numeric, - ExtendNumLet, - ZWJ, - WSegSpace, -}; - -s1: []u16 = undefined, -s2: []u5 = undefined, - -const WordBreak = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!WordBreak { - var wb: WordBreak = undefined; - try wb.setup(allocator); - return wb; -} - -pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { - wb.setupImpl(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; -} - -pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { - allocator.free(wordbreak.s1); - allocator.free(wordbreak.s2); -} - -/// Represents a Unicode word span, as an offset into the source string -/// and the length of the word. -pub const Word = struct { - offset: u32, - len: u32, - - /// Returns a slice of the word given the source string. - pub fn bytes(self: Word, src: []const u8) []const u8 { - return src[self.offset..][0..self.len]; - } -}; - -/// Returns the word break property type for `cp`. -pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { - return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); -} - -/// Convenience function for working with CodePoints -fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { - return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); -} - -/// Returns the Word at the given index. Asserts that the index is less than -/// `string.len`, and that the string is not empty. Always returns a word. -/// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { - assert(index < string.len and string.len > 0); - var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); - const first_back = iter_back.prev(); - if (first_back) |back| { - if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - } - } else { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - } - const second_back = iter_back.prev(); - if (second_back) |back| if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - }; - // There's sometimes flags: - if (iter_back.flags > 0) { - while (iter_back.flags > 0) { - if (iter_back.prev()) |_| { - continue; - } else { - break; - } - } - } - var iter_fwd = iter_back.forwardIterator(); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - unreachable; -} - -/// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { - return Iterator.init(wordbreak, slice); -} - -/// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { - return ReverseIterator.init(wordbreak, slice); -} - -/// An iterator, forward, over all words in a provided string. -pub const Iterator = struct { - this: ?CodePoint = null, - that: ?CodePoint = null, - cp_iter: CodepointIterator, - wb: *const WordBreak, - - /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; - wb_iter.advance(); - return wb_iter; - } - - /// Returns the next word segment, without advancing. - pub fn peek(iter: *Iterator) ?Word { - const cache = .{ iter.this, iter.that, iter.cp_iter }; - defer { - iter.this, iter.that, iter.cp_iter = cache; - } - return iter.next(); - } - - /// Returns a reverse iterator from the point this iterator is paused - /// at. Usually, and always when using the API to create iterators, - /// calling `prev()` will return the word just seen. - pub fn reverseIterator(iter: *Iterator) ReverseIterator { - var cp_it = iter.cp_iter.reverseIterator(); - if (iter.that) |_| - _ = cp_it.prev(); - if (iter.cp_iter.peek()) |_| - _ = cp_it.prev(); - return .{ - .wb = iter.wb, - .before = cp_it.prev(), - .after = iter.that, - .cp_iter = cp_it, - }; - } - - /// Returns the next word segment, if any. - pub fn next(iter: *Iterator) ?Word { - iter.advance(); - - // Done? - if (iter.this == null) return null; - // Last? - if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; - - const word_start = iter.this.?.offset; - var word_len: u32 = 0; - - // State variables. - var last_p: WordBreakProperty = .none; - var last_last_p: WordBreakProperty = .none; - var ri_count: usize = 0; - - scan: while (true) : (iter.advance()) { - const this = iter.this.?; - word_len += this.len; - if (iter.that) |that| { - const this_p = iter.wb.breakProp(this); - const that_p = iter.wb.breakProp(that); - if (!isIgnorable(this_p)) { - last_last_p = last_p; - last_p = this_p; - } - // WB3 CR × LF - if (this_p == .CR and that_p == .LF) continue :scan; - // WB3a (Newline | CR | LF) ÷ - if (isNewline(this_p)) break :scan; - // WB3b ÷ (Newline | CR | LF) - if (isNewline(that_p)) break :scan; - // WB3c ZWJ × \p{Extended_Pictographic} - if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { - continue :scan; - } - // WB3d WSegSpace × WSegSpace - if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; - // WB4 X (Extend | Format | ZWJ)* → X - if (isIgnorable(that_p)) { - continue :scan; - } // Now we use last_p instead of this_p for ignorable's sake - if (isAHLetter(last_p)) { - // WB5 AHLetter × AHLetter - if (isAHLetter(that_p)) continue :scan; - // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter - if (isMidVal(that_p)) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); - if (isAHLetter(next_p)) { - continue :scan; - } - } - } - } - // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { - continue :scan; - } - if (last_p == .Hebrew_Letter) { - // WB7a Hebrew_Letter × Single_Quote - if (that_p == .Single_Quote) continue :scan; - // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter - if (that_p == .Double_Quote) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); - if (next_p == .Hebrew_Letter) { - continue :scan; - } - } - } - } - // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) - continue :scan; - // WB8 Numeric × Numeric - if (last_p == .Numeric and that_p == .Numeric) continue :scan; - // WB9 AHLetter × Numeric - if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; - // WB10 Numeric × AHLetter - if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; - // WB11 Numeric (MidNum | MidNumLetQ) × Numeric - if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) - continue :scan; - // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (last_p == .Numeric and isMidNum(that_p)) { - const next_val = iter.peekPast(); - if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); - if (next_p == .Numeric) { - continue :scan; - } - } - } - // WB13 Katakana × Katakana - if (last_p == .Katakana and that_p == .Katakana) continue :scan; - // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet - if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; - // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) - if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; - // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; - if (maybe_flag) { - ri_count += 1; - if (ri_count % 2 == 1) continue :scan; - } - // WB999 Any ÷ Any - break :scan; - } else { // iter.that == null - break :scan; - } - } - - return Word{ .len = word_len, .offset = word_start }; - } - - pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { - try writer.print( - "Iterator {{ .this = {any}, .that = {any} }}", - .{ iter.this, iter.that }, - ); - } - - fn advance(iter: *Iterator) void { - iter.this = iter.that; - iter.that = iter.cp_iter.next(); - } - - fn peekPast(iter: *Iterator) ?CodePoint { - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - _ = iter.cp_iter.next(); - } - return null; - } -}; - -/// An iterator, backward, over all words in a provided string. -pub const ReverseIterator = struct { - after: ?CodePoint = null, - before: ?CodePoint = null, - cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, - flags: usize = 0, - - /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { - var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; - wb_iter.advance(); - return wb_iter; - } - - /// Returns the previous word segment, if any, without advancing. - pub fn peek(iter: *ReverseIterator) ?Word { - const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; - defer { - iter.before, iter.after, iter.cp_iter, iter.flags = cache; - } - return iter.prev(); - } - - /// Return a forward iterator from where this iterator paused. Usually, - /// and always when using the API to create iterators, calling `next()` - /// will return the word just seen. - pub fn forwardIterator(iter: *ReverseIterator) Iterator { - var cp_it = iter.cp_iter.forwardIterator(); - if (iter.before) |_| - _ = cp_it.next(); - return .{ - .wb = iter.wb, - .this = cp_it.next(), - .that = iter.after, - .cp_iter = cp_it, - }; - } - - /// Return the previous word, if any. - pub fn prev(iter: *ReverseIterator) ?Word { - iter.advance(); - - // Done? - if (iter.after == null) return null; - // Last? - if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; - - const word_end = iter.after.?.offset + iter.after.?.len; - var word_len: u32 = 0; - - // State variables. - var last_p: WordBreakProperty = .none; - var last_last_p: WordBreakProperty = .none; - - scan: while (true) : (iter.advance()) { - const after = iter.after.?; - word_len += after.len; - if (iter.before) |before| { - var sneak = sneaky(iter); // 'sneaks' past ignorables - const after_p = iter.wb.breakProp(after); - var before_p = iter.wb.breakProp(before); - if (!isIgnorable(after_p)) { - last_last_p = last_p; - last_p = after_p; - } - // WB3 CR × LF - if (before_p == .CR and after_p == .LF) continue :scan; - // WB3a (Newline | CR | LF) ÷ - if (isNewline(before_p)) break :scan; - // WB3b ÷ (Newline | CR | LF) - if (isNewline(after_p)) break :scan; - // WB3c ZWJ × \p{Extended_Pictographic} - if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { - continue :scan; - } - // WB3d WSegSpace × WSegSpace - if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; - // WB4 X (Extend | Format | ZWJ)* → X - if (isIgnorable(before_p)) { - const maybe_before = sneak.prev(); - if (maybe_before) |valid_before| { - before_p = iter.wb.breakProp(valid_before); - } else if (!isIgnorable(after_p)) { - // We're done - break :scan; - } - } - if (isIgnorable(after_p)) continue :scan; - // WB5 AHLetter × AHLetter - if (isAHLetter(last_p) and isAHLetter(before_p)) { - continue :scan; - } - // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter - if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { - continue :scan; - } - // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter - if (isMidVal(before_p) and isAHLetter(last_p)) { - const prev_val = sneak.peek(); - if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); - if (isAHLetter(prev_p)) { - continue :scan; - } - } - } - // WB7a Hebrew_Letter × Single_Quote - if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; - // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter - if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { - continue :scan; - } - // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { - const prev_val = sneak.peek(); - if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); - if (prev_p == .Hebrew_Letter) { - continue :scan; - } - } - } - // WB8 Numeric × Numeric - if (before_p == .Numeric and last_p == .Numeric) continue :scan; - // WB9 AHLetter × Numeric - if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; - // WB10 Numeric × AHLetter - if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; - // WB11 Numeric (MidNum | MidNumLetQ) × Numeric - if (isMidNum(before_p) and last_p == .Numeric) { - const prev_val = sneak.peek(); - if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); - if (prev_p == .Numeric) { - continue :scan; - } - } - } - // WB12 Numeric × (MidNum | MidNumLetQ) Numeric - if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { - continue :scan; - } - // WB13 Katakana × Katakana - if (before_p == .Katakana and last_p == .Katakana) continue :scan; - // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet - if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; - // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) - if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; - // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI - // NOTE: - // So here we simply have to know whether a run of flags is even or odd. - // The whole run. To avoid quadratic behavior (and long flag runs are - // actually a thing in the wild), we have to count them once, store that - // on the iterator, and decrement each time we see two, possibly breaking - // once extra at the beginning. They break up one per flag, once we hit - // zero, that's all the flags. If we see another flag we do it again. - if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { - defer { - if (iter.flags > 0) iter.flags -= 1; - } - if (iter.flags == 0) { - iter.flags = sneak.countFlags(); - } - if (iter.flags % 2 == 0) { - continue :scan; - } - } - // WB999 Any ÷ Any - break :scan; - } - break :scan; - } - return Word{ .len = word_len, .offset = word_end - word_len }; - } - - pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { - try writer.print( - "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", - .{ iter.before, iter.after, iter.flags }, - ); - } - - fn peekPast(iter: *ReverseIterator) ?CodePoint { - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - _ = iter.cp_iter.prev(); - } - return null; - } - - fn advance(iter: *ReverseIterator) void { - iter.after = iter.before; - iter.before = iter.cp_iter.prev(); - } -}; - -//| Implementation Details - -/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { - var idx: u32 = @intCast(index); - // Find the next lead byte: - while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} - if (idx == string.len) return wb.reverseIterator(string); - var iter: ReverseIterator = undefined; - iter.wb = wb; - iter.flags = 0; - // We need to populate the CodePoints, and the codepoint iterator. - // Consider "abc| def" with the cursor as |. - // We need `before` to be `c` and `after` to be ' ', - // and `cp_iter.prev()` to be `b`. - var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; - iter.after = cp_iter.prev(); - iter.before = cp_iter.prev(); - iter.cp_iter = cp_iter; - return iter; -} - -fn sneaky(iter: *const ReverseIterator) SneakIterator { - return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; -} - -const SneakIterator = struct { - cp_iter: ReverseCodepointIterator, - wb: *const WordBreak, - - fn peek(iter: *SneakIterator) ?CodePoint { - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - _ = iter.cp_iter.prev(); - } - return null; - } - - fn countFlags(iter: *SneakIterator) usize { - var flags: usize = 0; - const save_cp = iter.cp_iter; - defer iter.cp_iter = save_cp; - while (iter.cp_iter.prev()) |cp| { - const prop = iter.wb.breakProp(cp); - if (isIgnorable(prop)) continue; - if (prop == .Regional_Indicator) { - flags += 1; - } else break; - } - return flags; - } - - fn prev(iter: *SneakIterator) ?CodePoint { - while (iter.cp_iter.prev()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; - } - return null; - } -}; - -inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("wbp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - const stage_1_len: u16 = try reader.readInt(u16, endian); - wb.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(wb.s1); - for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - wb.s2 = try allocator.alloc(u5, stage_2_len); - errdefer allocator.free(wb.s2); - for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); - var count_0: usize = 0; - for (wb.s2) |nyb| { - if (nyb == 0) count_0 += 1; - } -} - -//| Predicates - -inline fn isNewline(wbp: WordBreakProperty) bool { - return wbp == .CR or wbp == .LF or wbp == .Newline; -} - -inline fn isIgnorable(wbp: WordBreakProperty) bool { - return switch (wbp) { - .Format, .Extend, .ZWJ => true, - else => false, - }; -} - -inline fn isAHLetter(wbp: WordBreakProperty) bool { - return wbp == .ALetter or wbp == .Hebrew_Letter; -} - -inline fn isMidVal(wbp: WordBreakProperty) bool { - return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; -} - -inline fn isMidNum(wbp: WordBreakProperty) bool { - return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; -} - -inline fn isExtensible(wbp: WordBreakProperty) bool { - return switch (wbp) { - .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, - else => false, - }; -} - -test "Word Break Properties" { - const wb = try WordBreak.init(testing.allocator); - defer wb.deinit(testing.allocator); - try testing.expectEqual(.CR, wb.breakProperty('\r')); - try testing.expectEqual(.LF, wb.breakProperty('\n')); - try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); - try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); -} - -test "ext_pict" { - try testing.expect(ext_pict.isMatch("👇")); - try testing.expect(ext_pict.isMatch("\u{2701}")); -} - -test wordAtIndex { - const wb = try WordBreak.init(testing.allocator); - defer wb.deinit(testing.allocator); - const t_string = "first second third"; - const second = wb.wordAtIndex(t_string, 8); - try testing.expectEqualStrings("second", second.bytes(t_string)); - const third = wb.wordAtIndex(t_string, 14); - try testing.expectEqualStrings("third", third.bytes(t_string)); - { - const first = wb.wordAtIndex(t_string, 3); - try testing.expectEqualStrings("first", first.bytes(t_string)); - } - { - const first = wb.wordAtIndex(t_string, 0); - try testing.expectEqualStrings("first", first.bytes(t_string)); - } - const last = wb.wordAtIndex(t_string, 14); - try testing.expectEqualStrings("third", last.bytes(t_string)); -} - -const testr = "don't a:ka fin!"; - -test "reversal" { - const wb = try WordBreak.init(testing.allocator); - defer wb.deinit(testing.allocator); - { - var fwd = wb.iterator(testr); - var this_word: ?Word = fwd.next(); - - while (this_word) |this| : (this_word = fwd.next()) { - var back = fwd.reverseIterator(); - const that_word = back.prev(); - if (that_word) |that| { - try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); - } else { - try testing.expect(false); - } - } - } - { - var back = wb.reverseIterator(testr); - var this_word: ?Word = back.prev(); - - while (this_word) |this| : (this_word = back.prev()) { - var fwd = back.forwardIterator(); - const that_word = fwd.next(); - if (that_word) |that| { - try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); - } else { - try testing.expect(false); - } - } - } -} - -fn testAllocations(allocator: Allocator) !void { - const wb = try WordBreak.init(allocator); - wb.deinit(allocator); -} - -test "allocation safety" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); -} - -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const Allocator = mem.Allocator; -const assert = std.debug.assert; -const testing = std.testing; - -const code_point = @import("code_point"); -const CodepointIterator = code_point.Iterator; -const ReverseCodepointIterator = code_point.ReverseIterator; -const CodePoint = code_point.CodePoint; - -const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; diff --git a/src/Words.zig b/src/Words.zig new file mode 100644 index 0000000..6a532f5 --- /dev/null +++ b/src/Words.zig @@ -0,0 +1,720 @@ +//! Word Breaking Algorithm. + +const WordBreakProperty = enum(u5) { + none, + Double_Quote, + Single_Quote, + Hebrew_Letter, + CR, + LF, + Newline, + Extend, + Regional_Indicator, + Format, + Katakana, + ALetter, + MidLetter, + MidNum, + MidNumLet, + Numeric, + ExtendNumLet, + ZWJ, + WSegSpace, +}; + +s1: []u16 = undefined, +s2: []u5 = undefined, + +const WordBreak = @This(); + +pub fn init(allocator: Allocator) Allocator.Error!WordBreak { + var wb: WordBreak = undefined; + try wb.setup(allocator); + return wb; +} + +pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { + wb.setupImpl(allocator) catch |err| { + switch (err) { + error.OutOfMemory => |e| return e, + else => unreachable, + } + }; +} + +pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { + allocator.free(wordbreak.s1); + allocator.free(wordbreak.s2); +} + +/// Represents a Unicode word span, as an offset into the source string +/// and the length of the word. +pub const Word = struct { + offset: u32, + len: u32, + + /// Returns a slice of the word given the source string. + pub fn bytes(self: Word, src: []const u8) []const u8 { + return src[self.offset..][0..self.len]; + } +}; + +/// Returns the word break property type for `cp`. +pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { + return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +} + +/// Convenience function for working with CodePoints +fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { + return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +} + +/// Returns the Word at the given index. Asserts that the index is less than +/// `string.len`, and that the string is not empty. Always returns a word. +/// The index does not have to be the start of a codepoint in the word. +pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { + assert(index < string.len and string.len > 0); + var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + const first_back = iter_back.prev(); + if (first_back) |back| { + if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + } else { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + } + const second_back = iter_back.prev(); + if (second_back) |back| if (back.offset == 0) { + var iter_fwd = wordbreak.iterator(string); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + }; + // There's sometimes flags: + if (iter_back.flags > 0) { + while (iter_back.flags > 0) { + if (iter_back.prev()) |_| { + continue; + } else { + break; + } + } + } + var iter_fwd = iter_back.forwardIterator(); + while (iter_fwd.next()) |word| { + if (word.offset <= index and index < word.offset + word.len) + return word; + } + unreachable; +} + +/// Returns an iterator over words in `slice`. +pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { + return Iterator.init(wordbreak, slice); +} + +/// Returns a reverse iterator over the words in `slice`. +pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { + return ReverseIterator.init(wordbreak, slice); +} + +/// An iterator, forward, over all words in a provided string. +pub const Iterator = struct { + this: ?CodePoint = null, + that: ?CodePoint = null, + cp_iter: CodepointIterator, + wb: *const WordBreak, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the next word segment, without advancing. + pub fn peek(iter: *Iterator) ?Word { + const cache = .{ iter.this, iter.that, iter.cp_iter }; + defer { + iter.this, iter.that, iter.cp_iter = cache; + } + return iter.next(); + } + + /// Returns a reverse iterator from the point this iterator is paused + /// at. Usually, and always when using the API to create iterators, + /// calling `prev()` will return the word just seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + var cp_it = iter.cp_iter.reverseIterator(); + if (iter.that) |_| + _ = cp_it.prev(); + if (iter.cp_iter.peek()) |_| + _ = cp_it.prev(); + return .{ + .wb = iter.wb, + .before = cp_it.prev(), + .after = iter.that, + .cp_iter = cp_it, + }; + } + + /// Returns the next word segment, if any. + pub fn next(iter: *Iterator) ?Word { + iter.advance(); + + // Done? + if (iter.this == null) return null; + // Last? + if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; + + const word_start = iter.this.?.offset; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + var ri_count: usize = 0; + + scan: while (true) : (iter.advance()) { + const this = iter.this.?; + word_len += this.len; + if (iter.that) |that| { + const this_p = iter.wb.breakProp(this); + const that_p = iter.wb.breakProp(that); + if (!isIgnorable(this_p)) { + last_last_p = last_p; + last_p = this_p; + } + // WB3 CR × LF + if (this_p == .CR and that_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(this_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(that_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(that_p)) { + continue :scan; + } // Now we use last_p instead of this_p for ignorable's sake + if (isAHLetter(last_p)) { + // WB5 AHLetter × AHLetter + if (isAHLetter(that_p)) continue :scan; + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isMidVal(that_p)) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (isAHLetter(next_p)) { + continue :scan; + } + } + } + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { + continue :scan; + } + if (last_p == .Hebrew_Letter) { + // WB7a Hebrew_Letter × Single_Quote + if (that_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (that_p == .Double_Quote) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (next_p == .Hebrew_Letter) { + continue :scan; + } + } + } + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) + continue :scan; + // WB8 Numeric × Numeric + if (last_p == .Numeric and that_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) + continue :scan; + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (last_p == .Numeric and isMidNum(that_p)) { + const next_val = iter.peekPast(); + if (next_val) |next_cp| { + const next_p = iter.wb.breakProp(next_cp); + if (next_p == .Numeric) { + continue :scan; + } + } + } + // WB13 Katakana × Katakana + if (last_p == .Katakana and that_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; + if (maybe_flag) { + ri_count += 1; + if (ri_count % 2 == 1) continue :scan; + } + // WB999 Any ÷ Any + break :scan; + } else { // iter.that == null + break :scan; + } + } + + return Word{ .len = word_len, .offset = word_start }; + } + + pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "Iterator {{ .this = {any}, .that = {any} }}", + .{ iter.this, iter.that }, + ); + } + + fn advance(iter: *Iterator) void { + iter.this = iter.that; + iter.that = iter.cp_iter.next(); + } + + fn peekPast(iter: *Iterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.next(); + } + return null; + } +}; + +/// An iterator, backward, over all words in a provided string. +pub const ReverseIterator = struct { + after: ?CodePoint = null, + before: ?CodePoint = null, + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + flags: usize = 0, + + /// Assumes `str` is valid UTF-8. + pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; + wb_iter.advance(); + return wb_iter; + } + + /// Returns the previous word segment, if any, without advancing. + pub fn peek(iter: *ReverseIterator) ?Word { + const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; + defer { + iter.before, iter.after, iter.cp_iter, iter.flags = cache; + } + return iter.prev(); + } + + /// Return a forward iterator from where this iterator paused. Usually, + /// and always when using the API to create iterators, calling `next()` + /// will return the word just seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + var cp_it = iter.cp_iter.forwardIterator(); + if (iter.before) |_| + _ = cp_it.next(); + return .{ + .wb = iter.wb, + .this = cp_it.next(), + .that = iter.after, + .cp_iter = cp_it, + }; + } + + /// Return the previous word, if any. + pub fn prev(iter: *ReverseIterator) ?Word { + iter.advance(); + + // Done? + if (iter.after == null) return null; + // Last? + if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; + + const word_end = iter.after.?.offset + iter.after.?.len; + var word_len: u32 = 0; + + // State variables. + var last_p: WordBreakProperty = .none; + var last_last_p: WordBreakProperty = .none; + + scan: while (true) : (iter.advance()) { + const after = iter.after.?; + word_len += after.len; + if (iter.before) |before| { + var sneak = sneaky(iter); // 'sneaks' past ignorables + const after_p = iter.wb.breakProp(after); + var before_p = iter.wb.breakProp(before); + if (!isIgnorable(after_p)) { + last_last_p = last_p; + last_p = after_p; + } + // WB3 CR × LF + if (before_p == .CR and after_p == .LF) continue :scan; + // WB3a (Newline | CR | LF) ÷ + if (isNewline(before_p)) break :scan; + // WB3b ÷ (Newline | CR | LF) + if (isNewline(after_p)) break :scan; + // WB3c ZWJ × \p{Extended_Pictographic} + if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { + continue :scan; + } + // WB3d WSegSpace × WSegSpace + if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; + // WB4 X (Extend | Format | ZWJ)* → X + if (isIgnorable(before_p)) { + const maybe_before = sneak.prev(); + if (maybe_before) |valid_before| { + before_p = iter.wb.breakProp(valid_before); + } else if (!isIgnorable(after_p)) { + // We're done + break :scan; + } + } + if (isIgnorable(after_p)) continue :scan; + // WB5 AHLetter × AHLetter + if (isAHLetter(last_p) and isAHLetter(before_p)) { + continue :scan; + } + // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter + if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { + continue :scan; + } + // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter + if (isMidVal(before_p) and isAHLetter(last_p)) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (isAHLetter(prev_p)) { + continue :scan; + } + } + } + // WB7a Hebrew_Letter × Single_Quote + if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; + // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter + if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { + continue :scan; + } + // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter + if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Hebrew_Letter) { + continue :scan; + } + } + } + // WB8 Numeric × Numeric + if (before_p == .Numeric and last_p == .Numeric) continue :scan; + // WB9 AHLetter × Numeric + if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; + // WB10 Numeric × AHLetter + if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; + // WB11 Numeric (MidNum | MidNumLetQ) × Numeric + if (isMidNum(before_p) and last_p == .Numeric) { + const prev_val = sneak.peek(); + if (prev_val) |prev_cp| { + const prev_p = iter.wb.breakProp(prev_cp); + if (prev_p == .Numeric) { + continue :scan; + } + } + } + // WB12 Numeric × (MidNum | MidNumLetQ) Numeric + if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { + continue :scan; + } + // WB13 Katakana × Katakana + if (before_p == .Katakana and last_p == .Katakana) continue :scan; + // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet + if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; + // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) + if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; + // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI + // NOTE: + // So here we simply have to know whether a run of flags is even or odd. + // The whole run. To avoid quadratic behavior (and long flag runs are + // actually a thing in the wild), we have to count them once, store that + // on the iterator, and decrement each time we see two, possibly breaking + // once extra at the beginning. They break up one per flag, once we hit + // zero, that's all the flags. If we see another flag we do it again. + if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { + defer { + if (iter.flags > 0) iter.flags -= 1; + } + if (iter.flags == 0) { + iter.flags = sneak.countFlags(); + } + if (iter.flags % 2 == 0) { + continue :scan; + } + } + // WB999 Any ÷ Any + break :scan; + } + break :scan; + } + return Word{ .len = word_len, .offset = word_end - word_len }; + } + + pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print( + "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", + .{ iter.before, iter.after, iter.flags }, + ); + } + + fn peekPast(iter: *ReverseIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn advance(iter: *ReverseIterator) void { + iter.after = iter.before; + iter.before = iter.cp_iter.prev(); + } +}; + +//| Implementation Details + +/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. +fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { + var idx: u32 = @intCast(index); + // Find the next lead byte: + while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} + if (idx == string.len) return wb.reverseIterator(string); + var iter: ReverseIterator = undefined; + iter.wb = wb; + iter.flags = 0; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc| def" with the cursor as |. + // We need `before` to be `c` and `after` to be ' ', + // and `cp_iter.prev()` to be `b`. + var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; + iter.after = cp_iter.prev(); + iter.before = cp_iter.prev(); + iter.cp_iter = cp_iter; + return iter; +} + +fn sneaky(iter: *const ReverseIterator) SneakIterator { + return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; +} + +const SneakIterator = struct { + cp_iter: ReverseCodepointIterator, + wb: *const WordBreak, + + fn peek(iter: *SneakIterator) ?CodePoint { + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.peek()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + _ = iter.cp_iter.prev(); + } + return null; + } + + fn countFlags(iter: *SneakIterator) usize { + var flags: usize = 0; + const save_cp = iter.cp_iter; + defer iter.cp_iter = save_cp; + while (iter.cp_iter.prev()) |cp| { + const prop = iter.wb.breakProp(cp); + if (isIgnorable(prop)) continue; + if (prop == .Regional_Indicator) { + flags += 1; + } else break; + } + return flags; + } + + fn prev(iter: *SneakIterator) ?CodePoint { + while (iter.cp_iter.prev()) |peeked| { + if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + } + return null; + } +}; + +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + +//| Predicates + +inline fn isNewline(wbp: WordBreakProperty) bool { + return wbp == .CR or wbp == .LF or wbp == .Newline; +} + +inline fn isIgnorable(wbp: WordBreakProperty) bool { + return switch (wbp) { + .Format, .Extend, .ZWJ => true, + else => false, + }; +} + +inline fn isAHLetter(wbp: WordBreakProperty) bool { + return wbp == .ALetter or wbp == .Hebrew_Letter; +} + +inline fn isMidVal(wbp: WordBreakProperty) bool { + return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isMidNum(wbp: WordBreakProperty) bool { + return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; +} + +inline fn isExtensible(wbp: WordBreakProperty) bool { + return switch (wbp) { + .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, + else => false, + }; +} + +test "Word Break Properties" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + try testing.expectEqual(.CR, wb.breakProperty('\r')); + try testing.expectEqual(.LF, wb.breakProperty('\n')); + try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); + try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); +} + +test "ext_pict" { + try testing.expect(ext_pict.isMatch("👇")); + try testing.expect(ext_pict.isMatch("\u{2701}")); +} + +test wordAtIndex { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + const t_string = "first second third"; + const second = wb.wordAtIndex(t_string, 8); + try testing.expectEqualStrings("second", second.bytes(t_string)); + const third = wb.wordAtIndex(t_string, 14); + try testing.expectEqualStrings("third", third.bytes(t_string)); + { + const first = wb.wordAtIndex(t_string, 3); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + { + const first = wb.wordAtIndex(t_string, 0); + try testing.expectEqualStrings("first", first.bytes(t_string)); + } + const last = wb.wordAtIndex(t_string, 14); + try testing.expectEqualStrings("third", last.bytes(t_string)); +} + +const testr = "don't a:ka fin!"; + +test "reversal" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + { + var fwd = wb.iterator(testr); + var this_word: ?Word = fwd.next(); + + while (this_word) |this| : (this_word = fwd.next()) { + var back = fwd.reverseIterator(); + const that_word = back.prev(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } + { + var back = wb.reverseIterator(testr); + var this_word: ?Word = back.prev(); + + while (this_word) |this| : (this_word = back.prev()) { + var fwd = back.forwardIterator(); + const that_word = fwd.next(); + if (that_word) |that| { + try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); + } else { + try testing.expect(false); + } + } + } +} + +fn testAllocations(allocator: Allocator) !void { + const wb = try WordBreak.init(allocator); + wb.deinit(allocator); +} + +test "allocation safety" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); +} + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const Allocator = mem.Allocator; +const assert = std.debug.assert; +const testing = std.testing; + +const code_point = @import("code_point"); +const CodepointIterator = code_point.Iterator; +const ReverseCodepointIterator = code_point.ReverseIterator; +const CodePoint = code_point.CodePoint; + +const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; -- cgit v1.2.3