//! Word Breaking Algorithm. const WordBreakProperty = enum(u5) { none, Double_Quote, Single_Quote, Hebrew_Letter, CR, LF, Newline, Extend, Regional_Indicator, Format, Katakana, ALetter, MidLetter, MidNum, MidNumLet, Numeric, ExtendNumLet, ZWJ, WSegSpace, }; s1: []u16 = undefined, s2: []u5 = undefined, const WordBreak = @This(); pub fn init(allocator: Allocator) Allocator.Error!WordBreak { var wb: WordBreak = undefined; try wb.setup(allocator); return wb; } pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { wb.setupImpl(allocator) catch |err| { switch (err) { error.OutOfMemory => |e| return e, else => unreachable, } }; } inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("wbp"); var in_fbs = std.io.fixedBufferStream(in_bytes); var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); const stage_1_len: u16 = try reader.readInt(u16, endian); wb.s1 = try allocator.alloc(u16, stage_1_len); errdefer allocator.free(wb.s1); for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); wb.s2 = try allocator.alloc(u5, stage_2_len); errdefer allocator.free(wb.s2); for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); var count_0: usize = 0; for (wb.s2) |nyb| { if (nyb == 0) count_0 += 1; } } pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { allocator.free(wordbreak.s1); allocator.free(wordbreak.s2); } /// Represents a Unicode word span, as an offset into the source string /// and the length of the word. pub const Word = struct { offset: u32, len: u32, /// Returns a slice of the word given the source string. pub fn bytes(self: Word, src: []const u8) []const u8 { return src[self.offset..][0..self.len]; } }; /// Returns the word break property type for `cp`. pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); } /// Returns an iterator over words in `slice` pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } const IterState = packed struct { mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter regional: bool, // [^RI] (RI RI)* RI × RI pub const initial: IterState = .{ .mid_punct = false, .mid_num = false, .quote_heb = false, .regional = false, }; }; pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, cache: ?WordBreakProperty = null, cp_iter: CodepointIterator, wb: *const WordBreak, /// Assumes `str` is valid UTF-8. pub fn init(wb: *const WordBreak, str: []const u8) Iterator { var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; wb_iter.advance(); return wb_iter; } pub fn next(iter: *Iterator) ?Word { iter.advance(); // Done? if (iter.this == null) return null; // Last? if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; const word_start = iter.this.?.offset; var word_len: u32 = 0; var state: IterState = .initial; scan: while (true) : (iter.advance()) { const this = iter.this.?; word_len += this.len; var ignored = false; if (iter.that) |that| { const that_p = iter.wb.breakProperty(that.code); const this_p = this_p: { if (!isIgnorable(that_p) and iter.cache != null) { // TODO: might not need these what with peekPast ignored = true; defer iter.cache = null; // Fixup some state, apply pre-4 rules const restore = iter.cache.?; if (restore == .WSegSpace) break :this_p .none; break :this_p restore; } else { break :this_p iter.wb.breakProperty(this.code); } }; // WB3 CR × LF if (this_p == .CR and that_p == .LF) continue :scan; // WB3a (Newline | CR | LF) ÷ if (isNewline(this_p)) break :scan; // WB3b ÷ (Newline | CR | LF) if (isNewline(that_p)) break :scan; // WB3c ZWJ × \p{Extended_Pictographic} if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { // Invalid after ignoring if (ignored) break :scan else continue :scan; } // WB3d WSegSpace × WSegSpace if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X if (isIgnorable(that_p)) { if (that_p == .ZWJ) { const next_val = iter.peekPast(); if (next_val) |next_cp| { if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { continue :scan; } } } if (iter.cache == null) { iter.cache = this_p; } continue :scan; } if (isAHLetter(this_p)) { // WB5 AHLetter × AHLetter if (isAHLetter(that_p)) continue :scan; // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter if (isMidVal(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (isAHLetter(next_p)) { state.mid_punct = true; continue :scan; } } } } // AHLetter (MidLetter | MidNumLetQ) × AHLetter if (state.mid_punct) { // Should always be true: assert(isMidVal(this_p)); assert(isAHLetter(that_p)); state.mid_punct = false; continue :scan; } if (this_p == .Hebrew_Letter) { // WB7a Hebrew_Letter × Single_Quote if (that_p == .Single_Quote) continue :scan; // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter if (that_p == .Double_Quote) { const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Hebrew_Letter) { state.quote_heb = true; continue :scan; } } else break :scan; } } // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter if (state.quote_heb) { // Should always be true: assert(this_p == .Double_Quote); assert(that_p == .Hebrew_Letter); state.quote_heb = false; continue :scan; } // WB8 Numeric × Numeric if (this_p == .Numeric and that_p == .Numeric) continue :scan; // WB9 AHLetter × Numeric if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; // WB10 Numeric × AHLetter if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; // WB12 Numeric × (MidNum | MidNumLetQ) Numeric if (this_p == .Numeric and isMidNum(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { const next_p = iter.wb.breakProperty(next_cp.code); if (next_p == .Numeric) { state.mid_num = true; continue :scan; } } else break :scan; } // WB11 Numeric (MidNum | MidNumLetQ) × Numeric if (state.mid_num) { assert(isMidNum(this_p)); assert(that_p == .Numeric); state.mid_num = false; continue :scan; } // WB13 Katakana × Katakana if (this_p == .Katakana and that_p == .Katakana) continue :scan; // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI if (this_p == .Regional_Indicator) { if (that_p == .Regional_Indicator) { if (state.regional == true or this.offset == 0) { state.regional = false; continue :scan; } } else { state.regional = true; } } else if (that_p == .Regional_Indicator) { state.regional = true; } // WB999 Any ÷ Any break :scan; } else { // iter.that == null break :scan; } } return Word{ .len = word_len, .offset = word_start }; } fn advance(iter: *Iterator) void { iter.this = iter.that; iter.that = iter.cp_iter.next(); } fn peekPast(iter: *Iterator) ?CodePoint { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.peek()) |peeked| { if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; _ = iter.cp_iter.next(); } return null; } }; //| Predicates inline fn isNewline(wbp: WordBreakProperty) bool { return wbp == .CR or wbp == .LF or wbp == .Newline; } inline fn isIgnorable(wbp: WordBreakProperty) bool { return switch (wbp) { .Format, .Extend, .ZWJ => true, else => false, }; } inline fn isAHLetter(wbp: WordBreakProperty) bool { return wbp == .ALetter or wbp == .Hebrew_Letter; } inline fn isMidVal(wbp: WordBreakProperty) bool { return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; } inline fn isMidNum(wbp: WordBreakProperty) bool { return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; } inline fn isExtensible(wbp: WordBreakProperty) bool { return switch (wbp) { .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, else => false, }; } test "Word Break Properties" { const wb = try WordBreak.init(testing.allocator); defer wb.deinit(testing.allocator); try testing.expectEqual(.CR, wb.breakProperty('\r')); try testing.expectEqual(.LF, wb.breakProperty('\n')); try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); } fn testAllocations(allocator: Allocator) !void { const wb = try WordBreak.init(allocator); wb.deinit(allocator); } test "allocation safety" { try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); } const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; const mem = std.mem; const Allocator = mem.Allocator; const assert = std.debug.assert; const testing = std.testing; const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; const CodePoint = code_point.CodePoint; const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;