From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Mon, 12 May 2025 15:22:37 -0400
Subject: Begin conformance test

I'm not sure the details of this strategy can actually be made to work. But, something can.
---
 src/Graphemes.zig     |  22 ++++
 src/WordBreak.zig     |  83 +++++++++++++-------
 src/code_point.zig    |   5 ++
 src/micro_runeset.zig | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/unicode_tests.zig | 102 +++++++++++++++++--------
 5 files changed, 361 insertions(+), 58 deletions(-)
 create mode 100644 src/micro_runeset.zig
(limited to 'src')

diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1ce1ea6..1f67fc6 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
         try std.testing.expectEqual(@as(usize, 2), i);
     }
 }
+
+test "Iterator.peek" {
+    const peek_seq = "aΔ👨🏻‍🌾→";
+    const data = try Graphemes.init(std.testing.allocator);
+    defer data.deinit(std.testing.allocator);
+
+    var iter = data.iterator(peek_seq);
+    const peek_a = iter.peek().?;
+    const next_a = iter.next().?;
+    try std.testing.expectEqual(peek_a, next_a);
+    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
+    const peek_d1 = iter.peek().?;
+    const peek_d2 = iter.peek().?;
+    try std.testing.expectEqual(peek_d1, peek_d2);
+    const next_d = iter.next().?;
+    try std.testing.expectEqual(peek_d2, next_d);
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+}
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 84fd1f7..53db76b 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
     return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
 
+/// Returns an iterator over words in `slice`
+pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
+    return Iterator.init(wordbreak, slice);
+}
+
 const IterState = packed struct {
     mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
     mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
@@ -113,7 +118,7 @@ pub const Iterator = struct {
     pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
         var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
         wb_iter.advance();
-        return wb;
+        return wb_iter;
     }
 
     pub fn next(iter: *Iterator) ?Word {
@@ -132,12 +137,18 @@ pub const Iterator = struct {
         scan: while (true) : (iter.advance()) {
             const this = iter.this.?;
             word_len += this.len;
+            var ignored = false;
             if (iter.that) |that| {
                 const that_p = iter.wb.breakProperty(that.code);
                 const this_p = this_p: {
                     if (!isIgnorable(that_p) and iter.cache != null) {
+                        // TODO: might not need these what with peekPast
+                        ignored = true;
                         defer iter.cache = null;
-                        break :this_p iter.cache.?;
+                        // Fixup some state, apply pre-4 rules
+                        const restore = iter.cache.?;
+                        if (restore == .WSegSpace) break :this_p .none;
+                        break :this_p restore;
                     } else {
                         break :this_p iter.wb.breakProperty(this.code);
                     }
@@ -149,11 +160,22 @@ pub const Iterator = struct {
                 // WB3b ÷ (Newline | CR | LF)
                 if (isNewline(that_p)) break :scan;
                 // WB3c ZWJ × \p{Extended_Pictographic}
-                // The right way to do this one is a RuneSet, TODO: circle back
+                if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
+                    // Invalid after ignoring
+                    if (ignored) break :scan else continue :scan;
+                }
                 // WB3d WSegSpace × WSegSpace
                 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
                 // WB4 X (Extend | Format | ZWJ)* → X
                 if (isIgnorable(that_p)) {
+                    if (that_p == .ZWJ) {
+                        const next_val = iter.peekPast();
+                        if (next_val) |next_cp| {
+                            if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
+                                continue :scan;
+                            }
+                        }
+                    }
                     if (iter.cache == null) {
                         iter.cache = this_p;
                     }
@@ -164,14 +186,14 @@ pub const Iterator = struct {
                     if (isAHLetter(that_p)) continue :scan;
                     // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
                     if (isMidVal(that_p)) {
-                        const next_val = iter.cp_iter.peek();
+                        const next_val = iter.peekPast();
                         if (next_val) |next_cp| {
                             const next_p = iter.wb.breakProperty(next_cp.code);
                             if (isAHLetter(next_p)) {
                                 state.mid_punct = true;
                                 continue :scan;
                             }
-                        } else break :scan;
+                        }
                     }
                 }
                 // AHLetter (MidLetter | MidNumLetQ) × AHLetter
@@ -187,7 +209,7 @@ pub const Iterator = struct {
                     if (that_p == .Single_Quote) continue :scan;
                     // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
                     if (that_p == .Double_Quote) {
-                        const next_val = iter.cp_iter.peek();
+                        const next_val = iter.peekPast();
                         if (next_val) |next_cp| {
                             const next_p = iter.wb.breakProperty(next_cp.code);
                             if (next_p == .Hebrew_Letter) {
@@ -212,8 +234,8 @@ pub const Iterator = struct {
                 // WB10 Numeric × AHLetter
                 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
                 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
-                if (this_p == .Numeric and isMidVal(that_p)) {
-                    const next_val = iter.cp_iter.peek();
+                if (this_p == .Numeric and isMidNum(that_p)) {
+                    const next_val = iter.peekPast();
                     if (next_val) |next_cp| {
                         const next_p = iter.wb.breakProperty(next_cp.code);
                         if (next_p == .Numeric) {
@@ -224,7 +246,7 @@ pub const Iterator = struct {
                 }
                 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
                 if (state.mid_num) {
-                    assert(isMidVal(this_p));
+                    assert(isMidNum(this_p));
                     assert(that_p == .Numeric);
                     state.mid_num = false;
                     continue :scan;
@@ -235,25 +257,18 @@ pub const Iterator = struct {
                 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
                 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
                 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
-                // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI
+                // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
+                if (this_p == .Regional_Indicator) {
+                    if (that_p == .Regional_Indicator) {
+                        if (state.regional == true or this.offset == 0) {
                             state.regional = false;
                             continue :scan;
-                        } else {
-                            break :scan;
                         }
                     } else {
-                        const next_val = iter.cp_iter.peek();
-                        if (next_val) |next_cp| {
-                            const next_p = iter.wb.breakProperty(next_cp.code);
-                            if (next_p == .Regional_Indicator) {
-                                state.regional = true;
-                                continue :scan;
-                            }
-                        } else break :scan;
+                        state.regional = true;
                     }
+                } else if (that_p == .Regional_Indicator) {
+                    state.regional = true;
                 }
                 // WB999 Any ÷ Any
                 break :scan;
@@ -265,9 +280,19 @@ pub const Iterator = struct {
         return Word{ .len = word_len, .offset = word_start };
     }
 
-    fn advance(wb_iter: *Iterator) void {
-        wb_iter.this = wb_iter.that;
-        wb_iter.that = wb_iter.cp_iter.next();
+    fn advance(iter: *Iterator) void {
+        iter.this = iter.that;
+        iter.that = iter.cp_iter.next();
+    }
+
+    fn peekPast(iter: *Iterator) ?CodePoint {
+        const save_cp = iter.cp_iter;
+        defer iter.cp_iter = save_cp;
+        while (iter.cp_iter.peek()) |peeked| {
+            if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked;
+            _ = iter.cp_iter.next();
+        }
+        return null;
     }
 };
 
@@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool {
     return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
 }
 
+inline fn isMidNum(wbp: WordBreakProperty) bool {
+    return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+
 inline fn isExtensible(wbp: WordBreakProperty) bool {
     return switch (wbp) {
         .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
@@ -328,3 +357,5 @@ const testing = std.testing;
 const code_point = @import("code_point");
 const CodepointIterator = code_point.Iterator;
 const CodePoint = code_point.CodePoint;
+
+const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index b3c9aea..79ee5cd 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -10,6 +10,11 @@ pub const CodePoint = struct {
     code: u21,
     len: u3,
     offset: u32,
+
+    /// Return the slice of this codepoint, given the original string.
+    pub fn bytes(cp: CodePoint, str: []const u8) []const u8 {
+        return str[cp.offset..][0..cp.len];
+    }
 };
 
 /// This function is deprecated and will be removed in a later release.
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..34fbcd3
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
+//! Minimal RuneSet implementation
+//!
+//! This is copied from the full RuneSet module, so that `zg` doesn't
+//! depend on it. There's one spot in the WordBreak algorithm which
+//! needs to identify the emoji Extended_Pictographic property, which
+//! is not otherwise used in ZG. The Runeset is 89 words, while the
+//! trie lookup used throughout ZG would be much larger.
+//!
+//! The RuneSet is borrowed from Runicode, which encodes Unicode things
+//! in RuneSet form. This will need updating for each version of Unicode.
+
+pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } };
+
+// Meaningful names for the T1 slots
+const LOW = 0;
+const HI = 1;
+const LEAD = 2;
+const T4_OFF = 3;
+
+/// Minimum Viable Runeset. Must be statically created, strictly boolean matching.
+pub const RuneSet = struct {
+    body: []const u64,
+
+    // Returns whether the slice is a match. This assumes the validity of the
+    // string, which can be ensured by, in particular, deriving it from a CodePoint.
+    pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
+        const set = runeset.body;
+        const a = codeunit(str[0]);
+        switch (a.kind) {
+            .follow => return false,
+            .low => {
+                const mask = toMask(set[LOW]);
+                if (mask.isIn(a))
+                    return true
+                else
+                    return false;
+            },
+            .hi => {
+                const mask = toMask(set[HI]);
+                if (mask.isIn(a))
+                    return true
+                else
+                    return false;
+            },
+            .lead => {
+                const nB = a.nMultiBytes().?;
+                const a_mask = toMask(set[LEAD]);
+                if (!a_mask.isIn(a)) return false;
+                const b = codeunit(str[1]);
+                const b_loc = 4 + a_mask.lowerThan(a).?;
+                const b_mask = toMask(set[b_loc]);
+                if (!b_mask.isIn(b)) return false;
+                if (nB == 2) return true;
+                const t3_off = 4 + @popCount(set[LEAD]);
+                const c = codeunit(str[2]);
+                // Slice is safe because we know the T2 span has at least one word.
+                const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]);
+                const c_loc = t3_off + c_off;
+                const c_mask = toMask(set[c_loc]);
+                if (!c_mask.isIn(c)) return false;
+                if (nB == 3) return true;
+                const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]);
+                const d_loc = set[T4_OFF] + d_off;
+                const d = codeunit(str[3]);
+                const d_mask = toMask(set[d_loc]);
+                if (d_mask.isIn(d)) return true else return false;
+            },
+        }
+    }
+};
+
+/// Kinds of most significant bits in UTF-8
+const RuneKind = enum(u2) {
+    low,
+    hi,
+    follow,
+    lead,
+};
+
+/// Packed `u8` struct representing one codeunit of UTF-8.
+const CodeUnit = packed struct(u8) {
+    body: u6,
+    kind: RuneKind,
+
+    /// Mask to check presence
+    pub inline fn inMask(self: *const CodeUnit) u64 {
+        return @as(u64, 1) << self.body;
+    }
+
+    // TODO consider an nMultiBytesFast, for the cases where we
+    // know that invalid lead bytes are never present (such as in set)
+    // operations, where we may assume that (and will assert that) the
+    // LEAD mask contains no such bytes.
+
+    /// Number of bytes in known multi-byte rune.
+    ///
+    /// Caller guarantees that the CodeUnit is a lead byte
+    /// of a multi-byte rune: `cu.kind == .lead`.
+    ///
+    /// Invalid lead bytes will return null.
+    pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
+        return switch (self.body) {
+            // 0 and 1 are invalid for overlong reasons,
+            // but RuneSet supports overlong encodings
+            0...31 => 2,
+            32...47 => 3,
+            48...55 => 4,
+            // Wasted space 56...61 is due entirely to Microsoft's
+            // lack of vision and insistence on a substandard
+            // and utterly inadequate encoding for Unicode
+            // "64k should be enough for anyone"
+            56...63 => null,
+        };
+    }
+
+    /// Given a valid lead byte, return the number of bytes that should
+    /// make up the code unit sequence. Will return `null` if the lead
+    /// byte is invalid.
+    pub inline fn nBytes(self: *const CodeUnit) ?u8 {
+        switch (self.kind) {
+            .low, .hi => return 1,
+            .lead => return self.nMultiBytes(),
+            .follow => return null,
+        }
+    }
+
+    /// Mask off all bits >= cu.body
+    pub inline fn hiMask(self: *const CodeUnit) u64 {
+        return (@as(u64, 1) << self.body) - 1;
+    }
+
+    /// Mask off all bits <= cu.body
+    pub inline fn lowMask(self: *const CodeUnit) u64 {
+        if (self.body == 63)
+            return 0
+        else
+            return ~((@as(u64, 1) << (self.body + 1)) - 1);
+    }
+
+    /// Cast the `CodeUnit` to its backing `u8`.
+    pub inline fn byte(self: *const CodeUnit) u8 {
+        return @bitCast(self.*);
+    }
+};
+
+/// Cast raw byte to CodeUnit
+inline fn codeunit(b: u8) CodeUnit {
+    return @bitCast(b);
+}
+
+inline fn toMask(w: u64) Mask {
+    return Mask.toMask(w);
+}
+
+/// Bitmask for runesets
+///
+/// We define our own bitset, because the operations we need to
+/// perform only overlap with IntegerBitSet for trivial one-liners,
+/// and furthermore, we need nondestructive versions of the basic
+/// operations, which aren't a part of the IntegerBitSet interface.
+///
+/// Note that Masks do not track which kind of byte they apply to,
+/// since they will be stored as ordinary u64s. User code must
+/// ensure that CodeUnits tested against a Mask are of the appropriate
+/// type, and otherwise valid for the test performed.
+///
+const Mask = struct {
+    m: u64,
+
+    pub fn toMask(w: u64) Mask {
+        return Mask{ .m = w };
+    }
+
+    /// Test if a CodeUnit's low bytes are present in mask
+    pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
+        return self.m | cu.inMask() == self.m;
+    }
+
+    /// Return number of bytes lower than cu.body in mask,
+    /// if cu inhabits the mask. Otherwise return null.
+    pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (self.isIn(cu)) {
+            const m = cu.hiMask();
+            return @popCount(self.m & m);
+        } else {
+            return null;
+        }
+    }
+
+    /// Return number of bytes higher than cu.body in mask,
+    /// if cu inhabits the mask. Otherwise return null.
+    pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
+        if (self.isIn(cu)) {
+            const m = cu.lowMask();
+            return @popCount(self.m & m);
+        } else {
+            return null;
+        }
+    }
+};
+
+/// Sum of @popCount of all words in region.
+inline fn popCountSlice(region: []const u64) usize {
+    var ct: usize = 0;
+    for (region) |w| ct += @popCount(w);
+    return ct;
+}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ee259a3..7ce2b4e 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,31 +1,5 @@
 const dbg_print = false;
 
-comptime {
-    testing.refAllDecls(grapheme);
-}
-
-test "Iterator.peek" {
-    const peek_seq = "aΔ👨🏻‍🌾→";
-    const data = try Graphemes.init(std.testing.allocator);
-    defer data.deinit(std.testing.allocator);
-
-    var iter = data.iterator(peek_seq);
-    const peek_a = iter.peek().?;
-    const next_a = iter.next().?;
-    try std.testing.expectEqual(peek_a, next_a);
-    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
-    const peek_d1 = iter.peek().?;
-    const peek_d2 = iter.peek().?;
-    try std.testing.expectEqual(peek_d1, peek_d2);
-    const next_d = iter.next().?;
-    try std.testing.expectEqual(peek_d2, next_d);
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(null, iter.peek());
-    try std.testing.expectEqual(iter.peek(), iter.next());
-}
-
 test "Unicode normalization tests" {
     var arena = heap.ArenaAllocator.init(testing.allocator);
     defer arena.deinit();
@@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" {
     var buf_reader = std.io.bufferedReader(file.reader());
     var input_stream = buf_reader.reader();
 
-    const data = try Graphemes.init(allocator);
-    defer data.deinit(allocator);
+    const graph = try Graphemes.init(allocator);
+    defer graph.deinit(allocator);
     var buf: [4096]u8 = undefined;
     var line_iter: IterRead = .{ .read = &input_stream };
     while (try line_iter.next(&buf)) |raw| {
-        // Skip comments or empty lines.
-        // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
         // Clean up.
         var line = std.mem.trimLeft(u8, raw, "÷ ");
         if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
             line = line[0..final];
         }
@@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" {
             bytes_index += cp_index;
         }
 
-        var iter = data.iterator(all_bytes.items);
+        var iter = graph.iterator(all_bytes.items);
 
         // Check.
         for (want.items) |want_gc| {
@@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" {
     }
 }
 
+test "Segmentation Word Iterator" {
+    const allocator = std.testing.allocator;
+    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
+    defer file.close();
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var input_stream = buf_reader.reader();
+
+    const wb = try WordBreak.init(allocator);
+    defer wb.deinit(allocator);
+
+    var buf: [4096]u8 = undefined;
+    var line_iter: IterRead = .{ .read = &input_stream };
+
+    while (try line_iter.next(&buf)) |raw| {
+        // Clean up.
+        var line = std.mem.trimLeft(u8, raw, "÷ ");
+        if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
+            line = line[0..final];
+        }
+        // Iterate over fields.
+        var want = std.ArrayList(Grapheme).init(allocator);
+        defer want.deinit();
+
+        var all_bytes = std.ArrayList(u8).init(allocator);
+        defer all_bytes.deinit();
+
+        var words = std.mem.splitSequence(u8, line, " ÷ ");
+        var bytes_index: u32 = 0;
+
+        while (words.next()) |field| {
+            var code_points = std.mem.splitScalar(u8, field, ' ');
+            var cp_buf: [4]u8 = undefined;
+            var cp_index: u32 = 0;
+            var gc_len: u8 = 0;
+
+            while (code_points.next()) |code_point| {
+                if (std.mem.eql(u8, code_point, "×")) continue;
+                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
+                const len = try unicode.utf8Encode(cp, &cp_buf);
+                try all_bytes.appendSlice(cp_buf[0..len]);
+                cp_index += len;
+                gc_len += len;
+            }
+
+            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
+            bytes_index += cp_index;
+        }
+
+        var iter = wb.iterator(all_bytes.items);
+
+        // Check.
+        for (want.items, 1..) |want_word, i| {
+            const got_word = (iter.next()).?;
+            std.testing.expectEqualSlices(
+                u8,
+                want_word.bytes(all_bytes.items),
+                got_word.bytes(all_bytes.items),
+            ) catch |err| {
+                debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
+                return err;
+            };
+        }
+    }
+}
+
 const IterRead = struct {
     read: *Reader,
     line: usize = 0,
@@ -235,8 +272,9 @@ const debug = std.debug;
 const testing = std.testing;
 const unicode = std.unicode;
 
-const grapheme = @import("Graphemes");
 const Grapheme = @import("Graphemes").Grapheme;
 const Graphemes = @import("Graphemes");
 const GraphemeIterator = @import("Graphemes").Iterator;
 const Normalize = @import("Normalize");
+
+const WordBreak = @import("WordBreak");
-- 
cgit v1.2.3
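
Reviewer note (not part of the patch): a minimal usage sketch of the word iterator this change introduces. It only leans on API visible in the diff above — WordBreak.init/deinit, the new iterator() entry point, Word's offset/len fields, and Word.bytes as exercised by the new test — and it assumes the "WordBreak" module is wired into build.zig the same way src/unicode_tests.zig imports it. The test name and sample string are invented for illustration.

const std = @import("std");
const WordBreak = @import("WordBreak");

test "word iterator usage sketch" {
    const wb = try WordBreak.init(std.testing.allocator);
    defer wb.deinit(std.testing.allocator);

    const text = "Hello, word-break world!";
    var iter = wb.iterator(text);
    var count: usize = 0;
    while (iter.next()) |word| {
        // A Word records an offset and length into the original slice, so its
        // bytes can be recovered without copying.
        const bytes = word.bytes(text);
        try std.testing.expectEqualStrings(text[word.offset..][0..word.len], bytes);
        count += 1;
    }
    try std.testing.expect(count > 0);
}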
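
A second reviewer note, also outside the patch: the micro RuneSet's lookup rests on one trick, bitmask membership plus popcount ranking to index the next tier of words. The standalone toy test below (values invented, not taken from the Extended_Pictographic data) mirrors what Mask.isIn and Mask.lowerThan compute, as a sketch of why the 89-word table is enough for isMatch.

const std = @import("std");

test "bitmask membership and rank sketch" {
    // Toy mask with bits 3, 17, and 42 set, standing in for one RuneSet word.
    const mask: u64 = (1 << 3) | (1 << 17) | (1 << 42);

    // Membership, as in Mask.isIn: OR-ing the bit in changes nothing iff it is set.
    const bit: u64 = 1 << 17;
    try std.testing.expect((mask | bit) == mask);

    // Rank, as in Mask.lowerThan: the popcount of the strictly lower bits gives
    // the element's index, which RuneSet uses to locate the next tier's word.
    try std.testing.expect(@popCount(mask & (bit - 1)) == 1);
}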