From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 15:22:37 -0400 Subject: Begin conformance test I'm not sure the details of this strategy can actually be made to work. But, something can. --- src/micro_runeset.zig | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 src/micro_runeset.zig (limited to 'src/micro_runeset.zig') diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig new file mode 100644 index 0000000..34fbcd3 --- /dev/null +++ b/src/micro_runeset.zig @@ -0,0 +1,207 @@ +//! Minimal RuneSet implementation +//! +//! This is copied from the full RuneSet module, so that `zg` doesn't +//! depend on it. There's one spot in the WordBreak algorithm which +//! needs to identify the emoji Extended_Pictographic property, which +//! is not otherwise used in ZG. The Runeset is 89 words, while the +//! trie lookup used throughout ZG would be much larger. +//! +//! The RuneSet is borrowed from Runicode, which encodes Unicode things +//! in RuneSet form. This will need updating for each version of Unicode. + +pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; + +// Meaningful names for the T1 slots +const LOW = 0; +const HI = 1; +const LEAD = 2; +const T4_OFF = 3; + +/// Minimum Viable Runeset. Must be statically created, strictly boolean matching. +pub const RuneSet = struct { + body: []const u64, + + // Returns whether the slice is a match. This assumes the validity of the + // string, which can be ensured by, in particular, deriving it from a CodePoint. + pub fn isMatch(runeset: RuneSet, str: []const u8) bool { + const set = runeset.body; + const a = codeunit(str[0]); + switch (a.kind) { + .follow => return false, + .low => { + const mask = toMask(set[LOW]); + if (mask.isIn(a)) + return true + else + return false; + }, + .hi => { + const mask = toMask(set[HI]); + if (mask.isIn(a)) + return true + else + return false; + }, + .lead => { + const nB = a.nMultiBytes().?; + const a_mask = toMask(set[LEAD]); + if (!a_mask.isIn(a)) return false; + const b = codeunit(str[1]); + const b_loc = 4 + a_mask.lowerThan(a).?; + const b_mask = toMask(set[b_loc]); + if (!b_mask.isIn(b)) return false; + if (nB == 2) return true; + const t3_off = 4 + @popCount(set[LEAD]); + const c = codeunit(str[2]); + // Slice is safe because we know the T2 span has at least one word. + const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]); + const c_loc = t3_off + c_off; + const c_mask = toMask(set[c_loc]); + if (!c_mask.isIn(c)) return false; + if (nB == 3) return true; + const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]); + const d_loc = set[T4_OFF] + d_off; + const d = codeunit(str[3]); + const d_mask = toMask(set[d_loc]); + if (d_mask.isIn(d)) return true else return false; + }, + } + } +}; + +/// Kinds of most significant bits in UTF-8 +const RuneKind = enum(u2) { + low, + hi, + follow, + lead, +}; + +/// Packed `u8` struct representing one codeunit of UTF-8. +const CodeUnit = packed struct(u8) { + body: u6, + kind: RuneKind, + + /// Mask to check presence + pub inline fn inMask(self: *const CodeUnit) u64 { + return @as(u64, 1) << self.body; + } + + // TODO consider an nMultiBytesFast, for the cases where we + // know that invalid lead bytes are never present (such as in set) + // operations, where we may assume that (and will assert that) the + // LEAD mask contains no such bytes. + + /// Number of bytes in known multi-byte rune. + /// + /// Caller guarantees that the CodeUnit is a lead byte + /// of a multi-byte rune: `cu.kind == .lead`. + /// + /// Invalid lead bytes will return null. + pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 { + return switch (self.body) { + // 0 and 1 are invalid for overlong reasons, + // but RuneSet supports overlong encodings + 0...31 => 2, + 32...47 => 3, + 48...55 => 4, + // Wasted space 56...61 is due entirely to Microsoft's + // lack of vision and insistence on a substandard + // and utterly inadequate encoding for Unicode + // "64k should be enough for anyone" + 56...63 => null, + }; + } + + /// Given a valid lead byte, return the number of bytes that should + /// make up the code unit sequence. Will return `null` if the lead + /// byte is invalid. + pub inline fn nBytes(self: *const CodeUnit) ?u8 { + switch (self.kind) { + .low, .hi => return 1, + .lead => return self.nMultiBytes(), + .follow => return null, + } + } + + /// Mask off all bits >= cu.body + pub inline fn hiMask(self: *const CodeUnit) u64 { + return (@as(u64, 1) << self.body) - 1; + } + + /// Mask off all bits <= cu.body + pub inline fn lowMask(self: *const CodeUnit) u64 { + if (self.body == 63) + return 0 + else + return ~((@as(u64, 1) << (self.body + 1)) - 1); + } + + /// Cast the `CodeUnit` to its backing `u8`. + pub inline fn byte(self: *const CodeUnit) u8 { + return @bitCast(self.*); + } +}; + +/// Cast raw byte to CodeUnit +inline fn codeunit(b: u8) CodeUnit { + return @bitCast(b); +} + +inline fn toMask(w: u64) Mask { + return Mask.toMask(w); +} + +/// Bitmask for runesets +/// +/// We define our own bitset, because the operations we need to +/// perform only overlap with IntegerBitSet for trivial one-liners, +/// and furthermore, we need nondestructive versions of the basic +/// operations, which aren't a part of the IntegerBitSet interface. +/// +/// Note that Masks do not track which kind of byte they apply to, +/// since they will be stored as ordinary u64s. User code must +/// ensure that CodeUnits tested against a Mask are of the appropriate +/// type, and otherwise valid for the test performed. +/// +const Mask = struct { + m: u64, + + pub fn toMask(w: u64) Mask { + return Mask{ .m = w }; + } + + /// Test if a CodeUnit's low bytes are present in mask + pub inline fn isIn(self: Mask, cu: CodeUnit) bool { + return self.m | cu.inMask() == self.m; + } + + /// Return number of bytes lower than cu.body in mask, + /// if cu inhabits the mask. Otherwise return null. + pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 { + if (self.isIn(cu)) { + const m = cu.hiMask(); + return @popCount(self.m & m); + } else { + return null; + } + } + + /// Return number of bytes higher than cu.body in mask, + /// if cu inhabits the mask. Otherwise return null. + pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 { + if (self.isIn(cu)) { + const m = cu.lowMask(); + return @popCount(self.m & m); + } else { + return null; + } + } +}; + +/// Sum of @popCount of all words in region. +inline fn popCountSlice(region: []const u64) usize { + var ct: usize = 0; + for (region) |w| ct += @popCount(w); + return ct; +} -- cgit v1.2.3