diff options
| author | 2025-07-08 12:15:32 -0400 | |
|---|---|---|
| committer | 2025-07-08 12:15:32 -0400 | |
| commit | 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch) | |
| tree | 2607c185fd8053b84d60041fadc35c05a0225d34 /src/micro_runeset.zig | |
| parent | Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff) | |
| parent | Add Words.zig example to README (diff) | |
| download | zg-master.tar.gz zg-master.tar.xz zg-master.zip | |
Diffstat (limited to 'src/micro_runeset.zig')
| -rw-r--r-- | src/micro_runeset.zig | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig new file mode 100644 index 0000000..80ce4bf --- /dev/null +++ b/src/micro_runeset.zig | |||
| @@ -0,0 +1,207 @@ | |||
| 1 | //! Minimal RuneSet implementation | ||
| 2 | //! | ||
| 3 | //! This is copied from the full RuneSet module, so that `zg` doesn't | ||
| 4 | //! depend on it. There's one spot in the WordBreak algorithm which | ||
| 5 | //! needs to identify the emoji Extended_Pictographic property, which | ||
| 6 | //! is not otherwise used in ZG. The Runeset is 89 words, while the | ||
| 7 | //! trie lookup used throughout ZG would be much larger. | ||
| 8 | //! | ||
| 9 | //! The RuneSet is borrowed from Runicode, which encodes Unicode things | ||
| 10 | //! in RuneSet form. This will need updating for each version of Unicode. | ||
| 11 | |||
| 12 | pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } }; | ||
| 13 | |||
| 14 | // Meaningful names for the T1 slots | ||
| 15 | const LOW = 0; | ||
| 16 | const HI = 1; | ||
| 17 | const LEAD = 2; | ||
| 18 | const T4_OFF = 3; | ||
| 19 | |||
| 20 | /// Minimum Viable Runeset. Must be statically created, strictly boolean matching. | ||
| 21 | pub const RuneSet = struct { | ||
| 22 | body: []const u64, | ||
| 23 | |||
| 24 | // Returns whether the slice is a match. This assumes the validity of the | ||
| 25 | // string, which can be ensured by, in particular, deriving it from a CodePoint. | ||
| 26 | pub fn isMatch(runeset: RuneSet, str: []const u8) bool { | ||
| 27 | const set = runeset.body; | ||
| 28 | const a = codeunit(str[0]); | ||
| 29 | switch (a.kind) { | ||
| 30 | .follow => unreachable, | ||
| 31 | .low => { | ||
| 32 | const mask = toMask(set[LOW]); | ||
| 33 | if (mask.isIn(a)) | ||
| 34 | return true | ||
| 35 | else | ||
| 36 | return false; | ||
| 37 | }, | ||
| 38 | .hi => { | ||
| 39 | const mask = toMask(set[HI]); | ||
| 40 | if (mask.isIn(a)) | ||
| 41 | return true | ||
| 42 | else | ||
| 43 | return false; | ||
| 44 | }, | ||
| 45 | .lead => { | ||
| 46 | const nB = a.nMultiBytes().?; | ||
| 47 | const a_mask = toMask(set[LEAD]); | ||
| 48 | if (!a_mask.isIn(a)) return false; | ||
| 49 | const b = codeunit(str[1]); | ||
| 50 | const b_loc = 4 + a_mask.lowerThan(a).?; | ||
| 51 | const b_mask = toMask(set[b_loc]); | ||
| 52 | if (!b_mask.isIn(b)) return false; | ||
| 53 | if (nB == 2) return true; | ||
| 54 | const t3_off = 4 + @popCount(set[LEAD]); | ||
| 55 | const c = codeunit(str[2]); | ||
| 56 | // Slice is safe because we know the T2 span has at least one word. | ||
| 57 | const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]); | ||
| 58 | const c_loc = t3_off + c_off; | ||
| 59 | const c_mask = toMask(set[c_loc]); | ||
| 60 | if (!c_mask.isIn(c)) return false; | ||
| 61 | if (nB == 3) return true; | ||
| 62 | const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]); | ||
| 63 | const d_loc = set[T4_OFF] + d_off; | ||
| 64 | const d = codeunit(str[3]); | ||
| 65 | const d_mask = toMask(set[d_loc]); | ||
| 66 | if (d_mask.isIn(d)) return true else return false; | ||
| 67 | }, | ||
| 68 | } | ||
| 69 | } | ||
| 70 | }; | ||
| 71 | |||
| 72 | /// Kinds of most significant bits in UTF-8 | ||
| 73 | const RuneKind = enum(u2) { | ||
| 74 | low, | ||
| 75 | hi, | ||
| 76 | follow, | ||
| 77 | lead, | ||
| 78 | }; | ||
| 79 | |||
| 80 | /// Packed `u8` struct representing one codeunit of UTF-8. | ||
| 81 | const CodeUnit = packed struct(u8) { | ||
| 82 | body: u6, | ||
| 83 | kind: RuneKind, | ||
| 84 | |||
| 85 | /// Mask to check presence | ||
| 86 | pub inline fn inMask(self: *const CodeUnit) u64 { | ||
| 87 | return @as(u64, 1) << self.body; | ||
| 88 | } | ||
| 89 | |||
| 90 | // TODO consider an nMultiBytesFast, for the cases where we | ||
| 91 | // know that invalid lead bytes are never present (such as in set) | ||
| 92 | // operations, where we may assume that (and will assert that) the | ||
| 93 | // LEAD mask contains no such bytes. | ||
| 94 | |||
| 95 | /// Number of bytes in known multi-byte rune. | ||
| 96 | /// | ||
| 97 | /// Caller guarantees that the CodeUnit is a lead byte | ||
| 98 | /// of a multi-byte rune: `cu.kind == .lead`. | ||
| 99 | /// | ||
| 100 | /// Invalid lead bytes will return null. | ||
| 101 | pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 { | ||
| 102 | return switch (self.body) { | ||
| 103 | // 0 and 1 are invalid for overlong reasons, | ||
| 104 | // but RuneSet supports overlong encodings | ||
| 105 | 0...31 => 2, | ||
| 106 | 32...47 => 3, | ||
| 107 | 48...55 => 4, | ||
| 108 | // Wasted space 56...61 is due entirely to Microsoft's | ||
| 109 | // lack of vision and insistence on a substandard | ||
| 110 | // and utterly inadequate encoding for Unicode | ||
| 111 | // "64k should be enough for anyone" <spits> | ||
| 112 | 56...63 => null, | ||
| 113 | }; | ||
| 114 | } | ||
| 115 | |||
| 116 | /// Given a valid lead byte, return the number of bytes that should | ||
| 117 | /// make up the code unit sequence. Will return `null` if the lead | ||
| 118 | /// byte is invalid. | ||
| 119 | pub inline fn nBytes(self: *const CodeUnit) ?u8 { | ||
| 120 | switch (self.kind) { | ||
| 121 | .low, .hi => return 1, | ||
| 122 | .lead => return self.nMultiBytes(), | ||
| 123 | .follow => return null, | ||
| 124 | } | ||
| 125 | } | ||
| 126 | |||
| 127 | /// Mask off all bits >= cu.body | ||
| 128 | pub inline fn hiMask(self: *const CodeUnit) u64 { | ||
| 129 | return (@as(u64, 1) << self.body) - 1; | ||
| 130 | } | ||
| 131 | |||
| 132 | /// Mask off all bits <= cu.body | ||
| 133 | pub inline fn lowMask(self: *const CodeUnit) u64 { | ||
| 134 | if (self.body == 63) | ||
| 135 | return 0 | ||
| 136 | else | ||
| 137 | return ~((@as(u64, 1) << (self.body + 1)) - 1); | ||
| 138 | } | ||
| 139 | |||
| 140 | /// Cast the `CodeUnit` to its backing `u8`. | ||
| 141 | pub inline fn byte(self: *const CodeUnit) u8 { | ||
| 142 | return @bitCast(self.*); | ||
| 143 | } | ||
| 144 | }; | ||
| 145 | |||
| 146 | /// Cast raw byte to CodeUnit | ||
| 147 | inline fn codeunit(b: u8) CodeUnit { | ||
| 148 | return @bitCast(b); | ||
| 149 | } | ||
| 150 | |||
| 151 | inline fn toMask(w: u64) Mask { | ||
| 152 | return Mask.toMask(w); | ||
| 153 | } | ||
| 154 | |||
| 155 | /// Bitmask for runesets | ||
| 156 | /// | ||
| 157 | /// We define our own bitset, because the operations we need to | ||
| 158 | /// perform only overlap with IntegerBitSet for trivial one-liners, | ||
| 159 | /// and furthermore, we need nondestructive versions of the basic | ||
| 160 | /// operations, which aren't a part of the IntegerBitSet interface. | ||
| 161 | /// | ||
| 162 | /// Note that Masks do not track which kind of byte they apply to, | ||
| 163 | /// since they will be stored as ordinary u64s. User code must | ||
| 164 | /// ensure that CodeUnits tested against a Mask are of the appropriate | ||
| 165 | /// type, and otherwise valid for the test performed. | ||
| 166 | /// | ||
| 167 | const Mask = struct { | ||
| 168 | m: u64, | ||
| 169 | |||
| 170 | pub fn toMask(w: u64) Mask { | ||
| 171 | return Mask{ .m = w }; | ||
| 172 | } | ||
| 173 | |||
| 174 | /// Test if a CodeUnit's low bytes are present in mask | ||
| 175 | pub inline fn isIn(self: Mask, cu: CodeUnit) bool { | ||
| 176 | return self.m | cu.inMask() == self.m; | ||
| 177 | } | ||
| 178 | |||
| 179 | /// Return number of bytes lower than cu.body in mask, | ||
| 180 | /// if cu inhabits the mask. Otherwise return null. | ||
| 181 | pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 { | ||
| 182 | if (self.isIn(cu)) { | ||
| 183 | const m = cu.hiMask(); | ||
| 184 | return @popCount(self.m & m); | ||
| 185 | } else { | ||
| 186 | return null; | ||
| 187 | } | ||
| 188 | } | ||
| 189 | |||
| 190 | /// Return number of bytes higher than cu.body in mask, | ||
| 191 | /// if cu inhabits the mask. Otherwise return null. | ||
| 192 | pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 { | ||
| 193 | if (self.isIn(cu)) { | ||
| 194 | const m = cu.lowMask(); | ||
| 195 | return @popCount(self.m & m); | ||
| 196 | } else { | ||
| 197 | return null; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | }; | ||
| 201 | |||
| 202 | /// Sum of @popCount of all words in region. | ||
| 203 | inline fn popCountSlice(region: []const u64) usize { | ||
| 204 | var ct: usize = 0; | ||
| 205 | for (region) |w| ct += @popCount(w); | ||
| 206 | return ct; | ||
| 207 | } | ||