summaryrefslogtreecommitdiff
path: root/src/micro_runeset.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-07-08 12:15:32 -0400
committerGravatar Sam Atman2025-07-08 12:15:32 -0400
commit9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree2607c185fd8053b84d60041fadc35c05a0225d34 /src/micro_runeset.zig
parentMerge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parentAdd Words.zig example to README (diff)
downloadzg-master.tar.gz
zg-master.tar.xz
zg-master.zip
Merge branch 'develop-next'HEADv0.14.1master
Diffstat (limited to 'src/micro_runeset.zig')
-rw-r--r--src/micro_runeset.zig207
1 files changed, 207 insertions, 0 deletions
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..80ce4bf
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
1//! Minimal RuneSet implementation
2//!
3//! This is copied from the full RuneSet module, so that `zg` doesn't
4//! depend on it. There's one spot in the WordBreak algorithm which
5//! needs to identify the emoji Extended_Pictographic property, which
6//! is not otherwise used in ZG. The Runeset is 89 words, while the
7//! trie lookup used throughout ZG would be much larger.
8//!
9//! The RuneSet is borrowed from Runicode, which encodes Unicode things
10//! in RuneSet form. This will need updating for each version of Unicode.
11
12pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } };
13
14// Meaningful names for the T1 slots
15const LOW = 0;
16const HI = 1;
17const LEAD = 2;
18const T4_OFF = 3;
19
20/// Minimum Viable Runeset. Must be statically created, strictly boolean matching.
21pub const RuneSet = struct {
22 body: []const u64,
23
24 // Returns whether the slice is a match. This assumes the validity of the
25 // string, which can be ensured by, in particular, deriving it from a CodePoint.
26 pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
27 const set = runeset.body;
28 const a = codeunit(str[0]);
29 switch (a.kind) {
30 .follow => unreachable,
31 .low => {
32 const mask = toMask(set[LOW]);
33 if (mask.isIn(a))
34 return true
35 else
36 return false;
37 },
38 .hi => {
39 const mask = toMask(set[HI]);
40 if (mask.isIn(a))
41 return true
42 else
43 return false;
44 },
45 .lead => {
46 const nB = a.nMultiBytes().?;
47 const a_mask = toMask(set[LEAD]);
48 if (!a_mask.isIn(a)) return false;
49 const b = codeunit(str[1]);
50 const b_loc = 4 + a_mask.lowerThan(a).?;
51 const b_mask = toMask(set[b_loc]);
52 if (!b_mask.isIn(b)) return false;
53 if (nB == 2) return true;
54 const t3_off = 4 + @popCount(set[LEAD]);
55 const c = codeunit(str[2]);
56 // Slice is safe because we know the T2 span has at least one word.
57 const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]);
58 const c_loc = t3_off + c_off;
59 const c_mask = toMask(set[c_loc]);
60 if (!c_mask.isIn(c)) return false;
61 if (nB == 3) return true;
62 const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]);
63 const d_loc = set[T4_OFF] + d_off;
64 const d = codeunit(str[3]);
65 const d_mask = toMask(set[d_loc]);
66 if (d_mask.isIn(d)) return true else return false;
67 },
68 }
69 }
70};
71
72/// Kinds of most significant bits in UTF-8
73const RuneKind = enum(u2) {
74 low,
75 hi,
76 follow,
77 lead,
78};
79
80/// Packed `u8` struct representing one codeunit of UTF-8.
81const CodeUnit = packed struct(u8) {
82 body: u6,
83 kind: RuneKind,
84
85 /// Mask to check presence
86 pub inline fn inMask(self: *const CodeUnit) u64 {
87 return @as(u64, 1) << self.body;
88 }
89
90 // TODO consider an nMultiBytesFast, for the cases where we
91 // know that invalid lead bytes are never present (such as in set)
92 // operations, where we may assume that (and will assert that) the
93 // LEAD mask contains no such bytes.
94
95 /// Number of bytes in known multi-byte rune.
96 ///
97 /// Caller guarantees that the CodeUnit is a lead byte
98 /// of a multi-byte rune: `cu.kind == .lead`.
99 ///
100 /// Invalid lead bytes will return null.
101 pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
102 return switch (self.body) {
103 // 0 and 1 are invalid for overlong reasons,
104 // but RuneSet supports overlong encodings
105 0...31 => 2,
106 32...47 => 3,
107 48...55 => 4,
108 // Wasted space 56...61 is due entirely to Microsoft's
109 // lack of vision and insistence on a substandard
110 // and utterly inadequate encoding for Unicode
111 // "64k should be enough for anyone" <spits>
112 56...63 => null,
113 };
114 }
115
116 /// Given a valid lead byte, return the number of bytes that should
117 /// make up the code unit sequence. Will return `null` if the lead
118 /// byte is invalid.
119 pub inline fn nBytes(self: *const CodeUnit) ?u8 {
120 switch (self.kind) {
121 .low, .hi => return 1,
122 .lead => return self.nMultiBytes(),
123 .follow => return null,
124 }
125 }
126
127 /// Mask off all bits >= cu.body
128 pub inline fn hiMask(self: *const CodeUnit) u64 {
129 return (@as(u64, 1) << self.body) - 1;
130 }
131
132 /// Mask off all bits <= cu.body
133 pub inline fn lowMask(self: *const CodeUnit) u64 {
134 if (self.body == 63)
135 return 0
136 else
137 return ~((@as(u64, 1) << (self.body + 1)) - 1);
138 }
139
140 /// Cast the `CodeUnit` to its backing `u8`.
141 pub inline fn byte(self: *const CodeUnit) u8 {
142 return @bitCast(self.*);
143 }
144};
145
146/// Cast raw byte to CodeUnit
147inline fn codeunit(b: u8) CodeUnit {
148 return @bitCast(b);
149}
150
151inline fn toMask(w: u64) Mask {
152 return Mask.toMask(w);
153}
154
155/// Bitmask for runesets
156///
157/// We define our own bitset, because the operations we need to
158/// perform only overlap with IntegerBitSet for trivial one-liners,
159/// and furthermore, we need nondestructive versions of the basic
160/// operations, which aren't a part of the IntegerBitSet interface.
161///
162/// Note that Masks do not track which kind of byte they apply to,
163/// since they will be stored as ordinary u64s. User code must
164/// ensure that CodeUnits tested against a Mask are of the appropriate
165/// type, and otherwise valid for the test performed.
166///
167const Mask = struct {
168 m: u64,
169
170 pub fn toMask(w: u64) Mask {
171 return Mask{ .m = w };
172 }
173
174 /// Test if a CodeUnit's low bytes are present in mask
175 pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
176 return self.m | cu.inMask() == self.m;
177 }
178
179 /// Return number of bytes lower than cu.body in mask,
180 /// if cu inhabits the mask. Otherwise return null.
181 pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
182 if (self.isIn(cu)) {
183 const m = cu.hiMask();
184 return @popCount(self.m & m);
185 } else {
186 return null;
187 }
188 }
189
190 /// Return number of bytes higher than cu.body in mask,
191 /// if cu inhabits the mask. Otherwise return null.
192 pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
193 if (self.isIn(cu)) {
194 const m = cu.lowMask();
195 return @popCount(self.m & m);
196 } else {
197 return null;
198 }
199 }
200};
201
202/// Sum of @popCount of all words in region.
203inline fn popCountSlice(region: []const u64) usize {
204 var ct: usize = 0;
205 for (region) |w| ct += @popCount(w);
206 return ct;
207}