summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-12 15:22:37 -0400
committerGravatar Sam Atman2025-05-15 15:31:16 -0400
commitcf8d8fe5d640511f6c4134fdaa36e930232ca7da (patch)
tree410a3c5195ea0780b637f740ebcb6e80e63db09c
parentImplement Word iterator (diff)
downloadzg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.gz
zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.xz
zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.zip
Begin conformance test
I'm not sure the details of this strategy can actually be made to work. But, something can.
-rw-r--r--build.zig1
-rw-r--r--src/Graphemes.zig22
-rw-r--r--src/WordBreak.zig83
-rw-r--r--src/code_point.zig5
-rw-r--r--src/micro_runeset.zig207
-rw-r--r--src/unicode_tests.zig102
6 files changed, 362 insertions, 58 deletions
diff --git a/build.zig b/build.zig
index f89e90c..387b4c3 100644
--- a/build.zig
+++ b/build.zig
@@ -471,6 +471,7 @@ pub fn build(b: *std.Build) void {
471 }); 471 });
472 unicode_tests.root_module.addImport("Graphemes", graphemes); 472 unicode_tests.root_module.addImport("Graphemes", graphemes);
473 unicode_tests.root_module.addImport("Normalize", norm); 473 unicode_tests.root_module.addImport("Normalize", norm);
474 unicode_tests.root_module.addImport("WordBreak", word_break);
474 475
475 const run_unicode_tests = b.addRunArtifact(unicode_tests); 476 const run_unicode_tests = b.addRunArtifact(unicode_tests);
476 477
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1ce1ea6..1f67fc6 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
364 try std.testing.expectEqual(@as(usize, 2), i); 364 try std.testing.expectEqual(@as(usize, 2), i);
365 } 365 }
366} 366}
367
368test "Iterator.peek" {
369 const peek_seq = "aΔ👨🏻‍🌾→";
370 const data = try Graphemes.init(std.testing.allocator);
371 defer data.deinit(std.testing.allocator);
372
373 var iter = data.iterator(peek_seq);
374 const peek_a = iter.peek().?;
375 const next_a = iter.next().?;
376 try std.testing.expectEqual(peek_a, next_a);
377 try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
378 const peek_d1 = iter.peek().?;
379 const peek_d2 = iter.peek().?;
380 try std.testing.expectEqual(peek_d1, peek_d2);
381 const next_d = iter.next().?;
382 try std.testing.expectEqual(peek_d2, next_d);
383 try std.testing.expectEqual(iter.peek(), iter.next());
384 try std.testing.expectEqual(iter.peek(), iter.next());
385 try std.testing.expectEqual(null, iter.peek());
386 try std.testing.expectEqual(null, iter.peek());
387 try std.testing.expectEqual(iter.peek(), iter.next());
388}
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 84fd1f7..53db76b 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
89} 89}
90 90
91/// Returns an iterator over words in `slice`
92pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
93 return Iterator.init(wordbreak, slice);
94}
95
91const IterState = packed struct { 96const IterState = packed struct {
92 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter 97 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
93 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric 98 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
@@ -113,7 +118,7 @@ pub const Iterator = struct {
113 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 118 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
114 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; 119 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
115 wb_iter.advance(); 120 wb_iter.advance();
116 return wb; 121 return wb_iter;
117 } 122 }
118 123
119 pub fn next(iter: *Iterator) ?Word { 124 pub fn next(iter: *Iterator) ?Word {
@@ -132,12 +137,18 @@ pub const Iterator = struct {
132 scan: while (true) : (iter.advance()) { 137 scan: while (true) : (iter.advance()) {
133 const this = iter.this.?; 138 const this = iter.this.?;
134 word_len += this.len; 139 word_len += this.len;
140 var ignored = false;
135 if (iter.that) |that| { 141 if (iter.that) |that| {
136 const that_p = iter.wb.breakProperty(that.code); 142 const that_p = iter.wb.breakProperty(that.code);
137 const this_p = this_p: { 143 const this_p = this_p: {
138 if (!isIgnorable(that_p) and iter.cache != null) { 144 if (!isIgnorable(that_p) and iter.cache != null) {
145 // TODO: might not need these what with peekPast
146 ignored = true;
139 defer iter.cache = null; 147 defer iter.cache = null;
140 break :this_p iter.cache.?; 148 // Fixup some state, apply pre-4 rules
149 const restore = iter.cache.?;
150 if (restore == .WSegSpace) break :this_p .none;
151 break :this_p restore;
141 } else { 152 } else {
142 break :this_p iter.wb.breakProperty(this.code); 153 break :this_p iter.wb.breakProperty(this.code);
143 } 154 }
@@ -149,11 +160,22 @@ pub const Iterator = struct {
149 // WB3b ÷ (Newline | CR | LF) 160 // WB3b ÷ (Newline | CR | LF)
150 if (isNewline(that_p)) break :scan; 161 if (isNewline(that_p)) break :scan;
151 // WB3c ZWJ × \p{Extended_Pictographic} 162 // WB3c ZWJ × \p{Extended_Pictographic}
152 // The right way to do this one is a RuneSet, TODO: circle back 163 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
164 // Invalid after ignoring
165 if (ignored) break :scan else continue :scan;
166 }
153 // WB3d WSegSpace × WSegSpace 167 // WB3d WSegSpace × WSegSpace
154 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; 168 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
155 // WB4 X (Extend | Format | ZWJ)* → X 169 // WB4 X (Extend | Format | ZWJ)* → X
156 if (isIgnorable(that_p)) { 170 if (isIgnorable(that_p)) {
171 if (that_p == .ZWJ) {
172 const next_val = iter.peekPast();
173 if (next_val) |next_cp| {
174 if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
175 continue :scan;
176 }
177 }
178 }
157 if (iter.cache == null) { 179 if (iter.cache == null) {
158 iter.cache = this_p; 180 iter.cache = this_p;
159 } 181 }
@@ -164,14 +186,14 @@ pub const Iterator = struct {
164 if (isAHLetter(that_p)) continue :scan; 186 if (isAHLetter(that_p)) continue :scan;
165 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter 187 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
166 if (isMidVal(that_p)) { 188 if (isMidVal(that_p)) {
167 const next_val = iter.cp_iter.peek(); 189 const next_val = iter.peekPast();
168 if (next_val) |next_cp| { 190 if (next_val) |next_cp| {
169 const next_p = iter.wb.breakProperty(next_cp.code); 191 const next_p = iter.wb.breakProperty(next_cp.code);
170 if (isAHLetter(next_p)) { 192 if (isAHLetter(next_p)) {
171 state.mid_punct = true; 193 state.mid_punct = true;
172 continue :scan; 194 continue :scan;
173 } 195 }
174 } else break :scan; 196 }
175 } 197 }
176 } 198 }
177 // AHLetter (MidLetter | MidNumLetQ) × AHLetter 199 // AHLetter (MidLetter | MidNumLetQ) × AHLetter
@@ -187,7 +209,7 @@ pub const Iterator = struct {
187 if (that_p == .Single_Quote) continue :scan; 209 if (that_p == .Single_Quote) continue :scan;
188 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter 210 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
189 if (that_p == .Double_Quote) { 211 if (that_p == .Double_Quote) {
190 const next_val = iter.cp_iter.peek(); 212 const next_val = iter.peekPast();
191 if (next_val) |next_cp| { 213 if (next_val) |next_cp| {
192 const next_p = iter.wb.breakProperty(next_cp.code); 214 const next_p = iter.wb.breakProperty(next_cp.code);
193 if (next_p == .Hebrew_Letter) { 215 if (next_p == .Hebrew_Letter) {
@@ -212,8 +234,8 @@ pub const Iterator = struct {
212 // WB10 Numeric × AHLetter 234 // WB10 Numeric × AHLetter
213 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; 235 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
214 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric 236 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
215 if (this_p == .Numeric and isMidVal(that_p)) { 237 if (this_p == .Numeric and isMidNum(that_p)) {
216 const next_val = iter.cp_iter.peek(); 238 const next_val = iter.peekPast();
217 if (next_val) |next_cp| { 239 if (next_val) |next_cp| {
218 const next_p = iter.wb.breakProperty(next_cp.code); 240 const next_p = iter.wb.breakProperty(next_cp.code);
219 if (next_p == .Numeric) { 241 if (next_p == .Numeric) {
@@ -224,7 +246,7 @@ pub const Iterator = struct {
224 } 246 }
225 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric 247 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
226 if (state.mid_num) { 248 if (state.mid_num) {
227 assert(isMidVal(this_p)); 249 assert(isMidNum(this_p));
228 assert(that_p == .Numeric); 250 assert(that_p == .Numeric);
229 state.mid_num = false; 251 state.mid_num = false;
230 continue :scan; 252 continue :scan;
@@ -235,25 +257,18 @@ pub const Iterator = struct {
235 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; 257 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
236 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) 258 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
237 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; 259 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
238 // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI 260 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
239 if (that_p == .Regional_Indicator) { 261 if (this_p == .Regional_Indicator) {
240 if (this_p == .Regional_Indicator) { 262 if (that_p == .Regional_Indicator) {
241 if (state.regional) { 263 if (state.regional == true or this.offset == 0) {
242 state.regional = false; 264 state.regional = false;
243 continue :scan; 265 continue :scan;
244 } else {
245 break :scan;
246 } 266 }
247 } else { 267 } else {
248 const next_val = iter.cp_iter.peek(); 268 state.regional = true;
249 if (next_val) |next_cp| {
250 const next_p = iter.wb.breakProperty(next_cp.code);
251 if (next_p == .Regional_Indicator) {
252 state.regional = true;
253 continue :scan;
254 }
255 } else break :scan;
256 } 269 }
270 } else if (that_p == .Regional_Indicator) {
271 state.regional = true;
257 } 272 }
258 // WB999 Any ÷ Any 273 // WB999 Any ÷ Any
259 break :scan; 274 break :scan;
@@ -265,9 +280,19 @@ pub const Iterator = struct {
265 return Word{ .len = word_len, .offset = word_start }; 280 return Word{ .len = word_len, .offset = word_start };
266 } 281 }
267 282
268 fn advance(wb_iter: *Iterator) void { 283 fn advance(iter: *Iterator) void {
269 wb_iter.this = wb_iter.that; 284 iter.this = iter.that;
270 wb_iter.that = wb_iter.cp_iter.next(); 285 iter.that = iter.cp_iter.next();
286 }
287
288 fn peekPast(iter: *Iterator) ?CodePoint {
289 const save_cp = iter.cp_iter;
290 defer iter.cp_iter = save_cp;
291 while (iter.cp_iter.peek()) |peeked| {
292 if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked;
293 _ = iter.cp_iter.next();
294 }
295 return null;
271 } 296 }
272}; 297};
273 298
@@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool {
292 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; 317 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
293} 318}
294 319
320inline fn isMidNum(wbp: WordBreakProperty) bool {
321 return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
322}
323
295inline fn isExtensible(wbp: WordBreakProperty) bool { 324inline fn isExtensible(wbp: WordBreakProperty) bool {
296 return switch (wbp) { 325 return switch (wbp) {
297 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, 326 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
@@ -328,3 +357,5 @@ const testing = std.testing;
328const code_point = @import("code_point"); 357const code_point = @import("code_point");
329const CodepointIterator = code_point.Iterator; 358const CodepointIterator = code_point.Iterator;
330const CodePoint = code_point.CodePoint; 359const CodePoint = code_point.CodePoint;
360
361const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index b3c9aea..79ee5cd 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -10,6 +10,11 @@ pub const CodePoint = struct {
10 code: u21, 10 code: u21,
11 len: u3, 11 len: u3,
12 offset: u32, 12 offset: u32,
13
14 /// Return the slice of this codepoint, given the original string.
15 pub fn bytes(cp: CodePoint, str: []const u8) []const u8 {
16 return str[cp.offset..][0..cp.len];
17 }
13}; 18};
14 19
15/// This function is deprecated and will be removed in a later release. 20/// This function is deprecated and will be removed in a later release.
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
new file mode 100644
index 0000000..34fbcd3
--- /dev/null
+++ b/src/micro_runeset.zig
@@ -0,0 +1,207 @@
1//! Minimal RuneSet implementation
2//!
3//! This is copied from the full RuneSet module, so that `zg` doesn't
4//! depend on it. There's one spot in the WordBreak algorithm which
5//! needs to identify the emoji Extended_Pictographic property, which
6//! is not otherwise used in ZG. The Runeset is 89 words, while the
7//! trie lookup used throughout ZG would be much larger.
8//!
9//! The RuneSet is borrowed from Runicode, which encodes Unicode things
10//! in RuneSet form. This will need updating for each version of Unicode.
11
12pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } };
13
14// Meaningful names for the T1 slots
15const LOW = 0;
16const HI = 1;
17const LEAD = 2;
18const T4_OFF = 3;
19
20/// Minimum Viable Runeset. Must be statically created, strictly boolean matching.
21pub const RuneSet = struct {
22 body: []const u64,
23
24 // Returns whether the slice is a match. This assumes the validity of the
25 // string, which can be ensured by, in particular, deriving it from a CodePoint.
26 pub fn isMatch(runeset: RuneSet, str: []const u8) bool {
27 const set = runeset.body;
28 const a = codeunit(str[0]);
29 switch (a.kind) {
30 .follow => return false,
31 .low => {
32 const mask = toMask(set[LOW]);
33 if (mask.isIn(a))
34 return true
35 else
36 return false;
37 },
38 .hi => {
39 const mask = toMask(set[HI]);
40 if (mask.isIn(a))
41 return true
42 else
43 return false;
44 },
45 .lead => {
46 const nB = a.nMultiBytes().?;
47 const a_mask = toMask(set[LEAD]);
48 if (!a_mask.isIn(a)) return false;
49 const b = codeunit(str[1]);
50 const b_loc = 4 + a_mask.lowerThan(a).?;
51 const b_mask = toMask(set[b_loc]);
52 if (!b_mask.isIn(b)) return false;
53 if (nB == 2) return true;
54 const t3_off = 4 + @popCount(set[LEAD]);
55 const c = codeunit(str[2]);
56 // Slice is safe because we know the T2 span has at least one word.
57 const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]);
58 const c_loc = t3_off + c_off;
59 const c_mask = toMask(set[c_loc]);
60 if (!c_mask.isIn(c)) return false;
61 if (nB == 3) return true;
62 const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]);
63 const d_loc = set[T4_OFF] + d_off;
64 const d = codeunit(str[3]);
65 const d_mask = toMask(set[d_loc]);
66 if (d_mask.isIn(d)) return true else return false;
67 },
68 }
69 }
70};
71
72/// Kinds of most significant bits in UTF-8
73const RuneKind = enum(u2) {
74 low,
75 hi,
76 follow,
77 lead,
78};
79
80/// Packed `u8` struct representing one codeunit of UTF-8.
81const CodeUnit = packed struct(u8) {
82 body: u6,
83 kind: RuneKind,
84
85 /// Mask to check presence
86 pub inline fn inMask(self: *const CodeUnit) u64 {
87 return @as(u64, 1) << self.body;
88 }
89
90 // TODO consider an nMultiBytesFast, for the cases where we
91 // know that invalid lead bytes are never present (such as in set)
92 // operations, where we may assume that (and will assert that) the
93 // LEAD mask contains no such bytes.
94
95 /// Number of bytes in known multi-byte rune.
96 ///
97 /// Caller guarantees that the CodeUnit is a lead byte
98 /// of a multi-byte rune: `cu.kind == .lead`.
99 ///
100 /// Invalid lead bytes will return null.
101 pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 {
102 return switch (self.body) {
103 // 0 and 1 are invalid for overlong reasons,
104 // but RuneSet supports overlong encodings
105 0...31 => 2,
106 32...47 => 3,
107 48...55 => 4,
108 // Wasted space 56...61 is due entirely to Microsoft's
109 // lack of vision and insistence on a substandard
110 // and utterly inadequate encoding for Unicode
111 // "64k should be enough for anyone" <spits>
112 56...63 => null,
113 };
114 }
115
116 /// Given a valid lead byte, return the number of bytes that should
117 /// make up the code unit sequence. Will return `null` if the lead
118 /// byte is invalid.
119 pub inline fn nBytes(self: *const CodeUnit) ?u8 {
120 switch (self.kind) {
121 .low, .hi => return 1,
122 .lead => return self.nMultiBytes(),
123 .follow => return null,
124 }
125 }
126
127 /// Mask off all bits >= cu.body
128 pub inline fn hiMask(self: *const CodeUnit) u64 {
129 return (@as(u64, 1) << self.body) - 1;
130 }
131
132 /// Mask off all bits <= cu.body
133 pub inline fn lowMask(self: *const CodeUnit) u64 {
134 if (self.body == 63)
135 return 0
136 else
137 return ~((@as(u64, 1) << (self.body + 1)) - 1);
138 }
139
140 /// Cast the `CodeUnit` to its backing `u8`.
141 pub inline fn byte(self: *const CodeUnit) u8 {
142 return @bitCast(self.*);
143 }
144};
145
146/// Cast raw byte to CodeUnit
147inline fn codeunit(b: u8) CodeUnit {
148 return @bitCast(b);
149}
150
151inline fn toMask(w: u64) Mask {
152 return Mask.toMask(w);
153}
154
155/// Bitmask for runesets
156///
157/// We define our own bitset, because the operations we need to
158/// perform only overlap with IntegerBitSet for trivial one-liners,
159/// and furthermore, we need nondestructive versions of the basic
160/// operations, which aren't a part of the IntegerBitSet interface.
161///
162/// Note that Masks do not track which kind of byte they apply to,
163/// since they will be stored as ordinary u64s. User code must
164/// ensure that CodeUnits tested against a Mask are of the appropriate
165/// type, and otherwise valid for the test performed.
166///
167const Mask = struct {
168 m: u64,
169
170 pub fn toMask(w: u64) Mask {
171 return Mask{ .m = w };
172 }
173
174 /// Test if a CodeUnit's low bytes are present in mask
175 pub inline fn isIn(self: Mask, cu: CodeUnit) bool {
176 return self.m | cu.inMask() == self.m;
177 }
178
179 /// Return number of bytes lower than cu.body in mask,
180 /// if cu inhabits the mask. Otherwise return null.
181 pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 {
182 if (self.isIn(cu)) {
183 const m = cu.hiMask();
184 return @popCount(self.m & m);
185 } else {
186 return null;
187 }
188 }
189
190 /// Return number of bytes higher than cu.body in mask,
191 /// if cu inhabits the mask. Otherwise return null.
192 pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 {
193 if (self.isIn(cu)) {
194 const m = cu.lowMask();
195 return @popCount(self.m & m);
196 } else {
197 return null;
198 }
199 }
200};
201
202/// Sum of @popCount of all words in region.
203inline fn popCountSlice(region: []const u64) usize {
204 var ct: usize = 0;
205 for (region) |w| ct += @popCount(w);
206 return ct;
207}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ee259a3..7ce2b4e 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,31 +1,5 @@
1const dbg_print = false; 1const dbg_print = false;
2 2
3comptime {
4 testing.refAllDecls(grapheme);
5}
6
7test "Iterator.peek" {
8 const peek_seq = "aΔ👨🏻‍🌾→";
9 const data = try Graphemes.init(std.testing.allocator);
10 defer data.deinit(std.testing.allocator);
11
12 var iter = data.iterator(peek_seq);
13 const peek_a = iter.peek().?;
14 const next_a = iter.next().?;
15 try std.testing.expectEqual(peek_a, next_a);
16 try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
17 const peek_d1 = iter.peek().?;
18 const peek_d2 = iter.peek().?;
19 try std.testing.expectEqual(peek_d1, peek_d2);
20 const next_d = iter.next().?;
21 try std.testing.expectEqual(peek_d2, next_d);
22 try std.testing.expectEqual(iter.peek(), iter.next());
23 try std.testing.expectEqual(iter.peek(), iter.next());
24 try std.testing.expectEqual(null, iter.peek());
25 try std.testing.expectEqual(null, iter.peek());
26 try std.testing.expectEqual(iter.peek(), iter.next());
27}
28
29test "Unicode normalization tests" { 3test "Unicode normalization tests" {
30 var arena = heap.ArenaAllocator.init(testing.allocator); 4 var arena = heap.ArenaAllocator.init(testing.allocator);
31 defer arena.deinit(); 5 defer arena.deinit();
@@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" {
147 var buf_reader = std.io.bufferedReader(file.reader()); 121 var buf_reader = std.io.bufferedReader(file.reader());
148 var input_stream = buf_reader.reader(); 122 var input_stream = buf_reader.reader();
149 123
150 const data = try Graphemes.init(allocator); 124 const graph = try Graphemes.init(allocator);
151 defer data.deinit(allocator); 125 defer graph.deinit(allocator);
152 126
153 var buf: [4096]u8 = undefined; 127 var buf: [4096]u8 = undefined;
154 var line_iter: IterRead = .{ .read = &input_stream }; 128 var line_iter: IterRead = .{ .read = &input_stream };
155 129
156 while (try line_iter.next(&buf)) |raw| { 130 while (try line_iter.next(&buf)) |raw| {
157 // Skip comments or empty lines.
158 // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
159 // Clean up. 131 // Clean up.
160 var line = std.mem.trimLeft(u8, raw, "÷ "); 132 var line = std.mem.trimLeft(u8, raw, "÷ ");
161 if (std.mem.indexOf(u8, line, " ÷\t")) |final| { 133 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
@@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" {
190 bytes_index += cp_index; 162 bytes_index += cp_index;
191 } 163 }
192 164
193 var iter = data.iterator(all_bytes.items); 165 var iter = graph.iterator(all_bytes.items);
194 166
195 // Check. 167 // Check.
196 for (want.items) |want_gc| { 168 for (want.items) |want_gc| {
@@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" {
203 } 175 }
204} 176}
205 177
178test "Segmentation Word Iterator" {
179 const allocator = std.testing.allocator;
180 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
181 defer file.close();
182 var buf_reader = std.io.bufferedReader(file.reader());
183 var input_stream = buf_reader.reader();
184
185 const wb = try WordBreak.init(allocator);
186 defer wb.deinit(allocator);
187
188 var buf: [4096]u8 = undefined;
189 var line_iter: IterRead = .{ .read = &input_stream };
190
191 while (try line_iter.next(&buf)) |raw| {
192 // Clean up.
193 var line = std.mem.trimLeft(u8, raw, "÷ ");
194 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
195 line = line[0..final];
196 }
197 // Iterate over fields.
198 var want = std.ArrayList(Grapheme).init(allocator);
199 defer want.deinit();
200
201 var all_bytes = std.ArrayList(u8).init(allocator);
202 defer all_bytes.deinit();
203
204 var words = std.mem.splitSequence(u8, line, " ÷ ");
205 var bytes_index: u32 = 0;
206
207 while (words.next()) |field| {
208 var code_points = std.mem.splitScalar(u8, field, ' ');
209 var cp_buf: [4]u8 = undefined;
210 var cp_index: u32 = 0;
211 var gc_len: u8 = 0;
212
213 while (code_points.next()) |code_point| {
214 if (std.mem.eql(u8, code_point, "×")) continue;
215 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
216 const len = try unicode.utf8Encode(cp, &cp_buf);
217 try all_bytes.appendSlice(cp_buf[0..len]);
218 cp_index += len;
219 gc_len += len;
220 }
221
222 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
223 bytes_index += cp_index;
224 }
225
226 var iter = wb.iterator(all_bytes.items);
227
228 // Check.
229 for (want.items, 1..) |want_word, i| {
230 const got_word = (iter.next()).?;
231 std.testing.expectEqualSlices(
232 u8,
233 want_word.bytes(all_bytes.items),
234 got_word.bytes(all_bytes.items),
235 ) catch |err| {
236 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
237 return err;
238 };
239 }
240 }
241}
242
206const IterRead = struct { 243const IterRead = struct {
207 read: *Reader, 244 read: *Reader,
208 line: usize = 0, 245 line: usize = 0,
@@ -235,8 +272,9 @@ const debug = std.debug;
235const testing = std.testing; 272const testing = std.testing;
236const unicode = std.unicode; 273const unicode = std.unicode;
237 274
238const grapheme = @import("Graphemes");
239const Grapheme = @import("Graphemes").Grapheme; 275const Grapheme = @import("Graphemes").Grapheme;
240const Graphemes = @import("Graphemes"); 276const Graphemes = @import("Graphemes");
241const GraphemeIterator = @import("Graphemes").Iterator; 277const GraphemeIterator = @import("Graphemes").Iterator;
242const Normalize = @import("Normalize"); 278const Normalize = @import("Normalize");
279
280const WordBreak = @import("WordBreak");