diff options
| author | 2025-05-12 15:22:37 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:16 -0400 | |
| commit | cf8d8fe5d640511f6c4134fdaa36e930232ca7da (patch) | |
| tree | 410a3c5195ea0780b637f740ebcb6e80e63db09c | |
| parent | Implement Word iterator (diff) | |
| download | zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.gz zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.tar.xz zg-cf8d8fe5d640511f6c4134fdaa36e930232ca7da.zip | |
Begin conformance test
I'm not sure the details of this strategy can actually be made to work.
But, something can.
| -rw-r--r-- | build.zig | 1 | ||||
| -rw-r--r-- | src/Graphemes.zig | 22 | ||||
| -rw-r--r-- | src/WordBreak.zig | 83 | ||||
| -rw-r--r-- | src/code_point.zig | 5 | ||||
| -rw-r--r-- | src/micro_runeset.zig | 207 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 102 |
6 files changed, 362 insertions, 58 deletions
| @@ -471,6 +471,7 @@ pub fn build(b: *std.Build) void { | |||
| 471 | }); | 471 | }); |
| 472 | unicode_tests.root_module.addImport("Graphemes", graphemes); | 472 | unicode_tests.root_module.addImport("Graphemes", graphemes); |
| 473 | unicode_tests.root_module.addImport("Normalize", norm); | 473 | unicode_tests.root_module.addImport("Normalize", norm); |
| 474 | unicode_tests.root_module.addImport("WordBreak", word_break); | ||
| 474 | 475 | ||
| 475 | const run_unicode_tests = b.addRunArtifact(unicode_tests); | 476 | const run_unicode_tests = b.addRunArtifact(unicode_tests); |
| 476 | 477 | ||
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 1ce1ea6..1f67fc6 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 364 | try std.testing.expectEqual(@as(usize, 2), i); | 364 | try std.testing.expectEqual(@as(usize, 2), i); |
| 365 | } | 365 | } |
| 366 | } | 366 | } |
| 367 | |||
| 368 | test "Iterator.peek" { | ||
| 369 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 370 | const data = try Graphemes.init(std.testing.allocator); | ||
| 371 | defer data.deinit(std.testing.allocator); | ||
| 372 | |||
| 373 | var iter = data.iterator(peek_seq); | ||
| 374 | const peek_a = iter.peek().?; | ||
| 375 | const next_a = iter.next().?; | ||
| 376 | try std.testing.expectEqual(peek_a, next_a); | ||
| 377 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 378 | const peek_d1 = iter.peek().?; | ||
| 379 | const peek_d2 = iter.peek().?; | ||
| 380 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 381 | const next_d = iter.next().?; | ||
| 382 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 383 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 384 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 385 | try std.testing.expectEqual(null, iter.peek()); | ||
| 386 | try std.testing.expectEqual(null, iter.peek()); | ||
| 387 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 388 | } | ||
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 84fd1f7..53db76b 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | |||
| 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | /// Returns an iterator over words in `slice` | ||
| 92 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | ||
| 93 | return Iterator.init(wordbreak, slice); | ||
| 94 | } | ||
| 95 | |||
| 91 | const IterState = packed struct { | 96 | const IterState = packed struct { |
| 92 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter | 97 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| 93 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric | 98 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric |
| @@ -113,7 +118,7 @@ pub const Iterator = struct { | |||
| 113 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 118 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { |
| 114 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; | 119 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; |
| 115 | wb_iter.advance(); | 120 | wb_iter.advance(); |
| 116 | return wb; | 121 | return wb_iter; |
| 117 | } | 122 | } |
| 118 | 123 | ||
| 119 | pub fn next(iter: *Iterator) ?Word { | 124 | pub fn next(iter: *Iterator) ?Word { |
| @@ -132,12 +137,18 @@ pub const Iterator = struct { | |||
| 132 | scan: while (true) : (iter.advance()) { | 137 | scan: while (true) : (iter.advance()) { |
| 133 | const this = iter.this.?; | 138 | const this = iter.this.?; |
| 134 | word_len += this.len; | 139 | word_len += this.len; |
| 140 | var ignored = false; | ||
| 135 | if (iter.that) |that| { | 141 | if (iter.that) |that| { |
| 136 | const that_p = iter.wb.breakProperty(that.code); | 142 | const that_p = iter.wb.breakProperty(that.code); |
| 137 | const this_p = this_p: { | 143 | const this_p = this_p: { |
| 138 | if (!isIgnorable(that_p) and iter.cache != null) { | 144 | if (!isIgnorable(that_p) and iter.cache != null) { |
| 145 | // TODO: might not need these what with peekPast | ||
| 146 | ignored = true; | ||
| 139 | defer iter.cache = null; | 147 | defer iter.cache = null; |
| 140 | break :this_p iter.cache.?; | 148 | // Fixup some state, apply pre-4 rules |
| 149 | const restore = iter.cache.?; | ||
| 150 | if (restore == .WSegSpace) break :this_p .none; | ||
| 151 | break :this_p restore; | ||
| 141 | } else { | 152 | } else { |
| 142 | break :this_p iter.wb.breakProperty(this.code); | 153 | break :this_p iter.wb.breakProperty(this.code); |
| 143 | } | 154 | } |
| @@ -149,11 +160,22 @@ pub const Iterator = struct { | |||
| 149 | // WB3b ÷ (Newline | CR | LF) | 160 | // WB3b ÷ (Newline | CR | LF) |
| 150 | if (isNewline(that_p)) break :scan; | 161 | if (isNewline(that_p)) break :scan; |
| 151 | // WB3c ZWJ × \p{Extended_Pictographic} | 162 | // WB3c ZWJ × \p{Extended_Pictographic} |
| 152 | // The right way to do this one is a RuneSet, TODO: circle back | 163 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { |
| 164 | // Invalid after ignoring | ||
| 165 | if (ignored) break :scan else continue :scan; | ||
| 166 | } | ||
| 153 | // WB3d WSegSpace × WSegSpace | 167 | // WB3d WSegSpace × WSegSpace |
| 154 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | 168 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; |
| 155 | // WB4 X (Extend | Format | ZWJ)* → X | 169 | // WB4 X (Extend | Format | ZWJ)* → X |
| 156 | if (isIgnorable(that_p)) { | 170 | if (isIgnorable(that_p)) { |
| 171 | if (that_p == .ZWJ) { | ||
| 172 | const next_val = iter.peekPast(); | ||
| 173 | if (next_val) |next_cp| { | ||
| 174 | if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { | ||
| 175 | continue :scan; | ||
| 176 | } | ||
| 177 | } | ||
| 178 | } | ||
| 157 | if (iter.cache == null) { | 179 | if (iter.cache == null) { |
| 158 | iter.cache = this_p; | 180 | iter.cache = this_p; |
| 159 | } | 181 | } |
| @@ -164,14 +186,14 @@ pub const Iterator = struct { | |||
| 164 | if (isAHLetter(that_p)) continue :scan; | 186 | if (isAHLetter(that_p)) continue :scan; |
| 165 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | 187 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter |
| 166 | if (isMidVal(that_p)) { | 188 | if (isMidVal(that_p)) { |
| 167 | const next_val = iter.cp_iter.peek(); | 189 | const next_val = iter.peekPast(); |
| 168 | if (next_val) |next_cp| { | 190 | if (next_val) |next_cp| { |
| 169 | const next_p = iter.wb.breakProperty(next_cp.code); | 191 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 170 | if (isAHLetter(next_p)) { | 192 | if (isAHLetter(next_p)) { |
| 171 | state.mid_punct = true; | 193 | state.mid_punct = true; |
| 172 | continue :scan; | 194 | continue :scan; |
| 173 | } | 195 | } |
| 174 | } else break :scan; | 196 | } |
| 175 | } | 197 | } |
| 176 | } | 198 | } |
| 177 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter | 199 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| @@ -187,7 +209,7 @@ pub const Iterator = struct { | |||
| 187 | if (that_p == .Single_Quote) continue :scan; | 209 | if (that_p == .Single_Quote) continue :scan; |
| 188 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | 210 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter |
| 189 | if (that_p == .Double_Quote) { | 211 | if (that_p == .Double_Quote) { |
| 190 | const next_val = iter.cp_iter.peek(); | 212 | const next_val = iter.peekPast(); |
| 191 | if (next_val) |next_cp| { | 213 | if (next_val) |next_cp| { |
| 192 | const next_p = iter.wb.breakProperty(next_cp.code); | 214 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 193 | if (next_p == .Hebrew_Letter) { | 215 | if (next_p == .Hebrew_Letter) { |
| @@ -212,8 +234,8 @@ pub const Iterator = struct { | |||
| 212 | // WB10 Numeric × AHLetter | 234 | // WB10 Numeric × AHLetter |
| 213 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; | 235 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; |
| 214 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | 236 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric |
| 215 | if (this_p == .Numeric and isMidVal(that_p)) { | 237 | if (this_p == .Numeric and isMidNum(that_p)) { |
| 216 | const next_val = iter.cp_iter.peek(); | 238 | const next_val = iter.peekPast(); |
| 217 | if (next_val) |next_cp| { | 239 | if (next_val) |next_cp| { |
| 218 | const next_p = iter.wb.breakProperty(next_cp.code); | 240 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 219 | if (next_p == .Numeric) { | 241 | if (next_p == .Numeric) { |
| @@ -224,7 +246,7 @@ pub const Iterator = struct { | |||
| 224 | } | 246 | } |
| 225 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | 247 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric |
| 226 | if (state.mid_num) { | 248 | if (state.mid_num) { |
| 227 | assert(isMidVal(this_p)); | 249 | assert(isMidNum(this_p)); |
| 228 | assert(that_p == .Numeric); | 250 | assert(that_p == .Numeric); |
| 229 | state.mid_num = false; | 251 | state.mid_num = false; |
| 230 | continue :scan; | 252 | continue :scan; |
| @@ -235,25 +257,18 @@ pub const Iterator = struct { | |||
| 235 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; | 257 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; |
| 236 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | 258 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) |
| 237 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | 259 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; |
| 238 | // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI | 260 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI |
| 239 | if (that_p == .Regional_Indicator) { | 261 | if (this_p == .Regional_Indicator) { |
| 240 | if (this_p == .Regional_Indicator) { | 262 | if (that_p == .Regional_Indicator) { |
| 241 | if (state.regional) { | 263 | if (state.regional == true or this.offset == 0) { |
| 242 | state.regional = false; | 264 | state.regional = false; |
| 243 | continue :scan; | 265 | continue :scan; |
| 244 | } else { | ||
| 245 | break :scan; | ||
| 246 | } | 266 | } |
| 247 | } else { | 267 | } else { |
| 248 | const next_val = iter.cp_iter.peek(); | 268 | state.regional = true; |
| 249 | if (next_val) |next_cp| { | ||
| 250 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 251 | if (next_p == .Regional_Indicator) { | ||
| 252 | state.regional = true; | ||
| 253 | continue :scan; | ||
| 254 | } | ||
| 255 | } else break :scan; | ||
| 256 | } | 269 | } |
| 270 | } else if (that_p == .Regional_Indicator) { | ||
| 271 | state.regional = true; | ||
| 257 | } | 272 | } |
| 258 | // WB999 Any ÷ Any | 273 | // WB999 Any ÷ Any |
| 259 | break :scan; | 274 | break :scan; |
| @@ -265,9 +280,19 @@ pub const Iterator = struct { | |||
| 265 | return Word{ .len = word_len, .offset = word_start }; | 280 | return Word{ .len = word_len, .offset = word_start }; |
| 266 | } | 281 | } |
| 267 | 282 | ||
| 268 | fn advance(wb_iter: *Iterator) void { | 283 | fn advance(iter: *Iterator) void { |
| 269 | wb_iter.this = wb_iter.that; | 284 | iter.this = iter.that; |
| 270 | wb_iter.that = wb_iter.cp_iter.next(); | 285 | iter.that = iter.cp_iter.next(); |
| 286 | } | ||
| 287 | |||
| 288 | fn peekPast(iter: *Iterator) ?CodePoint { | ||
| 289 | const save_cp = iter.cp_iter; | ||
| 290 | defer iter.cp_iter = save_cp; | ||
| 291 | while (iter.cp_iter.peek()) |peeked| { | ||
| 292 | if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; | ||
| 293 | _ = iter.cp_iter.next(); | ||
| 294 | } | ||
| 295 | return null; | ||
| 271 | } | 296 | } |
| 272 | }; | 297 | }; |
| 273 | 298 | ||
| @@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool { | |||
| 292 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | 317 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; |
| 293 | } | 318 | } |
| 294 | 319 | ||
| 320 | inline fn isMidNum(wbp: WordBreakProperty) bool { | ||
| 321 | return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 322 | } | ||
| 323 | |||
| 295 | inline fn isExtensible(wbp: WordBreakProperty) bool { | 324 | inline fn isExtensible(wbp: WordBreakProperty) bool { |
| 296 | return switch (wbp) { | 325 | return switch (wbp) { |
| 297 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | 326 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, |
| @@ -328,3 +357,5 @@ const testing = std.testing; | |||
| 328 | const code_point = @import("code_point"); | 357 | const code_point = @import("code_point"); |
| 329 | const CodepointIterator = code_point.Iterator; | 358 | const CodepointIterator = code_point.Iterator; |
| 330 | const CodePoint = code_point.CodePoint; | 359 | const CodePoint = code_point.CodePoint; |
| 360 | |||
| 361 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; | ||
diff --git a/src/code_point.zig b/src/code_point.zig index b3c9aea..79ee5cd 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -10,6 +10,11 @@ pub const CodePoint = struct { | |||
| 10 | code: u21, | 10 | code: u21, |
| 11 | len: u3, | 11 | len: u3, |
| 12 | offset: u32, | 12 | offset: u32, |
| 13 | |||
| 14 | /// Return the slice of this codepoint, given the original string. | ||
| 15 | pub fn bytes(cp: CodePoint, str: []const u8) []const u8 { | ||
| 16 | return str[cp.offset..][0..cp.len]; | ||
| 17 | } | ||
| 13 | }; | 18 | }; |
| 14 | 19 | ||
| 15 | /// This function is deprecated and will be removed in a later release. | 20 | /// This function is deprecated and will be removed in a later release. |
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig new file mode 100644 index 0000000..34fbcd3 --- /dev/null +++ b/src/micro_runeset.zig | |||
| @@ -0,0 +1,207 @@ | |||
| 1 | //! Minimal RuneSet implementation | ||
| 2 | //! | ||
| 3 | //! This is copied from the full RuneSet module, so that `zg` doesn't | ||
| 4 | //! depend on it. There's one spot in the WordBreak algorithm which | ||
| 5 | //! needs to identify the emoji Extended_Pictographic property, which | ||
| 6 | //! is not otherwise used in ZG. The Runeset is 89 words, while the | ||
| 7 | //! trie lookup used throughout ZG would be much larger. | ||
| 8 | //! | ||
| 9 | //! The RuneSet is borrowed from Runicode, which encodes Unicode things | ||
| 10 | //! in RuneSet form. This will need updating for each version of Unicode. | ||
| 11 | |||
| 12 | pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; | ||
| 13 | |||
| 14 | // Meaningful names for the T1 slots | ||
| 15 | const LOW = 0; | ||
| 16 | const HI = 1; | ||
| 17 | const LEAD = 2; | ||
| 18 | const T4_OFF = 3; | ||
| 19 | |||
| 20 | /// Minimum Viable Runeset. Must be statically created, strictly boolean matching. | ||
| 21 | pub const RuneSet = struct { | ||
| 22 | body: []const u64, | ||
| 23 | |||
| 24 | // Returns whether the slice is a match. This assumes the validity of the | ||
| 25 | // string, which can be ensured by, in particular, deriving it from a CodePoint. | ||
| 26 | pub fn isMatch(runeset: RuneSet, str: []const u8) bool { | ||
| 27 | const set = runeset.body; | ||
| 28 | const a = codeunit(str[0]); | ||
| 29 | switch (a.kind) { | ||
| 30 | .follow => return false, | ||
| 31 | .low => { | ||
| 32 | const mask = toMask(set[LOW]); | ||
| 33 | if (mask.isIn(a)) | ||
| 34 | return true | ||
| 35 | else | ||
| 36 | return false; | ||
| 37 | }, | ||
| 38 | .hi => { | ||
| 39 | const mask = toMask(set[HI]); | ||
| 40 | if (mask.isIn(a)) | ||
| 41 | return true | ||
| 42 | else | ||
| 43 | return false; | ||
| 44 | }, | ||
| 45 | .lead => { | ||
| 46 | const nB = a.nMultiBytes().?; | ||
| 47 | const a_mask = toMask(set[LEAD]); | ||
| 48 | if (!a_mask.isIn(a)) return false; | ||
| 49 | const b = codeunit(str[1]); | ||
| 50 | const b_loc = 4 + a_mask.lowerThan(a).?; | ||
| 51 | const b_mask = toMask(set[b_loc]); | ||
| 52 | if (!b_mask.isIn(b)) return false; | ||
| 53 | if (nB == 2) return true; | ||
| 54 | const t3_off = 4 + @popCount(set[LEAD]); | ||
| 55 | const c = codeunit(str[2]); | ||
| 56 | // Slice is safe because we know the T2 span has at least one word. | ||
| 57 | const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]); | ||
| 58 | const c_loc = t3_off + c_off; | ||
| 59 | const c_mask = toMask(set[c_loc]); | ||
| 60 | if (!c_mask.isIn(c)) return false; | ||
| 61 | if (nB == 3) return true; | ||
| 62 | const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]); | ||
| 63 | const d_loc = set[T4_OFF] + d_off; | ||
| 64 | const d = codeunit(str[3]); | ||
| 65 | const d_mask = toMask(set[d_loc]); | ||
| 66 | if (d_mask.isIn(d)) return true else return false; | ||
| 67 | }, | ||
| 68 | } | ||
| 69 | } | ||
| 70 | }; | ||
| 71 | |||
| 72 | /// Kinds of most significant bits in UTF-8 | ||
| 73 | const RuneKind = enum(u2) { | ||
| 74 | low, | ||
| 75 | hi, | ||
| 76 | follow, | ||
| 77 | lead, | ||
| 78 | }; | ||
| 79 | |||
| 80 | /// Packed `u8` struct representing one codeunit of UTF-8. | ||
| 81 | const CodeUnit = packed struct(u8) { | ||
| 82 | body: u6, | ||
| 83 | kind: RuneKind, | ||
| 84 | |||
| 85 | /// Mask to check presence | ||
| 86 | pub inline fn inMask(self: *const CodeUnit) u64 { | ||
| 87 | return @as(u64, 1) << self.body; | ||
| 88 | } | ||
| 89 | |||
| 90 | // TODO consider an nMultiBytesFast, for the cases where we | ||
| 91 | // know that invalid lead bytes are never present (such as in set) | ||
| 92 | // operations, where we may assume that (and will assert that) the | ||
| 93 | // LEAD mask contains no such bytes. | ||
| 94 | |||
| 95 | /// Number of bytes in known multi-byte rune. | ||
| 96 | /// | ||
| 97 | /// Caller guarantees that the CodeUnit is a lead byte | ||
| 98 | /// of a multi-byte rune: `cu.kind == .lead`. | ||
| 99 | /// | ||
| 100 | /// Invalid lead bytes will return null. | ||
| 101 | pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 { | ||
| 102 | return switch (self.body) { | ||
| 103 | // 0 and 1 are invalid for overlong reasons, | ||
| 104 | // but RuneSet supports overlong encodings | ||
| 105 | 0...31 => 2, | ||
| 106 | 32...47 => 3, | ||
| 107 | 48...55 => 4, | ||
| 108 | // Wasted space 56...61 is due entirely to Microsoft's | ||
| 109 | // lack of vision and insistence on a substandard | ||
| 110 | // and utterly inadequate encoding for Unicode | ||
| 111 | // "64k should be enough for anyone" <spits> | ||
| 112 | 56...63 => null, | ||
| 113 | }; | ||
| 114 | } | ||
| 115 | |||
| 116 | /// Given a valid lead byte, return the number of bytes that should | ||
| 117 | /// make up the code unit sequence. Will return `null` if the lead | ||
| 118 | /// byte is invalid. | ||
| 119 | pub inline fn nBytes(self: *const CodeUnit) ?u8 { | ||
| 120 | switch (self.kind) { | ||
| 121 | .low, .hi => return 1, | ||
| 122 | .lead => return self.nMultiBytes(), | ||
| 123 | .follow => return null, | ||
| 124 | } | ||
| 125 | } | ||
| 126 | |||
| 127 | /// Mask off all bits >= cu.body | ||
| 128 | pub inline fn hiMask(self: *const CodeUnit) u64 { | ||
| 129 | return (@as(u64, 1) << self.body) - 1; | ||
| 130 | } | ||
| 131 | |||
| 132 | /// Mask off all bits <= cu.body | ||
| 133 | pub inline fn lowMask(self: *const CodeUnit) u64 { | ||
| 134 | if (self.body == 63) | ||
| 135 | return 0 | ||
| 136 | else | ||
| 137 | return ~((@as(u64, 1) << (self.body + 1)) - 1); | ||
| 138 | } | ||
| 139 | |||
| 140 | /// Cast the `CodeUnit` to its backing `u8`. | ||
| 141 | pub inline fn byte(self: *const CodeUnit) u8 { | ||
| 142 | return @bitCast(self.*); | ||
| 143 | } | ||
| 144 | }; | ||
| 145 | |||
| 146 | /// Cast raw byte to CodeUnit | ||
| 147 | inline fn codeunit(b: u8) CodeUnit { | ||
| 148 | return @bitCast(b); | ||
| 149 | } | ||
| 150 | |||
| 151 | inline fn toMask(w: u64) Mask { | ||
| 152 | return Mask.toMask(w); | ||
| 153 | } | ||
| 154 | |||
| 155 | /// Bitmask for runesets | ||
| 156 | /// | ||
| 157 | /// We define our own bitset, because the operations we need to | ||
| 158 | /// perform only overlap with IntegerBitSet for trivial one-liners, | ||
| 159 | /// and furthermore, we need nondestructive versions of the basic | ||
| 160 | /// operations, which aren't a part of the IntegerBitSet interface. | ||
| 161 | /// | ||
| 162 | /// Note that Masks do not track which kind of byte they apply to, | ||
| 163 | /// since they will be stored as ordinary u64s. User code must | ||
| 164 | /// ensure that CodeUnits tested against a Mask are of the appropriate | ||
| 165 | /// type, and otherwise valid for the test performed. | ||
| 166 | /// | ||
| 167 | const Mask = struct { | ||
| 168 | m: u64, | ||
| 169 | |||
| 170 | pub fn toMask(w: u64) Mask { | ||
| 171 | return Mask{ .m = w }; | ||
| 172 | } | ||
| 173 | |||
| 174 | /// Test if a CodeUnit's low bytes are present in mask | ||
| 175 | pub inline fn isIn(self: Mask, cu: CodeUnit) bool { | ||
| 176 | return self.m | cu.inMask() == self.m; | ||
| 177 | } | ||
| 178 | |||
| 179 | /// Return number of bytes lower than cu.body in mask, | ||
| 180 | /// if cu inhabits the mask. Otherwise return null. | ||
| 181 | pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 { | ||
| 182 | if (self.isIn(cu)) { | ||
| 183 | const m = cu.hiMask(); | ||
| 184 | return @popCount(self.m & m); | ||
| 185 | } else { | ||
| 186 | return null; | ||
| 187 | } | ||
| 188 | } | ||
| 189 | |||
| 190 | /// Return number of bytes higher than cu.body in mask, | ||
| 191 | /// if cu inhabits the mask. Otherwise return null. | ||
| 192 | pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 { | ||
| 193 | if (self.isIn(cu)) { | ||
| 194 | const m = cu.lowMask(); | ||
| 195 | return @popCount(self.m & m); | ||
| 196 | } else { | ||
| 197 | return null; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | }; | ||
| 201 | |||
| 202 | /// Sum of @popCount of all words in region. | ||
| 203 | inline fn popCountSlice(region: []const u64) usize { | ||
| 204 | var ct: usize = 0; | ||
| 205 | for (region) |w| ct += @popCount(w); | ||
| 206 | return ct; | ||
| 207 | } | ||
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ee259a3..7ce2b4e 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -1,31 +1,5 @@ | |||
| 1 | const dbg_print = false; | 1 | const dbg_print = false; |
| 2 | 2 | ||
| 3 | comptime { | ||
| 4 | testing.refAllDecls(grapheme); | ||
| 5 | } | ||
| 6 | |||
| 7 | test "Iterator.peek" { | ||
| 8 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 9 | const data = try Graphemes.init(std.testing.allocator); | ||
| 10 | defer data.deinit(std.testing.allocator); | ||
| 11 | |||
| 12 | var iter = data.iterator(peek_seq); | ||
| 13 | const peek_a = iter.peek().?; | ||
| 14 | const next_a = iter.next().?; | ||
| 15 | try std.testing.expectEqual(peek_a, next_a); | ||
| 16 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 17 | const peek_d1 = iter.peek().?; | ||
| 18 | const peek_d2 = iter.peek().?; | ||
| 19 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 20 | const next_d = iter.next().?; | ||
| 21 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 22 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 23 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 24 | try std.testing.expectEqual(null, iter.peek()); | ||
| 25 | try std.testing.expectEqual(null, iter.peek()); | ||
| 26 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 27 | } | ||
| 28 | |||
| 29 | test "Unicode normalization tests" { | 3 | test "Unicode normalization tests" { |
| 30 | var arena = heap.ArenaAllocator.init(testing.allocator); | 4 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| 31 | defer arena.deinit(); | 5 | defer arena.deinit(); |
| @@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" { | |||
| 147 | var buf_reader = std.io.bufferedReader(file.reader()); | 121 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 148 | var input_stream = buf_reader.reader(); | 122 | var input_stream = buf_reader.reader(); |
| 149 | 123 | ||
| 150 | const data = try Graphemes.init(allocator); | 124 | const graph = try Graphemes.init(allocator); |
| 151 | defer data.deinit(allocator); | 125 | defer graph.deinit(allocator); |
| 152 | 126 | ||
| 153 | var buf: [4096]u8 = undefined; | 127 | var buf: [4096]u8 = undefined; |
| 154 | var line_iter: IterRead = .{ .read = &input_stream }; | 128 | var line_iter: IterRead = .{ .read = &input_stream }; |
| 155 | 129 | ||
| 156 | while (try line_iter.next(&buf)) |raw| { | 130 | while (try line_iter.next(&buf)) |raw| { |
| 157 | // Skip comments or empty lines. | ||
| 158 | // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 159 | // Clean up. | 131 | // Clean up. |
| 160 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 132 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 161 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | 133 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| @@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" { | |||
| 190 | bytes_index += cp_index; | 162 | bytes_index += cp_index; |
| 191 | } | 163 | } |
| 192 | 164 | ||
| 193 | var iter = data.iterator(all_bytes.items); | 165 | var iter = graph.iterator(all_bytes.items); |
| 194 | 166 | ||
| 195 | // Check. | 167 | // Check. |
| 196 | for (want.items) |want_gc| { | 168 | for (want.items) |want_gc| { |
| @@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" { | |||
| 203 | } | 175 | } |
| 204 | } | 176 | } |
| 205 | 177 | ||
| 178 | test "Segmentation Word Iterator" { | ||
| 179 | const allocator = std.testing.allocator; | ||
| 180 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); | ||
| 181 | defer file.close(); | ||
| 182 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 183 | var input_stream = buf_reader.reader(); | ||
| 184 | |||
| 185 | const wb = try WordBreak.init(allocator); | ||
| 186 | defer wb.deinit(allocator); | ||
| 187 | |||
| 188 | var buf: [4096]u8 = undefined; | ||
| 189 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 190 | |||
| 191 | while (try line_iter.next(&buf)) |raw| { | ||
| 192 | // Clean up. | ||
| 193 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 194 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | ||
| 195 | line = line[0..final]; | ||
| 196 | } | ||
| 197 | // Iterate over fields. | ||
| 198 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 199 | defer want.deinit(); | ||
| 200 | |||
| 201 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 202 | defer all_bytes.deinit(); | ||
| 203 | |||
| 204 | var words = std.mem.splitSequence(u8, line, " ÷ "); | ||
| 205 | var bytes_index: u32 = 0; | ||
| 206 | |||
| 207 | while (words.next()) |field| { | ||
| 208 | var code_points = std.mem.splitScalar(u8, field, ' '); | ||
| 209 | var cp_buf: [4]u8 = undefined; | ||
| 210 | var cp_index: u32 = 0; | ||
| 211 | var gc_len: u8 = 0; | ||
| 212 | |||
| 213 | while (code_points.next()) |code_point| { | ||
| 214 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 215 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 216 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 217 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 218 | cp_index += len; | ||
| 219 | gc_len += len; | ||
| 220 | } | ||
| 221 | |||
| 222 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 223 | bytes_index += cp_index; | ||
| 224 | } | ||
| 225 | |||
| 226 | var iter = wb.iterator(all_bytes.items); | ||
| 227 | |||
| 228 | // Check. | ||
| 229 | for (want.items, 1..) |want_word, i| { | ||
| 230 | const got_word = (iter.next()).?; | ||
| 231 | std.testing.expectEqualSlices( | ||
| 232 | u8, | ||
| 233 | want_word.bytes(all_bytes.items), | ||
| 234 | got_word.bytes(all_bytes.items), | ||
| 235 | ) catch |err| { | ||
| 236 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); | ||
| 237 | return err; | ||
| 238 | }; | ||
| 239 | } | ||
| 240 | } | ||
| 241 | } | ||
| 242 | |||
| 206 | const IterRead = struct { | 243 | const IterRead = struct { |
| 207 | read: *Reader, | 244 | read: *Reader, |
| 208 | line: usize = 0, | 245 | line: usize = 0, |
| @@ -235,8 +272,9 @@ const debug = std.debug; | |||
| 235 | const testing = std.testing; | 272 | const testing = std.testing; |
| 236 | const unicode = std.unicode; | 273 | const unicode = std.unicode; |
| 237 | 274 | ||
| 238 | const grapheme = @import("Graphemes"); | ||
| 239 | const Grapheme = @import("Graphemes").Grapheme; | 275 | const Grapheme = @import("Graphemes").Grapheme; |
| 240 | const Graphemes = @import("Graphemes"); | 276 | const Graphemes = @import("Graphemes"); |
| 241 | const GraphemeIterator = @import("Graphemes").Iterator; | 277 | const GraphemeIterator = @import("Graphemes").Iterator; |
| 242 | const Normalize = @import("Normalize"); | 278 | const Normalize = @import("Normalize"); |
| 279 | |||
| 280 | const WordBreak = @import("WordBreak"); | ||