diff options
| author | 2025-05-12 18:10:02 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:16 -0400 | |
| commit | a7f6990a8d433c6c8d34892a2126e94cdb31541f (patch) | |
| tree | 427d465cbce19c816a375b34f5bb04ce4599a8d6 /src | |
| parent | Begin conformance test (diff) | |
| download | zg-a7f6990a8d433c6c8d34892a2126e94cdb31541f.tar.gz zg-a7f6990a8d433c6c8d34892a2126e94cdb31541f.tar.xz zg-a7f6990a8d433c6c8d34892a2126e94cdb31541f.zip | |
Rewrite, passes WordBreakTest
This follows a fix for a bug in Runicode, which was fenceposting codepoints off
the end of ranges. As one does.
Diffstat (limited to 'src')
| -rw-r--r-- | src/WordBreak.zig | 111 | ||||
| -rw-r--r-- | src/micro_runeset.zig | 4 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 3 |
3 files changed, 40 insertions, 78 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 53db76b..a2be011 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -132,28 +132,21 @@ pub const Iterator = struct { | |||
| 132 | const word_start = iter.this.?.offset; | 132 | const word_start = iter.this.?.offset; |
| 133 | var word_len: u32 = 0; | 133 | var word_len: u32 = 0; |
| 134 | 134 | ||
| 135 | var state: IterState = .initial; | 135 | // state variables |
| 136 | var last_p: WordBreakProperty = .none; | ||
| 137 | var last_last_p: WordBreakProperty = .none; | ||
| 138 | var ri_count: usize = 0; | ||
| 136 | 139 | ||
| 137 | scan: while (true) : (iter.advance()) { | 140 | scan: while (true) : (iter.advance()) { |
| 138 | const this = iter.this.?; | 141 | const this = iter.this.?; |
| 139 | word_len += this.len; | 142 | word_len += this.len; |
| 140 | var ignored = false; | ||
| 141 | if (iter.that) |that| { | 143 | if (iter.that) |that| { |
| 144 | const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF | ||
| 142 | const that_p = iter.wb.breakProperty(that.code); | 145 | const that_p = iter.wb.breakProperty(that.code); |
| 143 | const this_p = this_p: { | 146 | if (!isIgnorable(this_p)) { |
| 144 | if (!isIgnorable(that_p) and iter.cache != null) { | 147 | last_last_p = last_p; |
| 145 | // TODO: might not need these what with peekPast | 148 | last_p = this_p; |
| 146 | ignored = true; | 149 | } |
| 147 | defer iter.cache = null; | ||
| 148 | // Fixup some state, apply pre-4 rules | ||
| 149 | const restore = iter.cache.?; | ||
| 150 | if (restore == .WSegSpace) break :this_p .none; | ||
| 151 | break :this_p restore; | ||
| 152 | } else { | ||
| 153 | break :this_p iter.wb.breakProperty(this.code); | ||
| 154 | } | ||
| 155 | }; | ||
| 156 | // WB3 CR × LF | ||
| 157 | if (this_p == .CR and that_p == .LF) continue :scan; | 150 | if (this_p == .CR and that_p == .LF) continue :scan; |
| 158 | // WB3a (Newline | CR | LF) ÷ | 151 | // WB3a (Newline | CR | LF) ÷ |
| 159 | if (isNewline(this_p)) break :scan; | 152 | if (isNewline(this_p)) break :scan; |
| @@ -161,27 +154,15 @@ pub const Iterator = struct { | |||
| 161 | if (isNewline(that_p)) break :scan; | 154 | if (isNewline(that_p)) break :scan; |
| 162 | // WB3c ZWJ × \p{Extended_Pictographic} | 155 | // WB3c ZWJ × \p{Extended_Pictographic} |
| 163 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { | 156 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { |
| 164 | // Invalid after ignoring | 157 | continue :scan; |
| 165 | if (ignored) break :scan else continue :scan; | ||
| 166 | } | 158 | } |
| 167 | // WB3d WSegSpace × WSegSpace | 159 | // WB3d WSegSpace × WSegSpace |
| 168 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | 160 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; |
| 169 | // WB4 X (Extend | Format | ZWJ)* → X | 161 | // WB4 X (Extend | Format | ZWJ)* → X |
| 170 | if (isIgnorable(that_p)) { | 162 | if (isIgnorable(that_p)) { |
| 171 | if (that_p == .ZWJ) { | ||
| 172 | const next_val = iter.peekPast(); | ||
| 173 | if (next_val) |next_cp| { | ||
| 174 | if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { | ||
| 175 | continue :scan; | ||
| 176 | } | ||
| 177 | } | ||
| 178 | } | ||
| 179 | if (iter.cache == null) { | ||
| 180 | iter.cache = this_p; | ||
| 181 | } | ||
| 182 | continue :scan; | 163 | continue :scan; |
| 183 | } | 164 | } // Now we use last_p instead of this_p for ignorable's sake |
| 184 | if (isAHLetter(this_p)) { | 165 | if (isAHLetter(last_p)) { |
| 185 | // WB5 AHLetter × AHLetter | 166 | // WB5 AHLetter × AHLetter |
| 186 | if (isAHLetter(that_p)) continue :scan; | 167 | if (isAHLetter(that_p)) continue :scan; |
| 187 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | 168 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter |
| @@ -190,21 +171,16 @@ pub const Iterator = struct { | |||
| 190 | if (next_val) |next_cp| { | 171 | if (next_val) |next_cp| { |
| 191 | const next_p = iter.wb.breakProperty(next_cp.code); | 172 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 192 | if (isAHLetter(next_p)) { | 173 | if (isAHLetter(next_p)) { |
| 193 | state.mid_punct = true; | ||
| 194 | continue :scan; | 174 | continue :scan; |
| 195 | } | 175 | } |
| 196 | } | 176 | } |
| 197 | } | 177 | } |
| 198 | } | 178 | } |
| 199 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter | 179 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| 200 | if (state.mid_punct) { | 180 | if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { |
| 201 | // Should always be true: | ||
| 202 | assert(isMidVal(this_p)); | ||
| 203 | assert(isAHLetter(that_p)); | ||
| 204 | state.mid_punct = false; | ||
| 205 | continue :scan; | 181 | continue :scan; |
| 206 | } | 182 | } |
| 207 | if (this_p == .Hebrew_Letter) { | 183 | if (last_p == .Hebrew_Letter) { |
| 208 | // WB7a Hebrew_Letter × Single_Quote | 184 | // WB7a Hebrew_Letter × Single_Quote |
| 209 | if (that_p == .Single_Quote) continue :scan; | 185 | if (that_p == .Single_Quote) continue :scan; |
| 210 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | 186 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter |
| @@ -213,62 +189,44 @@ pub const Iterator = struct { | |||
| 213 | if (next_val) |next_cp| { | 189 | if (next_val) |next_cp| { |
| 214 | const next_p = iter.wb.breakProperty(next_cp.code); | 190 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 215 | if (next_p == .Hebrew_Letter) { | 191 | if (next_p == .Hebrew_Letter) { |
| 216 | state.quote_heb = true; | ||
| 217 | continue :scan; | 192 | continue :scan; |
| 218 | } | 193 | } |
| 219 | } else break :scan; | 194 | } |
| 220 | } | 195 | } |
| 221 | } | 196 | } |
| 222 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | 197 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter |
| 223 | if (state.quote_heb) { | 198 | if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) |
| 224 | // Should always be true: | ||
| 225 | assert(this_p == .Double_Quote); | ||
| 226 | assert(that_p == .Hebrew_Letter); | ||
| 227 | state.quote_heb = false; | ||
| 228 | continue :scan; | 199 | continue :scan; |
| 229 | } | ||
| 230 | // WB8 Numeric × Numeric | 200 | // WB8 Numeric × Numeric |
| 231 | if (this_p == .Numeric and that_p == .Numeric) continue :scan; | 201 | if (last_p == .Numeric and that_p == .Numeric) continue :scan; |
| 232 | // WB9 AHLetter × Numeric | 202 | // WB9 AHLetter × Numeric |
| 233 | if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; | 203 | if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; |
| 234 | // WB10 Numeric × AHLetter | 204 | // WB10 Numeric × AHLetter |
| 235 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; | 205 | if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; |
| 206 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 207 | if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) | ||
| 208 | continue :scan; | ||
| 236 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | 209 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric |
| 237 | if (this_p == .Numeric and isMidNum(that_p)) { | 210 | if (last_p == .Numeric and isMidNum(that_p)) { |
| 238 | const next_val = iter.peekPast(); | 211 | const next_val = iter.peekPast(); |
| 239 | if (next_val) |next_cp| { | 212 | if (next_val) |next_cp| { |
| 240 | const next_p = iter.wb.breakProperty(next_cp.code); | 213 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 241 | if (next_p == .Numeric) { | 214 | if (next_p == .Numeric) { |
| 242 | state.mid_num = true; | ||
| 243 | continue :scan; | 215 | continue :scan; |
| 244 | } | 216 | } |
| 245 | } else break :scan; | 217 | } |
| 246 | } | ||
| 247 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 248 | if (state.mid_num) { | ||
| 249 | assert(isMidNum(this_p)); | ||
| 250 | assert(that_p == .Numeric); | ||
| 251 | state.mid_num = false; | ||
| 252 | continue :scan; | ||
| 253 | } | 218 | } |
| 254 | // WB13 Katakana × Katakana | 219 | // WB13 Katakana × Katakana |
| 255 | if (this_p == .Katakana and that_p == .Katakana) continue :scan; | 220 | if (last_p == .Katakana and that_p == .Katakana) continue :scan; |
| 256 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | 221 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet |
| 257 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; | 222 | if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; |
| 258 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | 223 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) |
| 259 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | 224 | if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; |
| 260 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | 225 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI |
| 261 | if (this_p == .Regional_Indicator) { | 226 | const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; |
| 262 | if (that_p == .Regional_Indicator) { | 227 | if (maybe_flag) { |
| 263 | if (state.regional == true or this.offset == 0) { | 228 | ri_count += 1; |
| 264 | state.regional = false; | 229 | if (ri_count % 2 == 1) continue :scan; |
| 265 | continue :scan; | ||
| 266 | } | ||
| 267 | } else { | ||
| 268 | state.regional = true; | ||
| 269 | } | ||
| 270 | } else if (that_p == .Regional_Indicator) { | ||
| 271 | state.regional = true; | ||
| 272 | } | 230 | } |
| 273 | // WB999 Any ÷ Any | 231 | // WB999 Any ÷ Any |
| 274 | break :scan; | 232 | break :scan; |
| @@ -337,6 +295,11 @@ test "Word Break Properties" { | |||
| 337 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | 295 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); |
| 338 | } | 296 | } |
| 339 | 297 | ||
| 298 | test "ext_pic" { | ||
| 299 | try testing.expect(ext_pict.isMatch("👇")); | ||
| 300 | try testing.expect(ext_pict.isMatch("\u{2704}")); | ||
| 301 | } | ||
| 302 | |||
| 340 | fn testAllocations(allocator: Allocator) !void { | 303 | fn testAllocations(allocator: Allocator) !void { |
| 341 | const wb = try WordBreak.init(allocator); | 304 | const wb = try WordBreak.init(allocator); |
| 342 | wb.deinit(allocator); | 305 | wb.deinit(allocator); |
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig index 34fbcd3..80ce4bf 100644 --- a/src/micro_runeset.zig +++ b/src/micro_runeset.zig | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | //! The RuneSet is borrowed from Runicode, which encodes Unicode things | 9 | //! The RuneSet is borrowed from Runicode, which encodes Unicode things |
| 10 | //! in RuneSet form. This will need updating for each version of Unicode. | 10 | //! in RuneSet form. This will need updating for each version of Unicode. |
| 11 | 11 | ||
| 12 | pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; | 12 | pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 
0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } }; |
| 13 | 13 | ||
| 14 | // Meaningful names for the T1 slots | 14 | // Meaningful names for the T1 slots |
| 15 | const LOW = 0; | 15 | const LOW = 0; |
| @@ -27,7 +27,7 @@ pub const RuneSet = struct { | |||
| 27 | const set = runeset.body; | 27 | const set = runeset.body; |
| 28 | const a = codeunit(str[0]); | 28 | const a = codeunit(str[0]); |
| 29 | switch (a.kind) { | 29 | switch (a.kind) { |
| 30 | .follow => return false, | 30 | .follow => unreachable, |
| 31 | .low => { | 31 | .low => { |
| 32 | const mask = toMask(set[LOW]); | 32 | const mask = toMask(set[LOW]); |
| 33 | if (mask.isIn(a)) | 33 | if (mask.isIn(a)) |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7ce2b4e..59f0c6f 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -228,8 +228,7 @@ test "Segmentation Word Iterator" { | |||
| 228 | // Check. | 228 | // Check. |
| 229 | for (want.items, 1..) |want_word, i| { | 229 | for (want.items, 1..) |want_word, i| { |
| 230 | const got_word = (iter.next()).?; | 230 | const got_word = (iter.next()).?; |
| 231 | std.testing.expectEqualSlices( | 231 | std.testing.expectEqualStrings( |
| 232 | u8, | ||
| 233 | want_word.bytes(all_bytes.items), | 232 | want_word.bytes(all_bytes.items), |
| 234 | got_word.bytes(all_bytes.items), | 233 | got_word.bytes(all_bytes.items), |
| 235 | ) catch |err| { | 234 | ) catch |err| { |