diff options
| author | 2025-05-14 10:05:49 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:32:43 -0400 | |
| commit | 3461a7750344fe7301cacf04f2f5e154ef70e966 (patch) | |
| tree | e2788807685c99b2579ea94627e6f2d8ff247b39 /src/WordBreak.zig | |
| parent | Hooked up break test, some bugs squashed (diff) | |
| download | zg-3461a7750344fe7301cacf04f2f5e154ef70e966.tar.gz zg-3461a7750344fe7301cacf04f2f5e154ef70e966.tar.xz zg-3461a7750344fe7301cacf04f2f5e154ef70e966.zip | |
ReverseWordIterator passes conformance test
Ended up needing a clone of the codepoint iterator, so that WB4 can
ignore points in a matter compatible with backward search.
So I created a special SneakIterator which can return WBPs directly,
so as to skip ignorables. This is also needed for flag emoji, since
the odd-number case must be handle immediately. So we count back in
a WB4 compatible way, then store the count on the word iterator, and
voila.
Diffstat (limited to 'src/WordBreak.zig')
| -rw-r--r-- | src/WordBreak.zig | 83 |
1 files changed, 64 insertions, 19 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 0cab30e..f1322ff 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -270,6 +270,7 @@ pub const ReverseIterator = struct { | |||
| 270 | before: ?CodePoint = null, | 270 | before: ?CodePoint = null, |
| 271 | cp_iter: ReverseCodepointIterator, | 271 | cp_iter: ReverseCodepointIterator, |
| 272 | wb: *const WordBreak, | 272 | wb: *const WordBreak, |
| 273 | flags: usize = 0, | ||
| 273 | 274 | ||
| 274 | /// Assumes `str` is valid UTF-8. | 275 | /// Assumes `str` is valid UTF-8. |
| 275 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { | 276 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { |
| @@ -302,19 +303,12 @@ pub const ReverseIterator = struct { | |||
| 302 | // State variables. | 303 | // State variables. |
| 303 | var last_p: WordBreakProperty = .none; | 304 | var last_p: WordBreakProperty = .none; |
| 304 | var last_last_p: WordBreakProperty = .none; | 305 | var last_last_p: WordBreakProperty = .none; |
| 305 | var ri_count: usize = 0; | ||
| 306 | 306 | ||
| 307 | // TODO: Ignorables have to be handled completely differently, unfortunately. | ||
| 308 | // We have to find whatever is before it, match against that, and use that | ||
| 309 | // decision to handle the break we're currently working on. | ||
| 310 | // -- | ||
| 311 | // This is achieveable I think. Just need to use peekPast to get that, and then | ||
| 312 | // take it from there. Probably as long as an ignorable is an after_p we just keep | ||
| 313 | // going. | ||
| 314 | scan: while (true) : (iter.advance()) { | 307 | scan: while (true) : (iter.advance()) { |
| 315 | const after = iter.after.?; | 308 | const after = iter.after.?; |
| 316 | word_len += after.len; | 309 | word_len += after.len; |
| 317 | if (iter.before) |before| { | 310 | if (iter.before) |before| { |
| 311 | var sneak = sneaky(iter); // 'sneaks' past ignorables | ||
| 318 | const after_p = iter.wb.breakProp(after); | 312 | const after_p = iter.wb.breakProp(after); |
| 319 | var before_p = iter.wb.breakProp(before); | 313 | var before_p = iter.wb.breakProp(before); |
| 320 | if (!isIgnorable(after_p)) { | 314 | if (!isIgnorable(after_p)) { |
| @@ -335,13 +329,11 @@ pub const ReverseIterator = struct { | |||
| 335 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; | 329 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; |
| 336 | // WB4 X (Extend | Format | ZWJ)* → X | 330 | // WB4 X (Extend | Format | ZWJ)* → X |
| 337 | if (isIgnorable(before_p)) { | 331 | if (isIgnorable(before_p)) { |
| 338 | const maybe_before = iter.peekPast(); | 332 | const maybe_before = sneak.prev(); |
| 339 | if (maybe_before) |valid_before| { | 333 | if (maybe_before) |valid_before| { |
| 340 | before_p = iter.wb.breakProp(valid_before); | 334 | before_p = iter.wb.breakProp(valid_before); |
| 341 | } else if (isIgnorable(after_p)) { | 335 | } else if (!isIgnorable(after_p)) { |
| 342 | continue :scan; | ||
| 343 | // We're done | 336 | // We're done |
| 344 | } else { | ||
| 345 | break :scan; | 337 | break :scan; |
| 346 | } | 338 | } |
| 347 | } | 339 | } |
| @@ -356,7 +348,7 @@ pub const ReverseIterator = struct { | |||
| 356 | } | 348 | } |
| 357 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | 349 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| 358 | if (isMidVal(before_p) and isAHLetter(last_p)) { | 350 | if (isMidVal(before_p) and isAHLetter(last_p)) { |
| 359 | const prev_val = iter.peekPast(); | 351 | const prev_val = sneak.peek(); |
| 360 | if (prev_val) |prev_cp| { | 352 | if (prev_val) |prev_cp| { |
| 361 | const prev_p = iter.wb.breakProp(prev_cp); | 353 | const prev_p = iter.wb.breakProp(prev_cp); |
| 362 | if (isAHLetter(prev_p)) { | 354 | if (isAHLetter(prev_p)) { |
| @@ -372,7 +364,7 @@ pub const ReverseIterator = struct { | |||
| 372 | } | 364 | } |
| 373 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | 365 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter |
| 374 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { | 366 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { |
| 375 | const prev_val = iter.peekPast(); | 367 | const prev_val = sneak.peek(); |
| 376 | if (prev_val) |prev_cp| { | 368 | if (prev_val) |prev_cp| { |
| 377 | const prev_p = iter.wb.breakProp(prev_cp); | 369 | const prev_p = iter.wb.breakProp(prev_cp); |
| 378 | if (prev_p == .Hebrew_Letter) { | 370 | if (prev_p == .Hebrew_Letter) { |
| @@ -388,7 +380,7 @@ pub const ReverseIterator = struct { | |||
| 388 | if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; | 380 | if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; |
| 389 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | 381 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric |
| 390 | if (isMidNum(before_p) and last_p == .Numeric) { | 382 | if (isMidNum(before_p) and last_p == .Numeric) { |
| 391 | const prev_val = iter.peekPast(); | 383 | const prev_val = sneak.peek(); |
| 392 | if (prev_val) |prev_cp| { | 384 | if (prev_val) |prev_cp| { |
| 393 | const prev_p = iter.wb.breakProp(prev_cp); | 385 | const prev_p = iter.wb.breakProp(prev_cp); |
| 394 | if (prev_p == .Numeric) { | 386 | if (prev_p == .Numeric) { |
| @@ -407,10 +399,23 @@ pub const ReverseIterator = struct { | |||
| 407 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | 399 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) |
| 408 | if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; | 400 | if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; |
| 409 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | 401 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI |
| 410 | const maybe_flag = before_p == .Regional_Indicator and last_p == .Regional_Indicator; | 402 | // NOTE: |
| 411 | if (maybe_flag) { | 403 | // So here we simply have to know whether a run of flags is even or odd. |
| 412 | ri_count += 1; | 404 | // The whole run. To avoid quadratic behavior (and long flag runs are |
| 413 | if (ri_count % 2 == 1) continue :scan; | 405 | // actually a thing in the wild), we have to count them once, store that |
| 406 | // on the iterator, and decrement each time we see two, possibly breaking | ||
| 407 | // once extra at the beginning. They break up one per flag, once we hit | ||
| 408 | // zero, that's all the flags. If we see another flag we do it again. | ||
| 409 | if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { | ||
| 410 | defer { | ||
| 411 | if (iter.flags > 0) iter.flags -= 1; | ||
| 412 | } | ||
| 413 | if (iter.flags == 0) { | ||
| 414 | iter.flags = sneak.countFlags(); | ||
| 415 | } | ||
| 416 | if (iter.flags % 2 == 0) { | ||
| 417 | continue :scan; | ||
| 418 | } | ||
| 414 | } | 419 | } |
| 415 | // WB999 Any ÷ Any | 420 | // WB999 Any ÷ Any |
| 416 | break :scan; | 421 | break :scan; |
| @@ -436,6 +441,46 @@ pub const ReverseIterator = struct { | |||
| 436 | } | 441 | } |
| 437 | }; | 442 | }; |
| 438 | 443 | ||
| 444 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | ||
| 445 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | ||
| 446 | } | ||
| 447 | |||
| 448 | const SneakIterator = struct { | ||
| 449 | cp_iter: ReverseCodepointIterator, | ||
| 450 | wb: *const WordBreak, | ||
| 451 | |||
| 452 | fn peek(iter: *SneakIterator) ?CodePoint { | ||
| 453 | const save_cp = iter.cp_iter; | ||
| 454 | defer iter.cp_iter = save_cp; | ||
| 455 | while (iter.cp_iter.peek()) |peeked| { | ||
| 456 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 457 | _ = iter.cp_iter.prev(); | ||
| 458 | } | ||
| 459 | return null; | ||
| 460 | } | ||
| 461 | |||
| 462 | fn countFlags(iter: *SneakIterator) usize { | ||
| 463 | var flags: usize = 0; | ||
| 464 | const save_cp = iter.cp_iter; | ||
| 465 | defer iter.cp_iter = save_cp; | ||
| 466 | while (iter.cp_iter.prev()) |cp| { | ||
| 467 | const prop = iter.wb.breakProp(cp); | ||
| 468 | if (isIgnorable(prop)) continue; | ||
| 469 | if (prop == .Regional_Indicator) { | ||
| 470 | flags += 1; | ||
| 471 | } else break; | ||
| 472 | } | ||
| 473 | return flags; | ||
| 474 | } | ||
| 475 | |||
| 476 | fn prev(iter: *SneakIterator) ?CodePoint { | ||
| 477 | while (iter.cp_iter.prev()) |peeked| { | ||
| 478 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 479 | } | ||
| 480 | return null; | ||
| 481 | } | ||
| 482 | }; | ||
| 483 | |||
| 439 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | 484 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { |
| 440 | const decompressor = compress.flate.inflate.decompressor; | 485 | const decompressor = compress.flate.inflate.decompressor; |
| 441 | const in_bytes = @embedFile("wbp"); | 486 | const in_bytes = @embedFile("wbp"); |