summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-14 10:05:49 -0400
committerGravatar Sam Atman2025-05-15 15:32:43 -0400
commit3461a7750344fe7301cacf04f2f5e154ef70e966 (patch)
treee2788807685c99b2579ea94627e6f2d8ff247b39 /src
parentHooked up break test, some bugs squashed (diff)
downloadzg-3461a7750344fe7301cacf04f2f5e154ef70e966.tar.gz
zg-3461a7750344fe7301cacf04f2f5e154ef70e966.tar.xz
zg-3461a7750344fe7301cacf04f2f5e154ef70e966.zip
ReverseWordIterator passes conformance test
Ended up needing a clone of the codepoint iterator, so that WB4 can ignore points in a matter compatible with backward search. So I created a special SneakIterator which can return WBPs directly, so as to skip ignorables. This is also needed for flag emoji, since the odd-number case must be handle immediately. So we count back in a WB4 compatible way, then store the count on the word iterator, and voila.
Diffstat (limited to 'src')
-rw-r--r--src/WordBreak.zig83
1 files changed, 64 insertions, 19 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 0cab30e..f1322ff 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -270,6 +270,7 @@ pub const ReverseIterator = struct {
270 before: ?CodePoint = null, 270 before: ?CodePoint = null,
271 cp_iter: ReverseCodepointIterator, 271 cp_iter: ReverseCodepointIterator,
272 wb: *const WordBreak, 272 wb: *const WordBreak,
273 flags: usize = 0,
273 274
274 /// Assumes `str` is valid UTF-8. 275 /// Assumes `str` is valid UTF-8.
275 pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { 276 pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
@@ -302,19 +303,12 @@ pub const ReverseIterator = struct {
302 // State variables. 303 // State variables.
303 var last_p: WordBreakProperty = .none; 304 var last_p: WordBreakProperty = .none;
304 var last_last_p: WordBreakProperty = .none; 305 var last_last_p: WordBreakProperty = .none;
305 var ri_count: usize = 0;
306 306
307 // TODO: Ignorables have to be handled completely differently, unfortunately.
308 // We have to find whatever is before it, match against that, and use that
309 // decision to handle the break we're currently working on.
310 // --
311 // This is achieveable I think. Just need to use peekPast to get that, and then
312 // take it from there. Probably as long as an ignorable is an after_p we just keep
313 // going.
314 scan: while (true) : (iter.advance()) { 307 scan: while (true) : (iter.advance()) {
315 const after = iter.after.?; 308 const after = iter.after.?;
316 word_len += after.len; 309 word_len += after.len;
317 if (iter.before) |before| { 310 if (iter.before) |before| {
311 var sneak = sneaky(iter); // 'sneaks' past ignorables
318 const after_p = iter.wb.breakProp(after); 312 const after_p = iter.wb.breakProp(after);
319 var before_p = iter.wb.breakProp(before); 313 var before_p = iter.wb.breakProp(before);
320 if (!isIgnorable(after_p)) { 314 if (!isIgnorable(after_p)) {
@@ -335,13 +329,11 @@ pub const ReverseIterator = struct {
335 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; 329 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
336 // WB4 X (Extend | Format | ZWJ)* → X 330 // WB4 X (Extend | Format | ZWJ)* → X
337 if (isIgnorable(before_p)) { 331 if (isIgnorable(before_p)) {
338 const maybe_before = iter.peekPast(); 332 const maybe_before = sneak.prev();
339 if (maybe_before) |valid_before| { 333 if (maybe_before) |valid_before| {
340 before_p = iter.wb.breakProp(valid_before); 334 before_p = iter.wb.breakProp(valid_before);
341 } else if (isIgnorable(after_p)) { 335 } else if (!isIgnorable(after_p)) {
342 continue :scan;
343 // We're done 336 // We're done
344 } else {
345 break :scan; 337 break :scan;
346 } 338 }
347 } 339 }
@@ -356,7 +348,7 @@ pub const ReverseIterator = struct {
356 } 348 }
357 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter 349 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
358 if (isMidVal(before_p) and isAHLetter(last_p)) { 350 if (isMidVal(before_p) and isAHLetter(last_p)) {
359 const prev_val = iter.peekPast(); 351 const prev_val = sneak.peek();
360 if (prev_val) |prev_cp| { 352 if (prev_val) |prev_cp| {
361 const prev_p = iter.wb.breakProp(prev_cp); 353 const prev_p = iter.wb.breakProp(prev_cp);
362 if (isAHLetter(prev_p)) { 354 if (isAHLetter(prev_p)) {
@@ -372,7 +364,7 @@ pub const ReverseIterator = struct {
372 } 364 }
373 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter 365 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
374 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { 366 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
375 const prev_val = iter.peekPast(); 367 const prev_val = sneak.peek();
376 if (prev_val) |prev_cp| { 368 if (prev_val) |prev_cp| {
377 const prev_p = iter.wb.breakProp(prev_cp); 369 const prev_p = iter.wb.breakProp(prev_cp);
378 if (prev_p == .Hebrew_Letter) { 370 if (prev_p == .Hebrew_Letter) {
@@ -388,7 +380,7 @@ pub const ReverseIterator = struct {
388 if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; 380 if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
389 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric 381 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
390 if (isMidNum(before_p) and last_p == .Numeric) { 382 if (isMidNum(before_p) and last_p == .Numeric) {
391 const prev_val = iter.peekPast(); 383 const prev_val = sneak.peek();
392 if (prev_val) |prev_cp| { 384 if (prev_val) |prev_cp| {
393 const prev_p = iter.wb.breakProp(prev_cp); 385 const prev_p = iter.wb.breakProp(prev_cp);
394 if (prev_p == .Numeric) { 386 if (prev_p == .Numeric) {
@@ -407,10 +399,23 @@ pub const ReverseIterator = struct {
407 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) 399 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
408 if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; 400 if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
409 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI 401 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
410 const maybe_flag = before_p == .Regional_Indicator and last_p == .Regional_Indicator; 402 // NOTE:
411 if (maybe_flag) { 403 // So here we simply have to know whether a run of flags is even or odd.
412 ri_count += 1; 404 // The whole run. To avoid quadratic behavior (and long flag runs are
413 if (ri_count % 2 == 1) continue :scan; 405 // actually a thing in the wild), we have to count them once, store that
406 // on the iterator, and decrement each time we see two, possibly breaking
407 // once extra at the beginning. They break up one per flag, once we hit
408 // zero, that's all the flags. If we see another flag we do it again.
409 if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
410 defer {
411 if (iter.flags > 0) iter.flags -= 1;
412 }
413 if (iter.flags == 0) {
414 iter.flags = sneak.countFlags();
415 }
416 if (iter.flags % 2 == 0) {
417 continue :scan;
418 }
414 } 419 }
415 // WB999 Any ÷ Any 420 // WB999 Any ÷ Any
416 break :scan; 421 break :scan;
@@ -436,6 +441,46 @@ pub const ReverseIterator = struct {
436 } 441 }
437}; 442};
438 443
444fn sneaky(iter: *const ReverseIterator) SneakIterator {
445 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
446}
447
448const SneakIterator = struct {
449 cp_iter: ReverseCodepointIterator,
450 wb: *const WordBreak,
451
452 fn peek(iter: *SneakIterator) ?CodePoint {
453 const save_cp = iter.cp_iter;
454 defer iter.cp_iter = save_cp;
455 while (iter.cp_iter.peek()) |peeked| {
456 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
457 _ = iter.cp_iter.prev();
458 }
459 return null;
460 }
461
462 fn countFlags(iter: *SneakIterator) usize {
463 var flags: usize = 0;
464 const save_cp = iter.cp_iter;
465 defer iter.cp_iter = save_cp;
466 while (iter.cp_iter.prev()) |cp| {
467 const prop = iter.wb.breakProp(cp);
468 if (isIgnorable(prop)) continue;
469 if (prop == .Regional_Indicator) {
470 flags += 1;
471 } else break;
472 }
473 return flags;
474 }
475
476 fn prev(iter: *SneakIterator) ?CodePoint {
477 while (iter.cp_iter.prev()) |peeked| {
478 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
479 }
480 return null;
481 }
482};
483
439inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { 484inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
440 const decompressor = compress.flate.inflate.decompressor; 485 const decompressor = compress.flate.inflate.decompressor;
441 const in_bytes = @embedFile("wbp"); 486 const in_bytes = @embedFile("wbp");