From 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 17:19:56 -0400 Subject: Hooked up break test, some bugs squashed The handling of ignorables is really different, because they 'adhere' to the future of the iteration, not the past. --- src/WordBreak.zig | 39 ++++++++++++++++++++++++++++++--------- src/code_point.zig | 10 ---------- src/unicode_tests.zig | 49 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 64 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 37c0df9..0cab30e 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -98,11 +98,16 @@ pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usiz return this_word.?; } -/// Returns an iterator over words in `slice` +/// Returns an iterator over words in `slice`. pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { return Iterator.init(wordbreak, slice); } +/// Returns a reverse iterator over the words in `slice`. +pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { + return ReverseIterator.init(wordbreak, slice); +} + pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, @@ -111,7 +116,7 @@ pub const Iterator = struct { /// Assumes `str` is valid UTF-8. pub fn init(wb: *const WordBreak, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; } @@ -267,8 +272,8 @@ pub const ReverseIterator = struct { wb: *const WordBreak, /// Assumes `str` is valid UTF-8. 
- pub fn init(wb: *const WordBreak, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; wb_iter.advance(); return wb_iter; } @@ -299,12 +304,19 @@ pub const ReverseIterator = struct { var last_last_p: WordBreakProperty = .none; var ri_count: usize = 0; + // TODO: Ignorables have to be handled completely differently, unfortunately. + // We have to find whatever is before it, match against that, and use that + // decision to handle the break we're currently working on. + // -- + // This is achievable I think. Just need to use peekPast to get that, and then + // take it from there. Probably as long as an ignorable is an after_p we just keep + // going. scan: while (true) : (iter.advance()) { const after = iter.after.?; word_len += after.len; if (iter.before) |before| { const after_p = iter.wb.breakProp(after); - const before_p = iter.wb.breakProp(before); + var before_p = iter.wb.breakProp(before); if (!isIgnorable(after_p)) { last_last_p = last_p; last_p = after_p; @@ -322,9 +334,18 @@ pub const ReverseIterator = struct { // WB3d WSegSpace × WSegSpace if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; // WB4 X (Extend | Format | ZWJ)* → X - if (isIgnorable(after_p)) { - continue :scan; - } // Now we use last_p instead of after_p for ignorable's sake + if (isIgnorable(before_p)) { + const maybe_before = iter.peekPast(); + if (maybe_before) |valid_before| { + before_p = iter.wb.breakProp(valid_before); + } else if (isIgnorable(after_p)) { + continue :scan; + // We're done + } else { + break :scan; + } + } + if (isIgnorable(after_p)) continue :scan; // WB5 AHLetter × AHLetter if (isAHLetter(last_p) and isAHLetter(before_p)) { continue :scan; @@ -334,7 +355,7 @@ pub const ReverseIterator = struct { continue :scan; } // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter - if 
(isMidVal(before_p)) { + if (isMidVal(before_p) and isAHLetter(last_p)) { const prev_val = iter.peekPast(); if (prev_val) |prev_cp| { const prev_p = iter.wb.breakProp(prev_cp); diff --git a/src/code_point.zig b/src/code_point.zig index a5b10d4..ba0b434 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -53,22 +53,12 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { }; // Multibyte -<<<<<<< HEAD // Second: var class: u4 = @intCast(u8dfa[byte]); var st: u32 = state_dfa[class]; if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // First one is never a truncation -||||||| parent of ad4b046 (Various small iterator improvements) - // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte - if (cp.len > bytes.len) { - // Unicode replacement code point. -======= - // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. - if (cp.len > bytes.len) { - // Unicode replacement code point. ->>>>>>> ad4b046 (Various small iterator improvements) return .{ .code = 0xfffd, .len = 1, diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 59f0c6f..8661bfd 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -195,7 +195,7 @@ test "Segmentation Word Iterator" { line = line[0..final]; } // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); + var want = std.ArrayList(Word).init(allocator); defer want.deinit(); var all_bytes = std.ArrayList(u8).init(allocator); @@ -219,22 +219,40 @@ test "Segmentation Word Iterator" { gc_len += len; } - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + try want.append(Word{ .len = gc_len, .offset = bytes_index }); bytes_index += cp_index; } - - var iter = wb.iterator(all_bytes.items); - - // Check. - for (want.items, 1..) 
|want_word, i| { - const got_word = (iter.next()).?; - std.testing.expectEqualStrings( - want_word.bytes(all_bytes.items), - got_word.bytes(all_bytes.items), - ) catch |err| { - debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); - return err; - }; + { + var iter = wb.iterator(all_bytes.items); + + // Check. + for (want.items, 1..) |want_word, i| { + const got_word = (iter.next()).?; + std.testing.expectEqualStrings( + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); + return err; + }; + } + } + { + var r_iter = wb.reverseIterator(all_bytes.items); + var idx = want.items.len - 1; + while (true) : (idx -= 1) { + const want_word = want.items[idx]; + const got_word = r_iter.prev().?; + std.testing.expectEqualSlices( + u8, + want_word.bytes(all_bytes.items), + got_word.bytes(all_bytes.items), + ) catch |err| { + debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); + return err; + }; + if (idx == 0) break; + } } } } @@ -277,3 +295,4 @@ const GraphemeIterator = @import("Graphemes").Iterator; const Normalize = @import("Normalize"); const WordBreak = @import("WordBreak"); +const Word = WordBreak.Word; -- cgit v1.2.3