diff options
| author | 2025-05-13 17:19:56 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:32:38 -0400 | |
| commit | 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a (patch) | |
| tree | f46287fbc0d92238644c23d0b176354567b647d1 | |
| parent | Reverse Word Iterator (diff) | |
| download | zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.gz zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.xz zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.zip | |
Hooked up break test, some bugs squashed
The handling of ignorables is really different, because they 'adhere'
to the future of the iteration, not the past.
| -rw-r--r-- | src/WordBreak.zig | 39 | ||||
| -rw-r--r-- | src/code_point.zig | 10 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 49 |
3 files changed, 64 insertions, 34 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 37c0df9..0cab30e 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -98,11 +98,16 @@ pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usiz | |||
| 98 | return this_word.?; | 98 | return this_word.?; |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | /// Returns an iterator over words in `slice` | 101 | /// Returns an iterator over words in `slice`. |
| 102 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | 102 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { |
| 103 | return Iterator.init(wordbreak, slice); | 103 | return Iterator.init(wordbreak, slice); |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | /// Returns a reverse iterator over the words in `slice`. | ||
| 107 | pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { | ||
| 108 | return ReverseIterator.init(wordbreak, slice); | ||
| 109 | } | ||
| 110 | |||
| 106 | pub const Iterator = struct { | 111 | pub const Iterator = struct { |
| 107 | this: ?CodePoint = null, | 112 | this: ?CodePoint = null, |
| 108 | that: ?CodePoint = null, | 113 | that: ?CodePoint = null, |
| @@ -111,7 +116,7 @@ pub const Iterator = struct { | |||
| 111 | 116 | ||
| 112 | /// Assumes `str` is valid UTF-8. | 117 | /// Assumes `str` is valid UTF-8. |
| 113 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 118 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { |
| 114 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; | 119 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 115 | wb_iter.advance(); | 120 | wb_iter.advance(); |
| 116 | return wb_iter; | 121 | return wb_iter; |
| 117 | } | 122 | } |
| @@ -267,8 +272,8 @@ pub const ReverseIterator = struct { | |||
| 267 | wb: *const WordBreak, | 272 | wb: *const WordBreak, |
| 268 | 273 | ||
| 269 | /// Assumes `str` is valid UTF-8. | 274 | /// Assumes `str` is valid UTF-8. |
| 270 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 275 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { |
| 271 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; | 276 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 272 | wb_iter.advance(); | 277 | wb_iter.advance(); |
| 273 | return wb_iter; | 278 | return wb_iter; |
| 274 | } | 279 | } |
| @@ -299,12 +304,19 @@ pub const ReverseIterator = struct { | |||
| 299 | var last_last_p: WordBreakProperty = .none; | 304 | var last_last_p: WordBreakProperty = .none; |
| 300 | var ri_count: usize = 0; | 305 | var ri_count: usize = 0; |
| 301 | 306 | ||
| 307 | // TODO: Ignorables have to be handled completely differently, unfortunately. | ||
| 308 | // We have to find whatever is before it, match against that, and use that | ||
| 309 | // decision to handle the break we're currently working on. | ||
| 310 | // -- | ||
| 311 | // This is achievable I think. Just need to use peekPast to get that, and then | ||
| 312 | // take it from there. Probably as long as an ignorable is an after_p we just keep | ||
| 313 | // going. | ||
| 302 | scan: while (true) : (iter.advance()) { | 314 | scan: while (true) : (iter.advance()) { |
| 303 | const after = iter.after.?; | 315 | const after = iter.after.?; |
| 304 | word_len += after.len; | 316 | word_len += after.len; |
| 305 | if (iter.before) |before| { | 317 | if (iter.before) |before| { |
| 306 | const after_p = iter.wb.breakProp(after); | 318 | const after_p = iter.wb.breakProp(after); |
| 307 | const before_p = iter.wb.breakProp(before); | 319 | var before_p = iter.wb.breakProp(before); |
| 308 | if (!isIgnorable(after_p)) { | 320 | if (!isIgnorable(after_p)) { |
| 309 | last_last_p = last_p; | 321 | last_last_p = last_p; |
| 310 | last_p = after_p; | 322 | last_p = after_p; |
| @@ -322,9 +334,18 @@ pub const ReverseIterator = struct { | |||
| 322 | // WB3d WSegSpace × WSegSpace | 334 | // WB3d WSegSpace × WSegSpace |
| 323 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; | 335 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; |
| 324 | // WB4 X (Extend | Format | ZWJ)* → X | 336 | // WB4 X (Extend | Format | ZWJ)* → X |
| 325 | if (isIgnorable(after_p)) { | 337 | if (isIgnorable(before_p)) { |
| 326 | continue :scan; | 338 | const maybe_before = iter.peekPast(); |
| 327 | } // Now we use last_p instead of after_p for ignorable's sake | 339 | if (maybe_before) |valid_before| { |
| 340 | before_p = iter.wb.breakProp(valid_before); | ||
| 341 | } else if (isIgnorable(after_p)) { | ||
| 342 | continue :scan; | ||
| 343 | // We're done | ||
| 344 | } else { | ||
| 345 | break :scan; | ||
| 346 | } | ||
| 347 | } | ||
| 348 | if (isIgnorable(after_p)) continue :scan; | ||
| 328 | // WB5 AHLetter × AHLetter | 349 | // WB5 AHLetter × AHLetter |
| 329 | if (isAHLetter(last_p) and isAHLetter(before_p)) { | 350 | if (isAHLetter(last_p) and isAHLetter(before_p)) { |
| 330 | continue :scan; | 351 | continue :scan; |
| @@ -334,7 +355,7 @@ pub const ReverseIterator = struct { | |||
| 334 | continue :scan; | 355 | continue :scan; |
| 335 | } | 356 | } |
| 336 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | 357 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| 337 | if (isMidVal(before_p)) { | 358 | if (isMidVal(before_p) and isAHLetter(last_p)) { |
| 338 | const prev_val = iter.peekPast(); | 359 | const prev_val = iter.peekPast(); |
| 339 | if (prev_val) |prev_cp| { | 360 | if (prev_val) |prev_cp| { |
| 340 | const prev_p = iter.wb.breakProp(prev_cp); | 361 | const prev_p = iter.wb.breakProp(prev_cp); |
diff --git a/src/code_point.zig b/src/code_point.zig index a5b10d4..ba0b434 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -53,22 +53,12 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 53 | }; | 53 | }; |
| 54 | // Multibyte | 54 | // Multibyte |
| 55 | 55 | ||
| 56 | <<<<<<< HEAD | ||
| 57 | // Second: | 56 | // Second: |
| 58 | var class: u4 = @intCast(u8dfa[byte]); | 57 | var class: u4 = @intCast(u8dfa[byte]); |
| 59 | var st: u32 = state_dfa[class]; | 58 | var st: u32 = state_dfa[class]; |
| 60 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | 59 | if (st == RUNE_REJECT or cursor.* == bytes.len) { |
| 61 | @branchHint(.cold); | 60 | @branchHint(.cold); |
| 62 | // First one is never a truncation | 61 | // First one is never a truncation |
| 63 | ||||||| parent of ad4b046 (Various small iterator improvements) | ||
| 64 | // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte | ||
| 65 | if (cp.len > bytes.len) { | ||
| 66 | // Unicode replacement code point. | ||
| 67 | ======= | ||
| 68 | // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. | ||
| 69 | if (cp.len > bytes.len) { | ||
| 70 | // Unicode replacement code point. | ||
| 71 | >>>>>>> ad4b046 (Various small iterator improvements) | ||
| 72 | return .{ | 62 | return .{ |
| 73 | .code = 0xfffd, | 63 | .code = 0xfffd, |
| 74 | .len = 1, | 64 | .len = 1, |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 59f0c6f..8661bfd 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -195,7 +195,7 @@ test "Segmentation Word Iterator" { | |||
| 195 | line = line[0..final]; | 195 | line = line[0..final]; |
| 196 | } | 196 | } |
| 197 | // Iterate over fields. | 197 | // Iterate over fields. |
| 198 | var want = std.ArrayList(Grapheme).init(allocator); | 198 | var want = std.ArrayList(Word).init(allocator); |
| 199 | defer want.deinit(); | 199 | defer want.deinit(); |
| 200 | 200 | ||
| 201 | var all_bytes = std.ArrayList(u8).init(allocator); | 201 | var all_bytes = std.ArrayList(u8).init(allocator); |
| @@ -219,22 +219,40 @@ test "Segmentation Word Iterator" { | |||
| 219 | gc_len += len; | 219 | gc_len += len; |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | 222 | try want.append(Word{ .len = gc_len, .offset = bytes_index }); |
| 223 | bytes_index += cp_index; | 223 | bytes_index += cp_index; |
| 224 | } | 224 | } |
| 225 | 225 | { | |
| 226 | var iter = wb.iterator(all_bytes.items); | 226 | var iter = wb.iterator(all_bytes.items); |
| 227 | 227 | ||
| 228 | // Check. | 228 | // Check. |
| 229 | for (want.items, 1..) |want_word, i| { | 229 | for (want.items, 1..) |want_word, i| { |
| 230 | const got_word = (iter.next()).?; | 230 | const got_word = (iter.next()).?; |
| 231 | std.testing.expectEqualStrings( | 231 | std.testing.expectEqualStrings( |
| 232 | want_word.bytes(all_bytes.items), | 232 | want_word.bytes(all_bytes.items), |
| 233 | got_word.bytes(all_bytes.items), | 233 | got_word.bytes(all_bytes.items), |
| 234 | ) catch |err| { | 234 | ) catch |err| { |
| 235 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); | 235 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); |
| 236 | return err; | 236 | return err; |
| 237 | }; | 237 | }; |
| 238 | } | ||
| 239 | } | ||
| 240 | { | ||
| 241 | var r_iter = wb.reverseIterator(all_bytes.items); | ||
| 242 | var idx = want.items.len - 1; | ||
| 243 | while (true) : (idx -= 1) { | ||
| 244 | const want_word = want.items[idx]; | ||
| 245 | const got_word = r_iter.prev().?; | ||
| 246 | std.testing.expectEqualSlices( | ||
| 247 | u8, | ||
| 248 | want_word.bytes(all_bytes.items), | ||
| 249 | got_word.bytes(all_bytes.items), | ||
| 250 | ) catch |err| { | ||
| 251 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); | ||
| 252 | return err; | ||
| 253 | }; | ||
| 254 | if (idx == 0) break; | ||
| 255 | } | ||
| 238 | } | 256 | } |
| 239 | } | 257 | } |
| 240 | } | 258 | } |
| @@ -277,3 +295,4 @@ const GraphemeIterator = @import("Graphemes").Iterator; | |||
| 277 | const Normalize = @import("Normalize"); | 295 | const Normalize = @import("Normalize"); |
| 278 | 296 | ||
| 279 | const WordBreak = @import("WordBreak"); | 297 | const WordBreak = @import("WordBreak"); |
| 298 | const Word = WordBreak.Word; | ||