diff options
| author | 2025-05-13 17:19:56 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:32:38 -0400 | |
| commit | 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a (patch) | |
| tree | f46287fbc0d92238644c23d0b176354567b647d1 /src/WordBreak.zig | |
| parent | Reverse Word Iterator (diff) | |
| download | zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.gz zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.xz zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.zip | |
Hooked up break test, some bugs squashed
The handling of ignorables is really different, because they 'adhere'
to the future of the iteration, not the past.
Diffstat (limited to 'src/WordBreak.zig')
| -rw-r--r-- | src/WordBreak.zig | 39 |
1 file changed, 30 insertions, 9 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 37c0df9..0cab30e 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -98,11 +98,16 @@ pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usiz | |||
| 98 | return this_word.?; | 98 | return this_word.?; |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | /// Returns an iterator over words in `slice` | 101 | /// Returns an iterator over words in `slice`. |
| 102 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | 102 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { |
| 103 | return Iterator.init(wordbreak, slice); | 103 | return Iterator.init(wordbreak, slice); |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | /// Returns a reverse iterator over the words in `slice`. | ||
| 107 | pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { | ||
| 108 | return ReverseIterator.init(wordbreak, slice); | ||
| 109 | } | ||
| 110 | |||
| 106 | pub const Iterator = struct { | 111 | pub const Iterator = struct { |
| 107 | this: ?CodePoint = null, | 112 | this: ?CodePoint = null, |
| 108 | that: ?CodePoint = null, | 113 | that: ?CodePoint = null, |
| @@ -111,7 +116,7 @@ pub const Iterator = struct { | |||
| 111 | 116 | ||
| 112 | /// Assumes `str` is valid UTF-8. | 117 | /// Assumes `str` is valid UTF-8. |
| 113 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 118 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { |
| 114 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; | 119 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 115 | wb_iter.advance(); | 120 | wb_iter.advance(); |
| 116 | return wb_iter; | 121 | return wb_iter; |
| 117 | } | 122 | } |
| @@ -267,8 +272,8 @@ pub const ReverseIterator = struct { | |||
| 267 | wb: *const WordBreak, | 272 | wb: *const WordBreak, |
| 268 | 273 | ||
| 269 | /// Assumes `str` is valid UTF-8. | 274 | /// Assumes `str` is valid UTF-8. |
| 270 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 275 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { |
| 271 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; | 276 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 272 | wb_iter.advance(); | 277 | wb_iter.advance(); |
| 273 | return wb_iter; | 278 | return wb_iter; |
| 274 | } | 279 | } |
| @@ -299,12 +304,19 @@ pub const ReverseIterator = struct { | |||
| 299 | var last_last_p: WordBreakProperty = .none; | 304 | var last_last_p: WordBreakProperty = .none; |
| 300 | var ri_count: usize = 0; | 305 | var ri_count: usize = 0; |
| 301 | 306 | ||
| 307 | // TODO: Ignorables have to be handled completely differently, unfortunately. | ||
| 308 | // We have to find whatever is before it, match against that, and use that | ||
| 309 | // decision to handle the break we're currently working on. | ||
| 310 | // -- | ||
| 311 | // This is achievable I think. Just need to use peekPast to get that, and then | ||
| 312 | // take it from there. Probably as long as an ignorable is an after_p we just keep | ||
| 313 | // going. | ||
| 302 | scan: while (true) : (iter.advance()) { | 314 | scan: while (true) : (iter.advance()) { |
| 303 | const after = iter.after.?; | 315 | const after = iter.after.?; |
| 304 | word_len += after.len; | 316 | word_len += after.len; |
| 305 | if (iter.before) |before| { | 317 | if (iter.before) |before| { |
| 306 | const after_p = iter.wb.breakProp(after); | 318 | const after_p = iter.wb.breakProp(after); |
| 307 | const before_p = iter.wb.breakProp(before); | 319 | var before_p = iter.wb.breakProp(before); |
| 308 | if (!isIgnorable(after_p)) { | 320 | if (!isIgnorable(after_p)) { |
| 309 | last_last_p = last_p; | 321 | last_last_p = last_p; |
| 310 | last_p = after_p; | 322 | last_p = after_p; |
| @@ -322,9 +334,18 @@ pub const ReverseIterator = struct { | |||
| 322 | // WB3d WSegSpace × WSegSpace | 334 | // WB3d WSegSpace × WSegSpace |
| 323 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; | 335 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; |
| 324 | // WB4 X (Extend | Format | ZWJ)* → X | 336 | // WB4 X (Extend | Format | ZWJ)* → X |
| 325 | if (isIgnorable(after_p)) { | 337 | if (isIgnorable(before_p)) { |
| 326 | continue :scan; | 338 | const maybe_before = iter.peekPast(); |
| 327 | } // Now we use last_p instead of after_p for ignorable's sake | 339 | if (maybe_before) |valid_before| { |
| 340 | before_p = iter.wb.breakProp(valid_before); | ||
| 341 | } else if (isIgnorable(after_p)) { | ||
| 342 | continue :scan; | ||
| 343 | // We're done | ||
| 344 | } else { | ||
| 345 | break :scan; | ||
| 346 | } | ||
| 347 | } | ||
| 348 | if (isIgnorable(after_p)) continue :scan; | ||
| 328 | // WB5 AHLetter × AHLetter | 349 | // WB5 AHLetter × AHLetter |
| 329 | if (isAHLetter(last_p) and isAHLetter(before_p)) { | 350 | if (isAHLetter(last_p) and isAHLetter(before_p)) { |
| 330 | continue :scan; | 351 | continue :scan; |
| @@ -334,7 +355,7 @@ pub const ReverseIterator = struct { | |||
| 334 | continue :scan; | 355 | continue :scan; |
| 335 | } | 356 | } |
| 336 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | 357 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| 337 | if (isMidVal(before_p)) { | 358 | if (isMidVal(before_p) and isAHLetter(last_p)) { |
| 338 | const prev_val = iter.peekPast(); | 359 | const prev_val = iter.peekPast(); |
| 339 | if (prev_val) |prev_cp| { | 360 | if (prev_val) |prev_cp| { |
| 340 | const prev_p = iter.wb.breakProp(prev_cp); | 361 | const prev_p = iter.wb.breakProp(prev_cp); |