diff options
| author | 2025-05-13 16:33:02 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:16 -0400 | |
| commit | 7ff729895e72fc841440ec73a44c142779fcae1e (patch) | |
| tree | 8917658e78f42d14a824f2595664b0a88f018c3a /src | |
| parent | Add wordAtCursor (diff) | |
| download | zg-7ff729895e72fc841440ec73a44c142779fcae1e.tar.gz zg-7ff729895e72fc841440ec73a44c142779fcae1e.tar.xz zg-7ff729895e72fc841440ec73a44c142779fcae1e.zip | |
Reverse Word Iterator
Next up I hook it to the tests.
Diffstat (limited to 'src')
| -rw-r--r-- | src/WordBreak.zig | 156 | ||||
| -rw-r--r-- | src/code_point.zig | 2 |
2 files changed, 157 insertions, 1 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index f0da30d..37c0df9 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -260,6 +260,161 @@ pub const Iterator = struct { | |||
| 260 | } | 260 | } |
| 261 | }; | 261 | }; |
| 262 | 262 | ||
/// Iterates over the word segments of a string from its end toward its start.
///
/// The iterator keeps a two-code-point window (`before`, `after`) that slides
/// backwards over the string; each boundary decision is made between those
/// two code points.
pub const ReverseIterator = struct {
    /// Right-hand code point of the window. `null` until the first `prev`
    /// call, and again once the start of the string has been consumed.
    after: ?CodePoint = null,
    /// Left-hand code point of the window, i.e. the one the underlying
    /// reverse code point iterator produced most recently. `null` when
    /// nothing precedes `after`.
    before: ?CodePoint = null,
    /// Source of code points, read back-to-front.
    cp_iter: ReverseCodepointIterator,
    /// Word break property data used to classify code points.
    wb: *const WordBreak,

    /// Create a reverse word iterator over `str`, primed so that the first
    /// call to `prev` starts at the end of the string.
    ///
    /// Assumes `str` is valid UTF-8.
    pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
        // Load the final code point into `before`; `prev` shifts it into
        // `after` when it advances.
        wb_iter.advance();
        return wb_iter;
    }

    /// Returns the previous word segment, without advancing.
    pub fn peek(iter: *ReverseIterator) ?Word {
        // `prev` mutates the window and the code point iterator, so snapshot
        // the whole iterator and restore it on the way out.
        const saved = iter.*;
        defer iter.* = saved;
        return iter.prev();
    }

    /// Return the previous word, if any.
    ///
    /// Returns `null` once the start of the string has been passed. The
    /// returned `Word` carries a byte offset and byte length.
    pub fn prev(iter: *ReverseIterator) ?Word {
        iter.advance();

        // Done?
        const last = iter.after orelse return null;
        // Last? Nothing precedes `after`, so it is a segment on its own.
        if (iter.before == null) return Word{ .len = last.len, .offset = 0 };

        // The segment ends where its last code point ends; its start is
        // recovered from the accumulated length after the scan.
        const word_end = last.offset + last.len;
        var word_len: u32 = 0;

        // State variables.
        // `last_p` / `last_last_p` hold the properties of the two most
        // recently seen non-ignorable code points on the `after` side.
        var last_p: WordBreakProperty = .none;
        var last_last_p: WordBreakProperty = .none;
        var ri_count: usize = 0;

        // Each `continue :scan` means "no break between before and after":
        // the window slides one code point to the left and `before` joins
        // the word. Each `break :scan` ends the word at `after`.
        scan: while (true) : (iter.advance()) {
            const after = iter.after.?;
            word_len += after.len;
            if (iter.before) |before| {
                const after_p = iter.wb.breakProp(after);
                const before_p = iter.wb.breakProp(before);
                if (!isIgnorable(after_p)) {
                    last_last_p = last_p;
                    last_p = after_p;
                }
                // WB3 CR × LF
                if (before_p == .CR and after_p == .LF) continue :scan;
                // WB3a (Newline | CR | LF) ÷
                if (isNewline(before_p)) break :scan;
                // WB3b ÷ (Newline | CR | LF)
                if (isNewline(after_p)) break :scan;
                // WB3c ZWJ × \p{Extended_Pictographic}
                if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
                    continue :scan;
                }
                // WB3d WSegSpace × WSegSpace
                if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
                // WB4 X (Extend | Format | ZWJ)* → X
                // NOTE(review): only an ignorable `after` is skipped here; an
                // ignorable `before` reaches the rules below with its own
                // property. Confirm against the UAX #29 conformance tests.
                if (isIgnorable(after_p)) {
                    continue :scan;
                } // Now we use last_p instead of after_p for ignorable's sake
                // WB5 AHLetter × AHLetter
                if (isAHLetter(last_p) and isAHLetter(before_p)) {
                    continue :scan;
                }
                // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
                if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
                    continue :scan;
                }
                // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
                // The right-hand side must itself be an AHLetter; without
                // that check any code point following "letter + mid" would
                // be glued to the mid character.
                if (isMidVal(before_p) and isAHLetter(last_p)) {
                    if (iter.peekPast()) |prev_cp| {
                        if (isAHLetter(iter.wb.breakProp(prev_cp))) continue :scan;
                    }
                }
                // WB7a Hebrew_Letter × Single_Quote
                if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
                // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
                if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
                    continue :scan;
                }
                // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
                if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
                    if (iter.peekPast()) |prev_cp| {
                        if (iter.wb.breakProp(prev_cp) == .Hebrew_Letter) continue :scan;
                    }
                }
                // WB8 Numeric × Numeric
                if (before_p == .Numeric and last_p == .Numeric) continue :scan;
                // WB9 AHLetter × Numeric
                if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
                // WB10 Numeric × AHLetter
                if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
                // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
                if (isMidNum(before_p) and last_p == .Numeric) {
                    if (iter.peekPast()) |prev_cp| {
                        if (iter.wb.breakProp(prev_cp) == .Numeric) continue :scan;
                    }
                }
                // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
                if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
                    continue :scan;
                }
                // WB13 Katakana × Katakana
                if (before_p == .Katakana and last_p == .Katakana) continue :scan;
                // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
                if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
                // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
                if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
                // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
                // NOTE(review): `ri_count` pairs indicators starting from the
                // end of the run, whereas the rule pairs them from the start;
                // verify runs with an odd number of Regional_Indicators.
                const maybe_flag = before_p == .Regional_Indicator and last_p == .Regional_Indicator;
                if (maybe_flag) {
                    ri_count += 1;
                    if (ri_count % 2 == 1) continue :scan;
                }
                // WB999 Any ÷ Any
                break :scan;
            }
            // Start of text: nothing left to join with.
            break :scan;
        }
        return Word{ .len = word_len, .offset = word_end - word_len };
    }

    /// Look further back than `before` for the nearest code point whose
    /// property is not ignorable (per `isIgnorable`), without consuming
    /// anything: the code point iterator is restored before returning.
    /// Returns `null` if only ignorable code points (or none) remain.
    fn peekPast(iter: *ReverseIterator) ?CodePoint {
        const save_cp = iter.cp_iter;
        defer iter.cp_iter = save_cp;
        while (iter.cp_iter.peek()) |peeked| {
            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
            _ = iter.cp_iter.prev();
        }
        return null;
    }

    /// Slide the window one code point toward the start of the string.
    fn advance(iter: *ReverseIterator) void {
        iter.after = iter.before;
        iter.before = iter.cp_iter.prev();
    }
};
| 417 | |||
| 263 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | 418 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { |
| 264 | const decompressor = compress.flate.inflate.decompressor; | 419 | const decompressor = compress.flate.inflate.decompressor; |
| 265 | const in_bytes = @embedFile("wbp"); | 420 | const in_bytes = @embedFile("wbp"); |
| @@ -371,6 +526,7 @@ const testing = std.testing; | |||
| 371 | 526 | ||
| 372 | const code_point = @import("code_point"); | 527 | const code_point = @import("code_point"); |
| 373 | const CodepointIterator = code_point.Iterator; | 528 | const CodepointIterator = code_point.Iterator; |
| 529 | const ReverseCodepointIterator = code_point.ReverseIterator; | ||
| 374 | const CodePoint = code_point.CodePoint; | 530 | const CodePoint = code_point.CodePoint; |
| 375 | 531 | ||
| 376 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; | 532 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; |
diff --git a/src/code_point.zig b/src/code_point.zig index 79ee5cd..a5b10d4 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -12,7 +12,7 @@ pub const CodePoint = struct { | |||
| 12 | offset: u32, | 12 | offset: u32, |
| 13 | 13 | ||
/// Given the string this code point was decoded from, return the
/// sub-slice of `str` holding the code point's encoded bytes.
pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
    // Drop everything before the code point, then keep `cp.len` bytes.
    const tail = str[cp.offset..];
    return tail[0..cp.len];
}
| 18 | }; | 18 | }; |