diff options
| -rw-r--r-- | build.zig.zon | 2 | ||||
| -rw-r--r-- | src/Words.zig | 98 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 38 |
3 files changed, 105 insertions, 33 deletions
diff --git a/build.zig.zon b/build.zig.zon index b69249f..3e1df95 100644 --- a/build.zig.zon +++ b/build.zig.zon | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | .{ | 1 | .{ |
| 2 | .name = .zg, | 2 | .name = .zg, |
| 3 | .version = "0.14.0", | 3 | .version = "0.14.1", |
| 4 | .minimum_zig_version = "0.14.0", | 4 | .minimum_zig_version = "0.14.0", |
| 5 | .fingerprint = 0x47df7778dc946aa0, | 5 | .fingerprint = 0x47df7778dc946aa0, |
| 6 | 6 | ||
diff --git a/src/Words.zig b/src/Words.zig index 565a2fb..1d10b2a 100644 --- a/src/Words.zig +++ b/src/Words.zig | |||
| @@ -1,4 +1,7 @@ | |||
| 1 | //! Word Breaking Algorithm. | 1 | //! Word Breaking Algorithm. |
| 2 | //! | ||
| 3 | //! https://www.unicode.org/reports/tr29/#Word_Boundaries | ||
| 4 | //! | ||
| 2 | 5 | ||
| 3 | const WordBreakProperty = enum(u5) { | 6 | const WordBreakProperty = enum(u5) { |
| 4 | none, | 7 | none, |
| @@ -42,9 +45,9 @@ pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { | |||
| 42 | }; | 45 | }; |
| 43 | } | 46 | } |
| 44 | 47 | ||
| 45 | pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { | 48 | pub fn deinit(words: *const Words, allocator: mem.Allocator) void { |
| 46 | allocator.free(wordbreak.s1); | 49 | allocator.free(words.s1); |
| 47 | allocator.free(wordbreak.s2); | 50 | allocator.free(words.s2); |
| 48 | } | 51 | } |
| 49 | 52 | ||
| 50 | /// Represents a Unicode word span, as an offset into the source string | 53 | /// Represents a Unicode word span, as an offset into the source string |
| @@ -54,51 +57,44 @@ pub const Word = struct { | |||
| 54 | len: u32, | 57 | len: u32, |
| 55 | 58 | ||
| 56 | /// Returns a slice of the word given the source string. | 59 | /// Returns a slice of the word given the source string. |
| 57 | pub fn bytes(self: Word, src: []const u8) []const u8 { | 60 | pub fn bytes(word: Word, src: []const u8) []const u8 { |
| 58 | return src[self.offset..][0..self.len]; | 61 | return src[word.offset..][0..word.len]; |
| 59 | } | 62 | } |
| 60 | }; | 63 | }; |
| 61 | 64 | ||
| 62 | /// Returns the word break property type for `cp`. | 65 | /// Returns the word break property type for `cp`. |
| 63 | pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { | 66 | pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { |
| 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 67 | return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); |
| 65 | } | 68 | } |
| 66 | 69 | ||
| 67 | /// Convenience function for working with CodePoints | 70 | /// Convenience function for working with CodePoints |
| 68 | fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { | 71 | fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { |
| 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | 72 | return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); |
| 70 | } | 73 | } |
| 71 | 74 | ||
| 72 | /// Returns the Word at the given index. Asserts that the index is less than | 75 | /// Returns the Word at the given index. Asserts that the index is less than |
| 73 | /// `string.len`, and that the string is not empty. Always returns a word. | 76 | /// `string.len`, and that the string is not empty. Always returns a word. |
| 74 | /// The index does not have to be the start of a codepoint in the word. | 77 | /// The index does not have to be the start of a codepoint in the word. |
| 75 | pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { | 78 | pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { |
| 76 | assert(index < string.len and string.len > 0); | 79 | assert(index < string.len and string.len > 0); |
| 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); | 80 | var iter_back: ReverseIterator = reverseFromIndex(words, string, index); |
| 78 | const first_back = iter_back.prev(); | 81 | const first_back = iter_back.prev(); |
| 79 | if (first_back) |back| { | 82 | if (first_back) |back| { |
| 80 | if (back.offset == 0) { | 83 | if (back.offset == 0) { |
| 81 | var iter_fwd = wordbreak.iterator(string); | 84 | var iter_fwd = words.iterator(string); |
| 82 | while (iter_fwd.next()) |word| { | 85 | while (iter_fwd.next()) |word| { |
| 83 | if (word.offset <= index and index < word.offset + word.len) | 86 | if (word.offset <= index and index < word.offset + word.len) |
| 84 | return word; | 87 | return word; |
| 85 | } | 88 | } |
| 86 | } | 89 | } |
| 87 | } else { | 90 | } else { |
| 88 | var iter_fwd = wordbreak.iterator(string); | 91 | var iter_fwd = words.iterator(string); |
| 89 | while (iter_fwd.next()) |word| { | 92 | while (iter_fwd.next()) |word| { |
| 90 | if (word.offset <= index and index < word.offset + word.len) | 93 | if (word.offset <= index and index < word.offset + word.len) |
| 91 | return word; | 94 | return word; |
| 92 | } | 95 | } |
| 93 | } | 96 | } |
| 94 | const second_back = iter_back.prev(); | 97 | _ = iter_back.prev(); |
| 95 | if (second_back) |back| if (back.offset == 0) { | ||
| 96 | var iter_fwd = wordbreak.iterator(string); | ||
| 97 | while (iter_fwd.next()) |word| { | ||
| 98 | if (word.offset <= index and index < word.offset + word.len) | ||
| 99 | return word; | ||
| 100 | } | ||
| 101 | }; | ||
| 102 | // There's sometimes flags: | 98 | // There's sometimes flags: |
| 103 | if (iter_back.flags > 0) { | 99 | if (iter_back.flags > 0) { |
| 104 | while (iter_back.flags > 0) { | 100 | while (iter_back.flags > 0) { |
| @@ -118,13 +114,23 @@ pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Wo | |||
| 118 | } | 114 | } |
| 119 | 115 | ||
| 120 | /// Returns an iterator over words in `slice`. | 116 | /// Returns an iterator over words in `slice`. |
| 121 | pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { | 117 | pub fn iterator(words: *const Words, slice: []const u8) Iterator { |
| 122 | return Iterator.init(wordbreak, slice); | 118 | return Iterator.init(words, slice); |
| 123 | } | 119 | } |
| 124 | 120 | ||
| 125 | /// Returns a reverse iterator over the words in `slice`. | 121 | /// Returns a reverse iterator over the words in `slice`. |
| 126 | pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { | 122 | pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { |
| 127 | return ReverseIterator.init(wordbreak, slice); | 123 | return ReverseIterator.init(words, slice); |
| 124 | } | ||
| 125 | |||
/// Returns a forward iterator over `slice` that starts at the first byte
/// past `word`, i.e. at `word.offset + word.len`.
pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator {
    const end_of_word = word.offset + word.len;
    return forwardFromIndex(words, slice, end_of_word);
}
| 130 | |||
| 131 | /// Returns a reverse iterator before the `word` in `slice`. | ||
| 132 | pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { | ||
| 133 | return reverseFromIndex(words, slice, word.offset); | ||
| 128 | } | 134 | } |
| 129 | 135 | ||
| 130 | /// An iterator, forward, over all words in a provided string. | 136 | /// An iterator, forward, over all words in a provided string. |
| @@ -135,8 +141,8 @@ pub const Iterator = struct { | |||
| 135 | wb: *const Words, | 141 | wb: *const Words, |
| 136 | 142 | ||
| 137 | /// Assumes `str` is valid UTF-8. | 143 | /// Assumes `str` is valid UTF-8. |
| 138 | pub fn init(wb: *const Words, str: []const u8) Iterator { | 144 | pub fn init(words: *const Words, str: []const u8) Iterator { |
| 139 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; | 145 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; |
| 140 | wb_iter.advance(); | 146 | wb_iter.advance(); |
| 141 | return wb_iter; | 147 | return wb_iter; |
| 142 | } | 148 | } |
| @@ -318,8 +324,8 @@ pub const ReverseIterator = struct { | |||
| 318 | flags: usize = 0, | 324 | flags: usize = 0, |
| 319 | 325 | ||
| 320 | /// Assumes `str` is valid UTF-8. | 326 | /// Assumes `str` is valid UTF-8. |
| 321 | pub fn init(wb: *const Words, str: []const u8) ReverseIterator { | 327 | pub fn init(words: *const Words, str: []const u8) ReverseIterator { |
| 322 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; | 328 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; |
| 323 | wb_iter.advance(); | 329 | wb_iter.advance(); |
| 324 | return wb_iter; | 330 | return wb_iter; |
| 325 | } | 331 | } |
| @@ -511,13 +517,13 @@ pub const ReverseIterator = struct { | |||
| 511 | //| Implementation Details | 517 | //| Implementation Details |
| 512 | 518 | ||
| 513 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | 519 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. |
| 514 | fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { | 520 | fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { |
| 515 | var idx: u32 = @intCast(index); | 521 | var idx: u32 = @intCast(index); |
| 516 | // Find the next lead byte: | 522 | // Find the next lead byte: |
| 517 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | 523 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} |
| 518 | if (idx == string.len) return wb.reverseIterator(string); | 524 | if (idx == string.len) return words.reverseIterator(string); |
| 519 | var iter: ReverseIterator = undefined; | 525 | var iter: ReverseIterator = undefined; |
| 520 | iter.wb = wb; | 526 | iter.wb = words; |
| 521 | iter.flags = 0; | 527 | iter.flags = 0; |
| 522 | // We need to populate the CodePoints, and the codepoint iterator. | 528 | // We need to populate the CodePoints, and the codepoint iterator. |
| 523 | // Consider "abc| def" with the cursor as |. | 529 | // Consider "abc| def" with the cursor as |. |
| @@ -530,6 +536,34 @@ fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterat | |||
| 530 | return iter; | 536 | return iter; |
| 531 | } | 537 | } |
| 532 | 538 | ||
/// Builds a forward `Iterator` over `string` primed at byte `index`, for use
/// by `iterateAfter`.
///
/// Asserts that `index <= string.len`.
/// - `index == string.len`: returns an iterator with `this` and `that` null.
/// - `index` resolving to 0: returns `words.iterator(string)`.
/// - Otherwise `this` holds the codepoint just before the index and `that`
///   the codepoint at the index.
///
/// An `index` that lands on a UTF-8 continuation byte (0x80...0xBF) is first
/// moved back to the lead byte of that codepoint.
fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
    // `string[idx]` is read below, so state the precondition explicitly
    // rather than relying on the slice bounds check alone.
    assert(index <= string.len);
    var idx: u32 = @intCast(index);
    if (idx == string.len) {
        // Nothing follows the index: no codepoints are loaded.
        return .{
            .cp_iter = .{ .bytes = string, .i = idx },
            .this = null,
            .that = null,
            .wb = words,
        };
    }
    // Back up to the lead byte of the codepoint that contains `index`.
    while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBF) : (idx -= 1) {}
    if (idx == 0) return words.iterator(string);
    // Consider "abc |def" with the cursor as |. Step back to the start of the
    // codepoint before the cursor, giving "abc| def".
    idx -= 1;
    while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBF) : (idx -= 1) {}
    var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
    // Order matters: `this` becomes ' ' and `that` becomes 'd'; `cp_iter` has
    // then consumed both.
    const this = cp_iter.next();
    const that = cp_iter.next();
    // Build the value in one literal, as the early return above does, so no
    // field of the result is ever left `undefined`.
    return .{
        .cp_iter = cp_iter,
        .this = this,
        .that = that,
        .wb = words,
    };
}
| 566 | |||
| 533 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | 567 | fn sneaky(iter: *const ReverseIterator) SneakIterator { |
| 534 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | 568 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; |
| 535 | } | 569 | } |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 18f1814..195fdcb 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -287,6 +287,25 @@ test "Segmentation Word Iterator" { | |||
| 287 | } else { | 287 | } else { |
| 288 | try testing.expect(false); | 288 | try testing.expect(false); |
| 289 | } | 289 | } |
| 290 | var peek_iter = wb.iterateAfter(this_str, got_word); | ||
| 291 | const peek_1 = peek_iter.next(); | ||
| 292 | if (peek_1) |p1| { | ||
| 293 | const peek_2 = iter.peek(); | ||
| 294 | if (peek_2) |p2| { | ||
| 295 | std.testing.expectEqualSlices( | ||
| 296 | u8, | ||
| 297 | p1.bytes(this_str), | ||
| 298 | p2.bytes(this_str), | ||
| 299 | ) catch |err| { | ||
| 300 | debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); | ||
| 301 | return err; | ||
| 302 | }; | ||
| 303 | } else { | ||
| 304 | try testing.expect(false); | ||
| 305 | } | ||
| 306 | } else { | ||
| 307 | try testing.expectEqual(null, iter.peek()); | ||
| 308 | } | ||
| 290 | for (got_word.offset..got_word.offset + got_word.len) |i| { | 309 | for (got_word.offset..got_word.offset + got_word.len) |i| { |
| 291 | const this_word = wb.wordAtIndex(this_str, i); | 310 | const this_word = wb.wordAtIndex(this_str, i); |
| 292 | std.testing.expectEqualSlices( | 311 | std.testing.expectEqualSlices( |
| @@ -337,6 +356,25 @@ test "Segmentation Word Iterator" { | |||
| 337 | } else { | 356 | } else { |
| 338 | try testing.expect(false); | 357 | try testing.expect(false); |
| 339 | } | 358 | } |
| 359 | var peek_iter = wb.iterateBefore(this_str, got_word); | ||
| 360 | const peek_1 = peek_iter.prev(); | ||
| 361 | if (peek_1) |p1| { | ||
| 362 | const peek_2 = r_iter.peek(); | ||
| 363 | if (peek_2) |p2| { | ||
| 364 | std.testing.expectEqualSlices( | ||
| 365 | u8, | ||
| 366 | p1.bytes(this_str), | ||
| 367 | p2.bytes(this_str), | ||
| 368 | ) catch |err| { | ||
| 369 | debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); | ||
| 370 | return err; | ||
| 371 | }; | ||
| 372 | } else { | ||
| 373 | try testing.expect(false); | ||
| 374 | } | ||
| 375 | } else { | ||
| 376 | try testing.expectEqual(null, r_iter.peek()); | ||
| 377 | } | ||
| 340 | for (got_word.offset..got_word.offset + got_word.len) |i| { | 378 | for (got_word.offset..got_word.offset + got_word.len) |i| { |
| 341 | const this_word = wb.wordAtIndex(this_str, i); | 379 | const this_word = wb.wordAtIndex(this_str, i); |
| 342 | std.testing.expectEqualSlices( | 380 | std.testing.expectEqualSlices( |