From ef27c51b8e46f3909a27fd137429b717797f1fd9 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 16:48:55 -0400 Subject: Add iterateBefore and iterateAfter These create reverse or forward iterators before or after a Word. So this way, the user can get the word at an index, then iterate forward or back from that word. Also: Fixes #59 Which was fixed awhile back, but I don't feel like doing repo surgery to tag the fix where it happened. We have blame for that kind of thing. --- src/Words.zig | 98 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 32 deletions(-) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index 565a2fb..1d10b2a 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -1,4 +1,7 @@ //! Word Breaking Algorithm. +//! +//! https://www.unicode.org/reports/tr29/#Word_Boundaries +//! const WordBreakProperty = enum(u5) { none, @@ -42,9 +45,9 @@ pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { }; } -pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { - allocator.free(wordbreak.s1); - allocator.free(wordbreak.s2); +pub fn deinit(words: *const Words, allocator: mem.Allocator) void { + allocator.free(words.s1); + allocator.free(words.s2); } /// Represents a Unicode word span, as an offset into the source string @@ -54,51 +57,44 @@ pub const Word = struct { len: u32, /// Returns a slice of the word given the source string. - pub fn bytes(self: Word, src: []const u8) []const u8 { - return src[self.offset..][0..self.len]; + pub fn bytes(word: Word, src: []const u8) []const u8 { + return src[word.offset..][0..word.len]; } }; /// Returns the word break property type for `cp`. 
-pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { - return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { + return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); } /// Convenience function for working with CodePoints -fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { - return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); +fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { + return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); } /// Returns the Word at the given index. Asserts that the index is less than /// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { +pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); - var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); + var iter_back: ReverseIterator = reverseFromIndex(words, string, index); const first_back = iter_back.prev(); if (first_back) |back| { if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); + var iter_fwd = words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } } else { - var iter_fwd = wordbreak.iterator(string); + var iter_fwd = words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } - const second_back = iter_back.prev(); - if (second_back) |back| if (back.offset == 0) { - var iter_fwd = wordbreak.iterator(string); - while (iter_fwd.next()) |word| { - if (word.offset <= index and index < word.offset + word.len) - return word; - } - }; + _ = 
iter_back.prev(); // There's sometimes flags: if (iter_back.flags > 0) { while (iter_back.flags > 0) { @@ -118,13 +114,23 @@ pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Wo } /// Returns an iterator over words in `slice`. -pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { - return Iterator.init(wordbreak, slice); +pub fn iterator(words: *const Words, slice: []const u8) Iterator { + return Iterator.init(words, slice); } /// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { - return ReverseIterator.init(wordbreak, slice); +pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { + return ReverseIterator.init(words, slice); +} + +/// Returns an iterator after the `word` in `slice`. +pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { + return forwardFromIndex(words, slice, word.offset + word.len); +} + +/// Returns a reverse iterator before the `word` in `slice`. +pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { + return reverseFromIndex(words, slice, word.offset); } /// An iterator, forward, over all words in a provided string. @@ -135,8 +141,8 @@ pub const Iterator = struct { wb: *const Words, /// Assumes `str` is valid UTF-8. - pub fn init(wb: *const Words, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(words: *const Words, str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; wb_iter.advance(); return wb_iter; } @@ -318,8 +324,8 @@ pub const ReverseIterator = struct { flags: usize = 0, /// Assumes `str` is valid UTF-8. 
- pub fn init(wb: *const Words, str: []const u8) ReverseIterator { - var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; + pub fn init(words: *const Words, str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; wb_iter.advance(); return wb_iter; } @@ -511,13 +517,13 @@ pub const ReverseIterator = struct { //| Implementation Details /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { +fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} - if (idx == string.len) return wb.reverseIterator(string); + if (idx == string.len) return words.reverseIterator(string); var iter: ReverseIterator = undefined; - iter.wb = wb; + iter.wb = words; iter.flags = 0; // We need to populate the CodePoints, and the codepoint iterator. // Consider "abc| def" with the cursor as |. @@ -530,6 +536,34 @@ fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterat return iter; } +fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { + var idx: u32 = @intCast(index); + if (idx == string.len) { + return .{ + .cp_iter = .{ .bytes = string, .i = idx }, + .this = null, + .that = null, + .wb = words, + }; + } + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + if (idx == 0) return words.iterator(string); + var iter: Iterator = undefined; + iter.wb = words; + // We need to populate the CodePoints, and the codepoint iterator. + // Consider "abc |def" with the cursor as |. + // We need `this` to be ` ` and `that` to be 'd', + // and `cp_iter.next()` to be `e`.
+ idx -= 1; + while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} + // "abc| def" + var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx }; + iter.this = cp_iter.next(); + iter.that = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + fn sneaky(iter: *const ReverseIterator) SneakIterator { return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; } -- cgit v1.2.3