summaryrefslogtreecommitdiff
path: root/src/WordBreak.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/WordBreak.zig')
-rw-r--r--src/WordBreak.zig161
1 files changed, 72 insertions, 89 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 7ac8f14..6ada7e1 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65} 65}
66 66
67/// Convenience function for working with CodePoints
67fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { 68fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
68 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); 69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
69} 70}
70 71
71/// Returns the Word at the given index. Asserts that the index is valid for 72/// Returns the Word at the given index. Asserts that the index is less than
72/// the provided string, and that the string is not empty. Always returns a word. 73/// `string.len`, and that the string is not empty. Always returns a word.
73/// The index does not have to be the start of a codepoint in the word. 74/// The index does not have to be the start of a codepoint in the word.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { 75pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len and string.len > 0); 76 assert(index < string.len and string.len > 0);
76 var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); 77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
77 if (iter_fwd.next()) |_| { 78 const first_back = iter_back.prev();
78 // This is a bit tricky because we may have been in the middle of various 79 if (first_back) |back| {
79 // stateful things. So we go forward again: 80 if (back.offset == 0) {
80 if (iter_fwd.next()) |_| { 81 var iter_fwd = wordbreak.iterator(string);
81 // Make a back iterator: 82 while (iter_fwd.next()) |word| {
82 var iter_back = iter_fwd.reverseIterator(); 83 if (word.offset <= index and index < word.offset + word.len)
83 const last_word = iter_back.prev().?; // Always works.
84 const no_flags = iter_back.flags == 0;
85 if (no_flags) {
86 // Next previous is our word.
87 const the_word = iter_back.prev();
88 if (the_word) |word| {
89 assert(word.offset <= index and index <= word.offset + word.len);
90 return word; 84 return word;
91 } else { // Can happen, at least I think so 85 }
92 assert(last_word.offset <= index and index <= last_word.offset + last_word.len); 86 }
93 return last_word; 87 } else {
94 } 88 var iter_fwd = wordbreak.iterator(string);
89 while (iter_fwd.next()) |word| {
90 if (word.offset <= index and index < word.offset + word.len)
91 return word;
92 }
93 }
94 const second_back = iter_back.prev();
95 if (second_back) |back| if (back.offset == 0) {
96 var iter_fwd = wordbreak.iterator(string);
97 while (iter_fwd.next()) |word| {
98 if (word.offset <= index and index < word.offset + word.len)
99 return word;
100 }
101 };
102 // There's sometimes flags:
103 if (iter_back.flags > 0) {
104 while (iter_back.flags > 0) {
105 if (iter_back.prev()) |_| {
106 continue;
95 } else { 107 } else {
96 // Scan past all the flags. 108 break;
97 while (iter_back.flags > 0) {
98 _ = iter_back.prev();
99 }
100 // Now just look for our word
101 iter_fwd = iter_back.forwardIterator();
102 while (iter_fwd.next()) |word| {
103 if (word.offset <= index and index <= word.offset + word.len) {
104 return word;
105 }
106 }
107 unreachable;
108 } 109 }
109 } else { // We can just reverse here
110 var iter_back = iter_fwd.reverseIterator();
111 const word = iter_back.prev().?;
112 assert(word.offset <= index and index <= word.offset + word.len);
113 return word;
114 } 110 }
115 } else { // last word then
116 var iter_back = iter_fwd.reverseIterator();
117 return iter_back.prev().?;
118 } 111 }
112 var iter_fwd = iter_back.forwardIterator();
113 while (iter_fwd.next()) |word| {
114 if (word.offset <= index and index < word.offset + word.len)
115 return word;
116 }
117 unreachable;
119} 118}
120 119
121/// Returns an iterator over words in `slice`. 120/// Returns an iterator over words in `slice`.
@@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt
128 return ReverseIterator.init(wordbreak, slice); 127 return ReverseIterator.init(wordbreak, slice);
129} 128}
130 129
130/// An iterator, forward, over all words in a provided string.
131pub const Iterator = struct { 131pub const Iterator = struct {
132 this: ?CodePoint = null, 132 this: ?CodePoint = null,
133 that: ?CodePoint = null, 133 that: ?CodePoint = null,
@@ -150,37 +150,7 @@ pub const Iterator = struct {
150 return iter.next(); 150 return iter.next();
151 } 151 }
152 152
153 /// Initialize an Iterator at the provided index. Assumes str is valid 153 /// Returns a reverse iterator from the point this iterator is paused
154 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
155 /// empty. Note that for various stateful reasons, this may give spurious
156 /// results if used naïvely. If you want to reliably iterate from an index,
157 /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
158 /// iterator at `wb.initAtIndex(string, word.offset)`.
159 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
160 assert(index < string.len and string.len > 0);
161 // Just in case...
162 if (index == 0) return wb.iterator(string);
163 var idx: u32 = @intCast(index);
164 // Back up past any any follow bytes:
165 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
166 var iter: Iterator = undefined;
167 iter.wb = wb;
168 // We need to populate the CodePoints, and the codepoint iterator.
169 // Consider "abc |def" with the cursor on d.
170 // We need `this` to be ` ` and `that` to be 'd',
171 // and `cp_iter.next()` to be `e`.
172 var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
173 // Reverse gives us before `d`:
174 iter.this = cp_back.prev(); // that == ` `
175 // This iterator will give us `d`:
176 iter.cp_iter = .{ .bytes = string, .i = idx };
177 iter.that = iter.cp_iter.next();
178 // So that the next call will will give us `e`,
179 // thus the word will be `def`.
180 return iter;
181 }
182
183 /// Return a reverse iterator from the point this iterator is paused
184 /// at. Usually, calling `prev()` will return the word just seen. 154 /// at. Usually, calling `prev()` will return the word just seen.
185 pub fn reverseIterator(iter: *Iterator) ReverseIterator { 155 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
186 var cp_it = iter.cp_iter.reverseIterator(); 156 var cp_it = iter.cp_iter.reverseIterator();
@@ -196,7 +166,7 @@ pub const Iterator = struct {
196 }; 166 };
197 } 167 }
198 168
199 /// Returns the next word segment. 169 /// Returns the next word segment, if any.
200 pub fn next(iter: *Iterator) ?Word { 170 pub fn next(iter: *Iterator) ?Word {
201 iter.advance(); 171 iter.advance();
202 172
@@ -338,6 +308,7 @@ pub const Iterator = struct {
338 } 308 }
339}; 309};
340 310
311/// An iterator, backward, over all words in a provided string.
341pub const ReverseIterator = struct { 312pub const ReverseIterator = struct {
342 after: ?CodePoint = null, 313 after: ?CodePoint = null,
343 before: ?CodePoint = null, 314 before: ?CodePoint = null,
@@ -352,16 +323,7 @@ pub const ReverseIterator = struct {
352 return wb_iter; 323 return wb_iter;
353 } 324 }
354 325
355 /// Initialize a ReverseIterator at the provided index. Assumes str is valid 326 /// Returns the previous word segment, if any, without advancing.
356 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
357 /// empty. You should prefer not to use this function, see Iterator.initAtIndex
358 /// for more details.
359 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
360 var fw_iter = Iterator.initAtIndex(wb, string, index);
361 return fw_iter.reverseIterator();
362 }
363
364 /// Returns the previous word segment, without advancing.
365 pub fn peek(iter: *ReverseIterator) ?Word { 327 pub fn peek(iter: *ReverseIterator) ?Word {
366 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; 328 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
367 defer { 329 defer {
@@ -544,6 +506,27 @@ pub const ReverseIterator = struct {
544 } 506 }
545}; 507};
546 508
509//| Implementation Details
510
511/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex.
512fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
513 var idx: u32 = @intCast(index);
514 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
515 if (idx == string.len) return wb.reverseIterator(string);
516 var iter: ReverseIterator = undefined;
517 iter.wb = wb;
518 iter.flags = 0;
519 // We need to populate the CodePoints, and the codepoint iterator.
520 // Consider "abc| def" with the cursor as |.
521 // We need `before` to be `c` and `after` to be ' ',
522 // and `cp_iter.prev()` to be `b`.
523 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
524 iter.after = cp_iter.prev();
525 iter.before = cp_iter.prev();
526 iter.cp_iter = cp_iter;
527 return iter;
528}
529
547fn sneaky(iter: *const ReverseIterator) SneakIterator { 530fn sneaky(iter: *const ReverseIterator) SneakIterator {
548 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; 531 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
549} 532}
@@ -656,23 +639,23 @@ test "ext_pict" {
656 try testing.expect(ext_pict.isMatch("\u{2701}")); 639 try testing.expect(ext_pict.isMatch("\u{2701}"));
657} 640}
658 641
659test wordAtCursor { 642test wordAtIndex {
660 const wb = try WordBreak.init(testing.allocator); 643 const wb = try WordBreak.init(testing.allocator);
661 defer wb.deinit(testing.allocator); 644 defer wb.deinit(testing.allocator);
662 const t_string = "first second third"; 645 const t_string = "first second third";
663 const second = wb.wordAtCursor(t_string, 8); 646 const second = wb.wordAtIndex(t_string, 8);
664 try testing.expectEqualStrings("second", second.bytes(t_string)); 647 try testing.expectEqualStrings("second", second.bytes(t_string));
665 const third = wb.wordAtCursor(t_string, 14); 648 const third = wb.wordAtIndex(t_string, 14);
666 try testing.expectEqualStrings("third", third.bytes(t_string)); 649 try testing.expectEqualStrings("third", third.bytes(t_string));
667 { 650 {
668 const first = wb.wordAtCursor(t_string, 3); 651 const first = wb.wordAtIndex(t_string, 3);
669 try testing.expectEqualStrings("first", first.bytes(t_string)); 652 try testing.expectEqualStrings("first", first.bytes(t_string));
670 } 653 }
671 { 654 {
672 const first = wb.wordAtCursor(t_string, 0); 655 const first = wb.wordAtIndex(t_string, 0);
673 try testing.expectEqualStrings("first", first.bytes(t_string)); 656 try testing.expectEqualStrings("first", first.bytes(t_string));
674 } 657 }
675 const last = wb.wordAtCursor(t_string, 14); 658 const last = wb.wordAtIndex(t_string, 14);
676 try testing.expectEqualStrings("third", last.bytes(t_string)); 659 try testing.expectEqualStrings("third", last.bytes(t_string));
677} 660}
678 661