summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/WordBreak.zig161
-rw-r--r--src/code_point.zig1
-rw-r--r--src/unicode_tests.zig76
3 files changed, 135 insertions, 103 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 7ac8f14..6ada7e1 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65} 65}
66 66
67/// Convenience function for working with CodePoints
67fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { 68fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
68 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); 69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
69} 70}
70 71
71/// Returns the Word at the given index. Asserts that the index is valid for 72/// Returns the Word at the given index. Asserts that the index is less than
72/// the provided string, and that the string is not empty. Always returns a word. 73/// `string.len`, and that the string is not empty. Always returns a word.
73/// The index does not have to be the start of a codepoint in the word. 74/// The index does not have to be the start of a codepoint in the word.
74pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { 75pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
75 assert(index < string.len and string.len > 0); 76 assert(index < string.len and string.len > 0);
76 var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); 77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
77 if (iter_fwd.next()) |_| { 78 const first_back = iter_back.prev();
78 // This is a bit tricky because we may have been in the middle of various 79 if (first_back) |back| {
79 // stateful things. So we go forward again: 80 if (back.offset == 0) {
80 if (iter_fwd.next()) |_| { 81 var iter_fwd = wordbreak.iterator(string);
81 // Make a back iterator: 82 while (iter_fwd.next()) |word| {
82 var iter_back = iter_fwd.reverseIterator(); 83 if (word.offset <= index and index < word.offset + word.len)
83 const last_word = iter_back.prev().?; // Always works.
84 const no_flags = iter_back.flags == 0;
85 if (no_flags) {
86 // Next previous is our word.
87 const the_word = iter_back.prev();
88 if (the_word) |word| {
89 assert(word.offset <= index and index <= word.offset + word.len);
90 return word; 84 return word;
91 } else { // Can happen, at least I think so 85 }
92 assert(last_word.offset <= index and index <= last_word.offset + last_word.len); 86 }
93 return last_word; 87 } else {
94 } 88 var iter_fwd = wordbreak.iterator(string);
89 while (iter_fwd.next()) |word| {
90 if (word.offset <= index and index < word.offset + word.len)
91 return word;
92 }
93 }
94 const second_back = iter_back.prev();
95 if (second_back) |back| if (back.offset == 0) {
96 var iter_fwd = wordbreak.iterator(string);
97 while (iter_fwd.next()) |word| {
98 if (word.offset <= index and index < word.offset + word.len)
99 return word;
100 }
101 };
102 // There's sometimes flags:
103 if (iter_back.flags > 0) {
104 while (iter_back.flags > 0) {
105 if (iter_back.prev()) |_| {
106 continue;
95 } else { 107 } else {
96 // Scan past all the flags. 108 break;
97 while (iter_back.flags > 0) {
98 _ = iter_back.prev();
99 }
100 // Now just look for our word
101 iter_fwd = iter_back.forwardIterator();
102 while (iter_fwd.next()) |word| {
103 if (word.offset <= index and index <= word.offset + word.len) {
104 return word;
105 }
106 }
107 unreachable;
108 } 109 }
109 } else { // We can just reverse here
110 var iter_back = iter_fwd.reverseIterator();
111 const word = iter_back.prev().?;
112 assert(word.offset <= index and index <= word.offset + word.len);
113 return word;
114 } 110 }
115 } else { // last word then
116 var iter_back = iter_fwd.reverseIterator();
117 return iter_back.prev().?;
118 } 111 }
112 var iter_fwd = iter_back.forwardIterator();
113 while (iter_fwd.next()) |word| {
114 if (word.offset <= index and index < word.offset + word.len)
115 return word;
116 }
117 unreachable;
119} 118}
120 119
121/// Returns an iterator over words in `slice`. 120/// Returns an iterator over words in `slice`.
@@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt
128 return ReverseIterator.init(wordbreak, slice); 127 return ReverseIterator.init(wordbreak, slice);
129} 128}
130 129
130/// An iterator, forward, over all words in a provided string.
131pub const Iterator = struct { 131pub const Iterator = struct {
132 this: ?CodePoint = null, 132 this: ?CodePoint = null,
133 that: ?CodePoint = null, 133 that: ?CodePoint = null,
@@ -150,37 +150,7 @@ pub const Iterator = struct {
150 return iter.next(); 150 return iter.next();
151 } 151 }
152 152
153 /// Initialize an Iterator at the provided index. Assumes str is valid 153 /// Returns a reverse iterator from the point this iterator is paused
154 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
155 /// empty. Note that for various stateful reasons, this may give spurious
156 /// results if used naïvely. If you want to reliably iterate from an index,
157 /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an
158 /// iterator at `wb.initAtIndex(string, word.offset)`.
159 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
160 assert(index < string.len and string.len > 0);
161 // Just in case...
162 if (index == 0) return wb.iterator(string);
163 var idx: u32 = @intCast(index);
164 // Back up past any any follow bytes:
165 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
166 var iter: Iterator = undefined;
167 iter.wb = wb;
168 // We need to populate the CodePoints, and the codepoint iterator.
169 // Consider "abc |def" with the cursor on d.
170 // We need `this` to be ` ` and `that` to be 'd',
171 // and `cp_iter.next()` to be `e`.
172 var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
173 // Reverse gives us before `d`:
174 iter.this = cp_back.prev(); // that == ` `
175 // This iterator will give us `d`:
176 iter.cp_iter = .{ .bytes = string, .i = idx };
177 iter.that = iter.cp_iter.next();
178 // So that the next call will will give us `e`,
179 // thus the word will be `def`.
180 return iter;
181 }
182
183 /// Return a reverse iterator from the point this iterator is paused
184 /// at. Usually, calling `prev()` will return the word just seen. 154 /// at. Usually, calling `prev()` will return the word just seen.
185 pub fn reverseIterator(iter: *Iterator) ReverseIterator { 155 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
186 var cp_it = iter.cp_iter.reverseIterator(); 156 var cp_it = iter.cp_iter.reverseIterator();
@@ -196,7 +166,7 @@ pub const Iterator = struct {
196 }; 166 };
197 } 167 }
198 168
199 /// Returns the next word segment. 169 /// Returns the next word segment, if any.
200 pub fn next(iter: *Iterator) ?Word { 170 pub fn next(iter: *Iterator) ?Word {
201 iter.advance(); 171 iter.advance();
202 172
@@ -338,6 +308,7 @@ pub const Iterator = struct {
338 } 308 }
339}; 309};
340 310
311/// An iterator, backward, over all words in a provided string.
341pub const ReverseIterator = struct { 312pub const ReverseIterator = struct {
342 after: ?CodePoint = null, 313 after: ?CodePoint = null,
343 before: ?CodePoint = null, 314 before: ?CodePoint = null,
@@ -352,16 +323,7 @@ pub const ReverseIterator = struct {
352 return wb_iter; 323 return wb_iter;
353 } 324 }
354 325
355 /// Initialize a ReverseIterator at the provided index. Assumes str is valid 326 /// Returns the previous word segment, if any, without advancing.
356 /// UTF-8, asserts that `index` is less than str.len, and that `str` is not
357 /// empty. You should prefer not to use this function, see Iterator.initAtIndex
358 /// for more details.
359 pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator {
360 var fw_iter = Iterator.initAtIndex(wb, string, index);
361 return fw_iter.reverseIterator();
362 }
363
364 /// Returns the previous word segment, without advancing.
365 pub fn peek(iter: *ReverseIterator) ?Word { 327 pub fn peek(iter: *ReverseIterator) ?Word {
366 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; 328 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
367 defer { 329 defer {
@@ -544,6 +506,27 @@ pub const ReverseIterator = struct {
544 } 506 }
545}; 507};
546 508
509//| Implementation Details
510
511/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex.
512fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
513 var idx: u32 = @intCast(index);
514 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
515 if (idx == string.len) return wb.reverseIterator(string);
516 var iter: ReverseIterator = undefined;
517 iter.wb = wb;
518 iter.flags = 0;
519 // We need to populate the CodePoints, and the codepoint iterator.
520 // Consider "abc| def" with the cursor as |.
521 // We need `before` to be `c` and `after` to be ' ',
522 // and `cp_iter.prev()` to be `b`.
523 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
524 iter.after = cp_iter.prev();
525 iter.before = cp_iter.prev();
526 iter.cp_iter = cp_iter;
527 return iter;
528}
529
547fn sneaky(iter: *const ReverseIterator) SneakIterator { 530fn sneaky(iter: *const ReverseIterator) SneakIterator {
548 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; 531 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
549} 532}
@@ -656,23 +639,23 @@ test "ext_pict" {
656 try testing.expect(ext_pict.isMatch("\u{2701}")); 639 try testing.expect(ext_pict.isMatch("\u{2701}"));
657} 640}
658 641
659test wordAtCursor { 642test wordAtIndex {
660 const wb = try WordBreak.init(testing.allocator); 643 const wb = try WordBreak.init(testing.allocator);
661 defer wb.deinit(testing.allocator); 644 defer wb.deinit(testing.allocator);
662 const t_string = "first second third"; 645 const t_string = "first second third";
663 const second = wb.wordAtCursor(t_string, 8); 646 const second = wb.wordAtIndex(t_string, 8);
664 try testing.expectEqualStrings("second", second.bytes(t_string)); 647 try testing.expectEqualStrings("second", second.bytes(t_string));
665 const third = wb.wordAtCursor(t_string, 14); 648 const third = wb.wordAtIndex(t_string, 14);
666 try testing.expectEqualStrings("third", third.bytes(t_string)); 649 try testing.expectEqualStrings("third", third.bytes(t_string));
667 { 650 {
668 const first = wb.wordAtCursor(t_string, 3); 651 const first = wb.wordAtIndex(t_string, 3);
669 try testing.expectEqualStrings("first", first.bytes(t_string)); 652 try testing.expectEqualStrings("first", first.bytes(t_string));
670 } 653 }
671 { 654 {
672 const first = wb.wordAtCursor(t_string, 0); 655 const first = wb.wordAtIndex(t_string, 0);
673 try testing.expectEqualStrings("first", first.bytes(t_string)); 656 try testing.expectEqualStrings("first", first.bytes(t_string));
674 } 657 }
675 const last = wb.wordAtCursor(t_string, 14); 658 const last = wb.wordAtIndex(t_string, 14);
676 try testing.expectEqualStrings("third", last.bytes(t_string)); 659 try testing.expectEqualStrings("third", last.bytes(t_string));
677} 660}
678 661
diff --git a/src/code_point.zig b/src/code_point.zig
index 43b38d2..9a84080 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -272,7 +272,6 @@ pub const ReverseIterator = struct {
272 272
273 while (i_prev > 0) : (i_prev -= 1) { 273 while (i_prev > 0) : (i_prev -= 1) {
274 if (!followbyte(iter.bytes[i_prev])) break; 274 if (!followbyte(iter.bytes[i_prev])) break;
275 if (i_prev == 0) break;
276 } 275 }
277 276
278 if (i_prev > 0) 277 if (i_prev > 0)
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ef459bf..8b02e98 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -222,32 +222,58 @@ test "Segmentation Word Iterator" {
222 try want.append(Word{ .len = gc_len, .offset = bytes_index }); 222 try want.append(Word{ .len = gc_len, .offset = bytes_index });
223 bytes_index += cp_index; 223 bytes_index += cp_index;
224 } 224 }
225 const this_str = all_bytes.items;
226
225 { 227 {
226 var iter = wb.iterator(all_bytes.items); 228 var iter = wb.iterator(this_str);
227 var peeked: ?Word = iter.peek(); 229 var peeked: ?Word = iter.peek();
228 230
229 // Check. 231 // Check.
230 for (want.items, 1..) |want_word, i| { 232 for (want.items, 1..) |want_word, idx| {
231 const got_word = (iter.next()).?; 233 const got_word = (iter.next()).?;
232 std.testing.expectEqualStrings( 234 std.testing.expectEqualStrings(
233 want_word.bytes(all_bytes.items), 235 want_word.bytes(this_str),
234 got_word.bytes(all_bytes.items), 236 got_word.bytes(this_str),
235 ) catch |err| { 237 ) catch |err| {
236 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); 238 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx });
237 return err; 239 return err;
238 }; 240 };
239 std.testing.expectEqualStrings( 241 std.testing.expectEqualStrings(
240 peeked.?.bytes(all_bytes.items), 242 peeked.?.bytes(this_str),
241 got_word.bytes(all_bytes.items), 243 got_word.bytes(this_str),
242 ) catch |err| { 244 ) catch |err| {
243 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); 245 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx });
244 return err; 246 return err;
245 }; 247 };
248 var r_iter = iter.reverseIterator();
249 const if_r_word = r_iter.prev();
250 if (if_r_word) |r_word| {
251 std.testing.expectEqualStrings(
252 want_word.bytes(this_str),
253 r_word.bytes(this_str),
254 ) catch |err| {
255 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
256 return err;
257 };
258 } else {
259 try testing.expect(false);
260 }
261 for (got_word.offset..got_word.offset + got_word.len) |i| {
262 const this_word = wb.wordAtIndex(this_str, i);
263 std.testing.expectEqualSlices(
264 u8,
265 got_word.bytes(this_str),
266 this_word.bytes(this_str),
267 ) catch |err| {
268 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
269 return err;
270 };
271 }
246 peeked = iter.peek(); 272 peeked = iter.peek();
247 } 273 }
248 } 274 }
249 { 275 {
250 var r_iter = wb.reverseIterator(all_bytes.items); 276 var r_iter = wb.reverseIterator(this_str);
251 var peeked: ?Word = r_iter.peek(); 277 var peeked: ?Word = r_iter.peek();
252 var idx = want.items.len - 1; 278 var idx = want.items.len - 1;
253 279
@@ -256,19 +282,43 @@ test "Segmentation Word Iterator" {
256 const got_word = r_iter.prev().?; 282 const got_word = r_iter.prev().?;
257 std.testing.expectEqualSlices( 283 std.testing.expectEqualSlices(
258 u8, 284 u8,
259 want_word.bytes(all_bytes.items), 285 want_word.bytes(this_str),
260 got_word.bytes(all_bytes.items), 286 got_word.bytes(this_str),
261 ) catch |err| { 287 ) catch |err| {
262 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); 288 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
263 return err; 289 return err;
264 }; 290 };
265 std.testing.expectEqualStrings( 291 std.testing.expectEqualStrings(
266 peeked.?.bytes(all_bytes.items), 292 peeked.?.bytes(this_str),
267 got_word.bytes(all_bytes.items), 293 got_word.bytes(this_str),
268 ) catch |err| { 294 ) catch |err| {
269 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); 295 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 });
270 return err; 296 return err;
271 }; 297 };
298 var f_iter = r_iter.forwardIterator();
299 const if_f_word = f_iter.next();
300 if (if_f_word) |f_word| {
301 std.testing.expectEqualStrings(
302 want_word.bytes(this_str),
303 f_word.bytes(this_str),
304 ) catch |err| {
305 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
306 return err;
307 };
308 } else {
309 try testing.expect(false);
310 }
311 for (got_word.offset..got_word.offset + got_word.len) |i| {
312 const this_word = wb.wordAtIndex(this_str, i);
313 std.testing.expectEqualSlices(
314 u8,
315 got_word.bytes(this_str),
316 this_word.bytes(this_str),
317 ) catch |err| {
318 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
319 return err;
320 };
321 }
272 peeked = r_iter.peek(); 322 peeked = r_iter.peek();
273 if (idx == 0) break; 323 if (idx == 0) break;
274 } 324 }