diff options
| -rw-r--r-- | src/WordBreak.zig | 161 | ||||
| -rw-r--r-- | src/code_point.zig | 1 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 76 |
3 files changed, 135 insertions, 103 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 7ac8f14..6ada7e1 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -64,58 +64,57 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | |||
| 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | /// Convenience function for working with CodePoints | ||
| 67 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | 68 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { |
| 68 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); |
| 69 | } | 70 | } |
| 70 | 71 | ||
| 71 | /// Returns the Word at the given index. Asserts that the index is valid for | 72 | /// Returns the Word at the given index. Asserts that the index is less than |
| 72 | /// the provided string, and that the string is not empty. Always returns a word. | 73 | /// `string.len`, and that the string is not empty. Always returns a word. |
| 73 | /// The index does not have to be the start of a codepoint in the word. | 74 | /// The index does not have to be the start of a codepoint in the word. |
| 74 | pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | 75 | pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { |
| 75 | assert(index < string.len and string.len > 0); | 76 | assert(index < string.len and string.len > 0); |
| 76 | var iter_fwd: Iterator = .initAtIndex(wordbreak, string, index); | 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); |
| 77 | if (iter_fwd.next()) |_| { | 78 | const first_back = iter_back.prev(); |
| 78 | // This is a bit tricky because we may have been in the middle of various | 79 | if (first_back) |back| { |
| 79 | // stateful things. So we go forward again: | 80 | if (back.offset == 0) { |
| 80 | if (iter_fwd.next()) |_| { | 81 | var iter_fwd = wordbreak.iterator(string); |
| 81 | // Make a back iterator: | 82 | while (iter_fwd.next()) |word| { |
| 82 | var iter_back = iter_fwd.reverseIterator(); | 83 | if (word.offset <= index and index < word.offset + word.len) |
| 83 | const last_word = iter_back.prev().?; // Always works. | ||
| 84 | const no_flags = iter_back.flags == 0; | ||
| 85 | if (no_flags) { | ||
| 86 | // Next previous is our word. | ||
| 87 | const the_word = iter_back.prev(); | ||
| 88 | if (the_word) |word| { | ||
| 89 | assert(word.offset <= index and index <= word.offset + word.len); | ||
| 90 | return word; | 84 | return word; |
| 91 | } else { // Can happen, at least I think so | 85 | } |
| 92 | assert(last_word.offset <= index and index <= last_word.offset + last_word.len); | 86 | } |
| 93 | return last_word; | 87 | } else { |
| 94 | } | 88 | var iter_fwd = wordbreak.iterator(string); |
| 89 | while (iter_fwd.next()) |word| { | ||
| 90 | if (word.offset <= index and index < word.offset + word.len) | ||
| 91 | return word; | ||
| 92 | } | ||
| 93 | } | ||
| 94 | const second_back = iter_back.prev(); | ||
| 95 | if (second_back) |back| if (back.offset == 0) { | ||
| 96 | var iter_fwd = wordbreak.iterator(string); | ||
| 97 | while (iter_fwd.next()) |word| { | ||
| 98 | if (word.offset <= index and index < word.offset + word.len) | ||
| 99 | return word; | ||
| 100 | } | ||
| 101 | }; | ||
| 102 | // There's sometimes flags: | ||
| 103 | if (iter_back.flags > 0) { | ||
| 104 | while (iter_back.flags > 0) { | ||
| 105 | if (iter_back.prev()) |_| { | ||
| 106 | continue; | ||
| 95 | } else { | 107 | } else { |
| 96 | // Scan past all the flags. | 108 | break; |
| 97 | while (iter_back.flags > 0) { | ||
| 98 | _ = iter_back.prev(); | ||
| 99 | } | ||
| 100 | // Now just look for our word | ||
| 101 | iter_fwd = iter_back.forwardIterator(); | ||
| 102 | while (iter_fwd.next()) |word| { | ||
| 103 | if (word.offset <= index and index <= word.offset + word.len) { | ||
| 104 | return word; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | unreachable; | ||
| 108 | } | 109 | } |
| 109 | } else { // We can just reverse here | ||
| 110 | var iter_back = iter_fwd.reverseIterator(); | ||
| 111 | const word = iter_back.prev().?; | ||
| 112 | assert(word.offset <= index and index <= word.offset + word.len); | ||
| 113 | return word; | ||
| 114 | } | 110 | } |
| 115 | } else { // last word then | ||
| 116 | var iter_back = iter_fwd.reverseIterator(); | ||
| 117 | return iter_back.prev().?; | ||
| 118 | } | 111 | } |
| 112 | var iter_fwd = iter_back.forwardIterator(); | ||
| 113 | while (iter_fwd.next()) |word| { | ||
| 114 | if (word.offset <= index and index < word.offset + word.len) | ||
| 115 | return word; | ||
| 116 | } | ||
| 117 | unreachable; | ||
| 119 | } | 118 | } |
| 120 | 119 | ||
| 121 | /// Returns an iterator over words in `slice`. | 120 | /// Returns an iterator over words in `slice`. |
| @@ -128,6 +127,7 @@ pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIt | |||
| 128 | return ReverseIterator.init(wordbreak, slice); | 127 | return ReverseIterator.init(wordbreak, slice); |
| 129 | } | 128 | } |
| 130 | 129 | ||
| 130 | /// An iterator, forward, over all words in a provided string. | ||
| 131 | pub const Iterator = struct { | 131 | pub const Iterator = struct { |
| 132 | this: ?CodePoint = null, | 132 | this: ?CodePoint = null, |
| 133 | that: ?CodePoint = null, | 133 | that: ?CodePoint = null, |
| @@ -150,37 +150,7 @@ pub const Iterator = struct { | |||
| 150 | return iter.next(); | 150 | return iter.next(); |
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | /// Initialize an Iterator at the provided index. Assumes str is valid | 153 | /// Returns a reverse iterator from the point this iterator is paused |
| 154 | /// UTF-8, asserts that `index` is less than str.len, and that `str` is not | ||
| 155 | /// empty. Note that for various stateful reasons, this may give spurious | ||
| 156 | /// results if used naïvely. If you want to reliably iterate from an index, | ||
| 157 | /// use `wb.wordAtIndex(string, index)` to obtain the word, then start an | ||
| 158 | /// iterator at `wb.initAtIndex(string, word.offset)`. | ||
| 159 | pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { | ||
| 160 | assert(index < string.len and string.len > 0); | ||
| 161 | // Just in case... | ||
| 162 | if (index == 0) return wb.iterator(string); | ||
| 163 | var idx: u32 = @intCast(index); | ||
| 164 | // Back up past any any follow bytes: | ||
| 165 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | ||
| 166 | var iter: Iterator = undefined; | ||
| 167 | iter.wb = wb; | ||
| 168 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 169 | // Consider "abc |def" with the cursor on d. | ||
| 170 | // We need `this` to be ` ` and `that` to be 'd', | ||
| 171 | // and `cp_iter.next()` to be `e`. | ||
| 172 | var cp_back: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 173 | // Reverse gives us before `d`: | ||
| 174 | iter.this = cp_back.prev(); // that == ` ` | ||
| 175 | // This iterator will give us `d`: | ||
| 176 | iter.cp_iter = .{ .bytes = string, .i = idx }; | ||
| 177 | iter.that = iter.cp_iter.next(); | ||
| 178 | // So that the next call will will give us `e`, | ||
| 179 | // thus the word will be `def`. | ||
| 180 | return iter; | ||
| 181 | } | ||
| 182 | |||
| 183 | /// Return a reverse iterator from the point this iterator is paused | ||
| 184 | /// at. Usually, calling `prev()` will return the word just seen. | 154 | /// at. Usually, calling `prev()` will return the word just seen. |
| 185 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { | 155 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { |
| 186 | var cp_it = iter.cp_iter.reverseIterator(); | 156 | var cp_it = iter.cp_iter.reverseIterator(); |
| @@ -196,7 +166,7 @@ pub const Iterator = struct { | |||
| 196 | }; | 166 | }; |
| 197 | } | 167 | } |
| 198 | 168 | ||
| 199 | /// Returns the next word segment. | 169 | /// Returns the next word segment, if any. |
| 200 | pub fn next(iter: *Iterator) ?Word { | 170 | pub fn next(iter: *Iterator) ?Word { |
| 201 | iter.advance(); | 171 | iter.advance(); |
| 202 | 172 | ||
| @@ -338,6 +308,7 @@ pub const Iterator = struct { | |||
| 338 | } | 308 | } |
| 339 | }; | 309 | }; |
| 340 | 310 | ||
| 311 | /// An iterator, backward, over all words in a provided string. | ||
| 341 | pub const ReverseIterator = struct { | 312 | pub const ReverseIterator = struct { |
| 342 | after: ?CodePoint = null, | 313 | after: ?CodePoint = null, |
| 343 | before: ?CodePoint = null, | 314 | before: ?CodePoint = null, |
| @@ -352,16 +323,7 @@ pub const ReverseIterator = struct { | |||
| 352 | return wb_iter; | 323 | return wb_iter; |
| 353 | } | 324 | } |
| 354 | 325 | ||
| 355 | /// Initialize a ReverseIterator at the provided index. Assumes str is valid | 326 | /// Returns the previous word segment, if any, without advancing. |
| 356 | /// UTF-8, asserts that `index` is less than str.len, and that `str` is not | ||
| 357 | /// empty. You should prefer not to use this function, see Iterator.initAtIndex | ||
| 358 | /// for more details. | ||
| 359 | pub fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) Iterator { | ||
| 360 | var fw_iter = Iterator.initAtIndex(wb, string, index); | ||
| 361 | return fw_iter.reverseIterator(); | ||
| 362 | } | ||
| 363 | |||
| 364 | /// Returns the previous word segment, without advancing. | ||
| 365 | pub fn peek(iter: *ReverseIterator) ?Word { | 327 | pub fn peek(iter: *ReverseIterator) ?Word { |
| 366 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; | 328 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; |
| 367 | defer { | 329 | defer { |
| @@ -544,6 +506,27 @@ pub const ReverseIterator = struct { | |||
| 544 | } | 506 | } |
| 545 | }; | 507 | }; |
| 546 | 508 | ||
| 509 | //| Implementation Details | ||
| 510 | |||
| 511 | /// Initialize a ReverseIterator at the provided index. Used in wordAtIndex. | ||
| 512 | fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { | ||
| 513 | var idx: u32 = @intCast(index); | ||
| 514 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | ||
| 515 | if (idx == string.len) return wb.reverseIterator(string); | ||
| 516 | var iter: ReverseIterator = undefined; | ||
| 517 | iter.wb = wb; | ||
| 518 | iter.flags = 0; | ||
| 519 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 520 | // Consider "abc| def" with the cursor as |. | ||
| 521 | // We need `before` to be `c` and `after` to be ' ', | ||
| 522 | // and `cp_iter.prev()` to be `b`. | ||
| 523 | var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 524 | iter.after = cp_iter.prev(); | ||
| 525 | iter.before = cp_iter.prev(); | ||
| 526 | iter.cp_iter = cp_iter; | ||
| 527 | return iter; | ||
| 528 | } | ||
| 529 | |||
| 547 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | 530 | fn sneaky(iter: *const ReverseIterator) SneakIterator { |
| 548 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | 531 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; |
| 549 | } | 532 | } |
| @@ -656,23 +639,23 @@ test "ext_pict" { | |||
| 656 | try testing.expect(ext_pict.isMatch("\u{2701}")); | 639 | try testing.expect(ext_pict.isMatch("\u{2701}")); |
| 657 | } | 640 | } |
| 658 | 641 | ||
| 659 | test wordAtCursor { | 642 | test wordAtIndex { |
| 660 | const wb = try WordBreak.init(testing.allocator); | 643 | const wb = try WordBreak.init(testing.allocator); |
| 661 | defer wb.deinit(testing.allocator); | 644 | defer wb.deinit(testing.allocator); |
| 662 | const t_string = "first second third"; | 645 | const t_string = "first second third"; |
| 663 | const second = wb.wordAtCursor(t_string, 8); | 646 | const second = wb.wordAtIndex(t_string, 8); |
| 664 | try testing.expectEqualStrings("second", second.bytes(t_string)); | 647 | try testing.expectEqualStrings("second", second.bytes(t_string)); |
| 665 | const third = wb.wordAtCursor(t_string, 14); | 648 | const third = wb.wordAtIndex(t_string, 14); |
| 666 | try testing.expectEqualStrings("third", third.bytes(t_string)); | 649 | try testing.expectEqualStrings("third", third.bytes(t_string)); |
| 667 | { | 650 | { |
| 668 | const first = wb.wordAtCursor(t_string, 3); | 651 | const first = wb.wordAtIndex(t_string, 3); |
| 669 | try testing.expectEqualStrings("first", first.bytes(t_string)); | 652 | try testing.expectEqualStrings("first", first.bytes(t_string)); |
| 670 | } | 653 | } |
| 671 | { | 654 | { |
| 672 | const first = wb.wordAtCursor(t_string, 0); | 655 | const first = wb.wordAtIndex(t_string, 0); |
| 673 | try testing.expectEqualStrings("first", first.bytes(t_string)); | 656 | try testing.expectEqualStrings("first", first.bytes(t_string)); |
| 674 | } | 657 | } |
| 675 | const last = wb.wordAtCursor(t_string, 14); | 658 | const last = wb.wordAtIndex(t_string, 14); |
| 676 | try testing.expectEqualStrings("third", last.bytes(t_string)); | 659 | try testing.expectEqualStrings("third", last.bytes(t_string)); |
| 677 | } | 660 | } |
| 678 | 661 | ||
diff --git a/src/code_point.zig b/src/code_point.zig index 43b38d2..9a84080 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -272,7 +272,6 @@ pub const ReverseIterator = struct { | |||
| 272 | 272 | ||
| 273 | while (i_prev > 0) : (i_prev -= 1) { | 273 | while (i_prev > 0) : (i_prev -= 1) { |
| 274 | if (!followbyte(iter.bytes[i_prev])) break; | 274 | if (!followbyte(iter.bytes[i_prev])) break; |
| 275 | if (i_prev == 0) break; | ||
| 276 | } | 275 | } |
| 277 | 276 | ||
| 278 | if (i_prev > 0) | 277 | if (i_prev > 0) |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ef459bf..8b02e98 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -222,32 +222,58 @@ test "Segmentation Word Iterator" { | |||
| 222 | try want.append(Word{ .len = gc_len, .offset = bytes_index }); | 222 | try want.append(Word{ .len = gc_len, .offset = bytes_index }); |
| 223 | bytes_index += cp_index; | 223 | bytes_index += cp_index; |
| 224 | } | 224 | } |
| 225 | const this_str = all_bytes.items; | ||
| 226 | |||
| 225 | { | 227 | { |
| 226 | var iter = wb.iterator(all_bytes.items); | 228 | var iter = wb.iterator(this_str); |
| 227 | var peeked: ?Word = iter.peek(); | 229 | var peeked: ?Word = iter.peek(); |
| 228 | 230 | ||
| 229 | // Check. | 231 | // Check. |
| 230 | for (want.items, 1..) |want_word, i| { | 232 | for (want.items, 1..) |want_word, idx| { |
| 231 | const got_word = (iter.next()).?; | 233 | const got_word = (iter.next()).?; |
| 232 | std.testing.expectEqualStrings( | 234 | std.testing.expectEqualStrings( |
| 233 | want_word.bytes(all_bytes.items), | 235 | want_word.bytes(this_str), |
| 234 | got_word.bytes(all_bytes.items), | 236 | got_word.bytes(this_str), |
| 235 | ) catch |err| { | 237 | ) catch |err| { |
| 236 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); | 238 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx }); |
| 237 | return err; | 239 | return err; |
| 238 | }; | 240 | }; |
| 239 | std.testing.expectEqualStrings( | 241 | std.testing.expectEqualStrings( |
| 240 | peeked.?.bytes(all_bytes.items), | 242 | peeked.?.bytes(this_str), |
| 241 | got_word.bytes(all_bytes.items), | 243 | got_word.bytes(this_str), |
| 242 | ) catch |err| { | 244 | ) catch |err| { |
| 243 | debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, i }); | 245 | debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx }); |
| 244 | return err; | 246 | return err; |
| 245 | }; | 247 | }; |
| 248 | var r_iter = iter.reverseIterator(); | ||
| 249 | const if_r_word = r_iter.prev(); | ||
| 250 | if (if_r_word) |r_word| { | ||
| 251 | std.testing.expectEqualStrings( | ||
| 252 | want_word.bytes(this_str), | ||
| 253 | r_word.bytes(this_str), | ||
| 254 | ) catch |err| { | ||
| 255 | debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); | ||
| 256 | return err; | ||
| 257 | }; | ||
| 258 | } else { | ||
| 259 | try testing.expect(false); | ||
| 260 | } | ||
| 261 | for (got_word.offset..got_word.offset + got_word.len) |i| { | ||
| 262 | const this_word = wb.wordAtIndex(this_str, i); | ||
| 263 | std.testing.expectEqualSlices( | ||
| 264 | u8, | ||
| 265 | got_word.bytes(this_str), | ||
| 266 | this_word.bytes(this_str), | ||
| 267 | ) catch |err| { | ||
| 268 | debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); | ||
| 269 | return err; | ||
| 270 | }; | ||
| 271 | } | ||
| 246 | peeked = iter.peek(); | 272 | peeked = iter.peek(); |
| 247 | } | 273 | } |
| 248 | } | 274 | } |
| 249 | { | 275 | { |
| 250 | var r_iter = wb.reverseIterator(all_bytes.items); | 276 | var r_iter = wb.reverseIterator(this_str); |
| 251 | var peeked: ?Word = r_iter.peek(); | 277 | var peeked: ?Word = r_iter.peek(); |
| 252 | var idx = want.items.len - 1; | 278 | var idx = want.items.len - 1; |
| 253 | 279 | ||
| @@ -256,19 +282,43 @@ test "Segmentation Word Iterator" { | |||
| 256 | const got_word = r_iter.prev().?; | 282 | const got_word = r_iter.prev().?; |
| 257 | std.testing.expectEqualSlices( | 283 | std.testing.expectEqualSlices( |
| 258 | u8, | 284 | u8, |
| 259 | want_word.bytes(all_bytes.items), | 285 | want_word.bytes(this_str), |
| 260 | got_word.bytes(all_bytes.items), | 286 | got_word.bytes(this_str), |
| 261 | ) catch |err| { | 287 | ) catch |err| { |
| 262 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); | 288 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); |
| 263 | return err; | 289 | return err; |
| 264 | }; | 290 | }; |
| 265 | std.testing.expectEqualStrings( | 291 | std.testing.expectEqualStrings( |
| 266 | peeked.?.bytes(all_bytes.items), | 292 | peeked.?.bytes(this_str), |
| 267 | got_word.bytes(all_bytes.items), | 293 | got_word.bytes(this_str), |
| 268 | ) catch |err| { | 294 | ) catch |err| { |
| 269 | debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); | 295 | debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); |
| 270 | return err; | 296 | return err; |
| 271 | }; | 297 | }; |
| 298 | var f_iter = r_iter.forwardIterator(); | ||
| 299 | const if_f_word = f_iter.next(); | ||
| 300 | if (if_f_word) |f_word| { | ||
| 301 | std.testing.expectEqualStrings( | ||
| 302 | want_word.bytes(this_str), | ||
| 303 | f_word.bytes(this_str), | ||
| 304 | ) catch |err| { | ||
| 305 | debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); | ||
| 306 | return err; | ||
| 307 | }; | ||
| 308 | } else { | ||
| 309 | try testing.expect(false); | ||
| 310 | } | ||
| 311 | for (got_word.offset..got_word.offset + got_word.len) |i| { | ||
| 312 | const this_word = wb.wordAtIndex(this_str, i); | ||
| 313 | std.testing.expectEqualSlices( | ||
| 314 | u8, | ||
| 315 | got_word.bytes(this_str), | ||
| 316 | this_word.bytes(this_str), | ||
| 317 | ) catch |err| { | ||
| 318 | debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); | ||
| 319 | return err; | ||
| 320 | }; | ||
| 321 | } | ||
| 272 | peeked = r_iter.peek(); | 322 | peeked = r_iter.peek(); |
| 273 | if (idx == 0) break; | 323 | if (idx == 0) break; |
| 274 | } | 324 | } |