summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build.zig.zon2
-rw-r--r--src/Words.zig98
-rw-r--r--src/unicode_tests.zig38
3 files changed, 105 insertions, 33 deletions
diff --git a/build.zig.zon b/build.zig.zon
index b69249f..3e1df95 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -1,6 +1,6 @@
1.{ 1.{
2 .name = .zg, 2 .name = .zg,
3 .version = "0.14.0", 3 .version = "0.14.1",
4 .minimum_zig_version = "0.14.0", 4 .minimum_zig_version = "0.14.0",
5 .fingerprint = 0x47df7778dc946aa0, 5 .fingerprint = 0x47df7778dc946aa0,
6 6
diff --git a/src/Words.zig b/src/Words.zig
index 565a2fb..1d10b2a 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -1,4 +1,7 @@
1//! Word Breaking Algorithm. 1//! Word Breaking Algorithm.
2//!
3//! https://www.unicode.org/reports/tr29/#Word_Boundaries
4//!
2 5
3const WordBreakProperty = enum(u5) { 6const WordBreakProperty = enum(u5) {
4 none, 7 none,
@@ -42,9 +45,9 @@ pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
42 }; 45 };
43} 46}
44 47
45pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { 48pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
46 allocator.free(wordbreak.s1); 49 allocator.free(words.s1);
47 allocator.free(wordbreak.s2); 50 allocator.free(words.s2);
48} 51}
49 52
50/// Represents a Unicode word span, as an offset into the source string 53/// Represents a Unicode word span, as an offset into the source string
@@ -54,51 +57,44 @@ pub const Word = struct {
54 len: u32, 57 len: u32,
55 58
56 /// Returns a slice of the word given the source string. 59 /// Returns a slice of the word given the source string.
57 pub fn bytes(self: Word, src: []const u8) []const u8 { 60 pub fn bytes(word: Word, src: []const u8) []const u8 {
58 return src[self.offset..][0..self.len]; 61 return src[word.offset..][0..word.len];
59 } 62 }
60}; 63};
61 64
62/// Returns the word break property type for `cp`. 65/// Returns the word break property type for `cp`.
63pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { 66pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 67 return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]);
65} 68}
66 69
67/// Convenience function for working with CodePoints 70/// Convenience function for working with CodePoints
68fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { 71fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty {
69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); 72 return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]);
70} 73}
71 74
72/// Returns the Word at the given index. Asserts that the index is less than 75/// Returns the Word at the given index. Asserts that the index is less than
73/// `string.len`, and that the string is not empty. Always returns a word. 76/// `string.len`, and that the string is not empty. Always returns a word.
74/// The index does not have to be the start of a codepoint in the word. 77/// The index does not have to be the start of a codepoint in the word.
75pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { 78pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
76 assert(index < string.len and string.len > 0); 79 assert(index < string.len and string.len > 0);
77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); 80 var iter_back: ReverseIterator = reverseFromIndex(words, string, index);
78 const first_back = iter_back.prev(); 81 const first_back = iter_back.prev();
79 if (first_back) |back| { 82 if (first_back) |back| {
80 if (back.offset == 0) { 83 if (back.offset == 0) {
81 var iter_fwd = wordbreak.iterator(string); 84 var iter_fwd = words.iterator(string);
82 while (iter_fwd.next()) |word| { 85 while (iter_fwd.next()) |word| {
83 if (word.offset <= index and index < word.offset + word.len) 86 if (word.offset <= index and index < word.offset + word.len)
84 return word; 87 return word;
85 } 88 }
86 } 89 }
87 } else { 90 } else {
88 var iter_fwd = wordbreak.iterator(string); 91 var iter_fwd = words.iterator(string);
89 while (iter_fwd.next()) |word| { 92 while (iter_fwd.next()) |word| {
90 if (word.offset <= index and index < word.offset + word.len) 93 if (word.offset <= index and index < word.offset + word.len)
91 return word; 94 return word;
92 } 95 }
93 } 96 }
94 const second_back = iter_back.prev(); 97 _ = iter_back.prev();
95 if (second_back) |back| if (back.offset == 0) {
96 var iter_fwd = wordbreak.iterator(string);
97 while (iter_fwd.next()) |word| {
98 if (word.offset <= index and index < word.offset + word.len)
99 return word;
100 }
101 };
102 // There's sometimes flags: 98 // There's sometimes flags:
103 if (iter_back.flags > 0) { 99 if (iter_back.flags > 0) {
104 while (iter_back.flags > 0) { 100 while (iter_back.flags > 0) {
@@ -118,13 +114,23 @@ pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Wo
118} 114}
119 115
120/// Returns an iterator over words in `slice`. 116/// Returns an iterator over words in `slice`.
121pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { 117pub fn iterator(words: *const Words, slice: []const u8) Iterator {
122 return Iterator.init(wordbreak, slice); 118 return Iterator.init(words, slice);
123} 119}
124 120
125/// Returns a reverse iterator over the words in `slice`. 121/// Returns a reverse iterator over the words in `slice`.
126pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { 122pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
127 return ReverseIterator.init(wordbreak, slice); 123 return ReverseIterator.init(words, slice);
124}
125
126/// Returns an iterator after the `word` in `slice`.
127pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator {
128 return forwardFromIndex(words, slice, word.offset + word.len);
129}
130
131/// Returns a reverse iterator before the `word` in `slice`.
132pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
133 return reverseFromIndex(words, slice, word.offset);
128} 134}
129 135
130/// An iterator, forward, over all words in a provided string. 136/// An iterator, forward, over all words in a provided string.
@@ -135,8 +141,8 @@ pub const Iterator = struct {
135 wb: *const Words, 141 wb: *const Words,
136 142
137 /// Assumes `str` is valid UTF-8. 143 /// Assumes `str` is valid UTF-8.
138 pub fn init(wb: *const Words, str: []const u8) Iterator { 144 pub fn init(words: *const Words, str: []const u8) Iterator {
139 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; 145 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words };
140 wb_iter.advance(); 146 wb_iter.advance();
141 return wb_iter; 147 return wb_iter;
142 } 148 }
@@ -318,8 +324,8 @@ pub const ReverseIterator = struct {
318 flags: usize = 0, 324 flags: usize = 0,
319 325
320 /// Assumes `str` is valid UTF-8. 326 /// Assumes `str` is valid UTF-8.
321 pub fn init(wb: *const Words, str: []const u8) ReverseIterator { 327 pub fn init(words: *const Words, str: []const u8) ReverseIterator {
322 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; 328 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words };
323 wb_iter.advance(); 329 wb_iter.advance();
324 return wb_iter; 330 return wb_iter;
325 } 331 }
@@ -511,13 +517,13 @@ pub const ReverseIterator = struct {
511//| Implementation Details 517//| Implementation Details
512 518
513/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. 519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
514fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { 520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
515 var idx: u32 = @intCast(index); 521 var idx: u32 = @intCast(index);
516 // Find the next lead byte: 522 // Find the next lead byte:
517 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} 523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
518 if (idx == string.len) return wb.reverseIterator(string); 524 if (idx == string.len) return words.reverseIterator(string);
519 var iter: ReverseIterator = undefined; 525 var iter: ReverseIterator = undefined;
520 iter.wb = wb; 526 iter.wb = words;
521 iter.flags = 0; 527 iter.flags = 0;
522 // We need to populate the CodePoints, and the codepoint iterator. 528 // We need to populate the CodePoints, and the codepoint iterator.
523 // Consider "abc| def" with the cursor as |. 529 // Consider "abc| def" with the cursor as |.
@@ -530,6 +536,34 @@ fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterat
530 return iter; 536 return iter;
531} 537}
532 538
539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
540 var idx: u32 = @intCast(index);
541 if (idx == string.len) {
542 return .{
543 .cp_iter = .{ .bytes = string, .i = idx },
544 .this = null,
545 .that = null,
546 .wb = words,
547 };
548 }
549 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
550 if (idx == 0) return words.iterator(string);
551 var iter: Iterator = undefined;
552 iter.wb = words;
553 // We need to populate the CodePoints, and the codepoint iterator.
554 // Consider "abc |def" with the cursor as |.
555 // We need `this` to be ` ` and `that` to be 'd',
556 // and `cp_iter.next()` to be `d`.
557 idx -= 1;
558 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
559 // "abc| def"
560 var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
561 iter.this = cp_iter.next();
562 iter.that = cp_iter.next();
563 iter.cp_iter = cp_iter;
564 return iter;
565}
566
533fn sneaky(iter: *const ReverseIterator) SneakIterator { 567fn sneaky(iter: *const ReverseIterator) SneakIterator {
534 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; 568 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
535} 569}
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 18f1814..195fdcb 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -287,6 +287,25 @@ test "Segmentation Word Iterator" {
287 } else { 287 } else {
288 try testing.expect(false); 288 try testing.expect(false);
289 } 289 }
290 var peek_iter = wb.iterateAfter(this_str, got_word);
291 const peek_1 = peek_iter.next();
292 if (peek_1) |p1| {
293 const peek_2 = iter.peek();
294 if (peek_2) |p2| {
295 std.testing.expectEqualSlices(
296 u8,
297 p1.bytes(this_str),
298 p2.bytes(this_str),
299 ) catch |err| {
300 debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
301 return err;
302 };
303 } else {
304 try testing.expect(false);
305 }
306 } else {
307 try testing.expectEqual(null, iter.peek());
308 }
290 for (got_word.offset..got_word.offset + got_word.len) |i| { 309 for (got_word.offset..got_word.offset + got_word.len) |i| {
291 const this_word = wb.wordAtIndex(this_str, i); 310 const this_word = wb.wordAtIndex(this_str, i);
292 std.testing.expectEqualSlices( 311 std.testing.expectEqualSlices(
@@ -337,6 +356,25 @@ test "Segmentation Word Iterator" {
337 } else { 356 } else {
338 try testing.expect(false); 357 try testing.expect(false);
339 } 358 }
359 var peek_iter = wb.iterateBefore(this_str, got_word);
360 const peek_1 = peek_iter.prev();
361 if (peek_1) |p1| {
362 const peek_2 = r_iter.peek();
363 if (peek_2) |p2| {
364 std.testing.expectEqualSlices(
365 u8,
366 p1.bytes(this_str),
367 p2.bytes(this_str),
368 ) catch |err| {
369 debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
370 return err;
371 };
372 } else {
373 try testing.expect(false);
374 }
375 } else {
376 try testing.expectEqual(null, r_iter.peek());
377 }
340 for (got_word.offset..got_word.offset + got_word.len) |i| { 378 for (got_word.offset..got_word.offset + got_word.len) |i| {
341 const this_word = wb.wordAtIndex(this_str, i); 379 const this_word = wb.wordAtIndex(this_str, i);
342 std.testing.expectEqualSlices( 380 std.testing.expectEqualSlices(