summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Sam Atman, 2025-05-13 17:19:56 -0400
committer: Sam Atman, 2025-05-15 15:32:38 -0400
commit: 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a (patch)
tree: f46287fbc0d92238644c23d0b176354567b647d1 /src
parent: Reverse Word Iterator (diff)
download: zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.gz
zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.xz
zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.zip
Hooked up break test, some bugs squashed
The handling of ignorables is really different, because they 'adhere' to the future of the iteration, not the past.
Diffstat (limited to 'src')
-rw-r--r-- src/WordBreak.zig 39
-rw-r--r-- src/code_point.zig 10
-rw-r--r-- src/unicode_tests.zig 49
3 files changed, 64 insertions, 34 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 37c0df9..0cab30e 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -98,11 +98,16 @@ pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usiz
98 return this_word.?; 98 return this_word.?;
99} 99}
100 100
101/// Returns an iterator over words in `slice` 101/// Returns an iterator over words in `slice`.
102pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { 102pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
103 return Iterator.init(wordbreak, slice); 103 return Iterator.init(wordbreak, slice);
104} 104}
105 105
106/// Returns a reverse iterator over the words in `slice`.
107pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {
108 return ReverseIterator.init(wordbreak, slice);
109}
110
106pub const Iterator = struct { 111pub const Iterator = struct {
107 this: ?CodePoint = null, 112 this: ?CodePoint = null,
108 that: ?CodePoint = null, 113 that: ?CodePoint = null,
@@ -111,7 +116,7 @@ pub const Iterator = struct {
111 116
112 /// Assumes `str` is valid UTF-8. 117 /// Assumes `str` is valid UTF-8.
113 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 118 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
114 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; 119 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
115 wb_iter.advance(); 120 wb_iter.advance();
116 return wb_iter; 121 return wb_iter;
117 } 122 }
@@ -267,8 +272,8 @@ pub const ReverseIterator = struct {
267 wb: *const WordBreak, 272 wb: *const WordBreak,
268 273
269 /// Assumes `str` is valid UTF-8. 274 /// Assumes `str` is valid UTF-8.
270 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 275 pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
271 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; 276 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
272 wb_iter.advance(); 277 wb_iter.advance();
273 return wb_iter; 278 return wb_iter;
274 } 279 }
@@ -299,12 +304,19 @@ pub const ReverseIterator = struct {
299 var last_last_p: WordBreakProperty = .none; 304 var last_last_p: WordBreakProperty = .none;
300 var ri_count: usize = 0; 305 var ri_count: usize = 0;
301 306
307 // TODO: Ignorables have to be handled completely differently, unfortunately.
308 // We have to find whatever is before it, match against that, and use that
309 // decision to handle the break we're currently working on.
310 // --
311 // This is achieveable I think. Just need to use peekPast to get that, and then
312 // take it from there. Probably as long as an ignorable is an after_p we just keep
313 // going.
302 scan: while (true) : (iter.advance()) { 314 scan: while (true) : (iter.advance()) {
303 const after = iter.after.?; 315 const after = iter.after.?;
304 word_len += after.len; 316 word_len += after.len;
305 if (iter.before) |before| { 317 if (iter.before) |before| {
306 const after_p = iter.wb.breakProp(after); 318 const after_p = iter.wb.breakProp(after);
307 const before_p = iter.wb.breakProp(before); 319 var before_p = iter.wb.breakProp(before);
308 if (!isIgnorable(after_p)) { 320 if (!isIgnorable(after_p)) {
309 last_last_p = last_p; 321 last_last_p = last_p;
310 last_p = after_p; 322 last_p = after_p;
@@ -322,9 +334,18 @@ pub const ReverseIterator = struct {
322 // WB3d WSegSpace × WSegSpace 334 // WB3d WSegSpace × WSegSpace
323 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; 335 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
324 // WB4 X (Extend | Format | ZWJ)* → X 336 // WB4 X (Extend | Format | ZWJ)* → X
325 if (isIgnorable(after_p)) { 337 if (isIgnorable(before_p)) {
326 continue :scan; 338 const maybe_before = iter.peekPast();
327 } // Now we use last_p instead of after_p for ignorable's sake 339 if (maybe_before) |valid_before| {
340 before_p = iter.wb.breakProp(valid_before);
341 } else if (isIgnorable(after_p)) {
342 continue :scan;
343 // We're done
344 } else {
345 break :scan;
346 }
347 }
348 if (isIgnorable(after_p)) continue :scan;
328 // WB5 AHLetter × AHLetter 349 // WB5 AHLetter × AHLetter
329 if (isAHLetter(last_p) and isAHLetter(before_p)) { 350 if (isAHLetter(last_p) and isAHLetter(before_p)) {
330 continue :scan; 351 continue :scan;
@@ -334,7 +355,7 @@ pub const ReverseIterator = struct {
334 continue :scan; 355 continue :scan;
335 } 356 }
336 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter 357 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
337 if (isMidVal(before_p)) { 358 if (isMidVal(before_p) and isAHLetter(last_p)) {
338 const prev_val = iter.peekPast(); 359 const prev_val = iter.peekPast();
339 if (prev_val) |prev_cp| { 360 if (prev_val) |prev_cp| {
340 const prev_p = iter.wb.breakProp(prev_cp); 361 const prev_p = iter.wb.breakProp(prev_cp);
diff --git a/src/code_point.zig b/src/code_point.zig
index a5b10d4..ba0b434 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -53,22 +53,12 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
53 }; 53 };
54 // Multibyte 54 // Multibyte
55 55
56<<<<<<< HEAD
57 // Second: 56 // Second:
58 var class: u4 = @intCast(u8dfa[byte]); 57 var class: u4 = @intCast(u8dfa[byte]);
59 var st: u32 = state_dfa[class]; 58 var st: u32 = state_dfa[class];
60 if (st == RUNE_REJECT or cursor.* == bytes.len) { 59 if (st == RUNE_REJECT or cursor.* == bytes.len) {
61 @branchHint(.cold); 60 @branchHint(.cold);
62 // First one is never a truncation 61 // First one is never a truncation
63||||||| parent of ad4b046 (Various small iterator improvements)
64 // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte
65 if (cp.len > bytes.len) {
66 // Unicode replacement code point.
67=======
68 // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte.
69 if (cp.len > bytes.len) {
70 // Unicode replacement code point.
71>>>>>>> ad4b046 (Various small iterator improvements)
72 return .{ 62 return .{
73 .code = 0xfffd, 63 .code = 0xfffd,
74 .len = 1, 64 .len = 1,
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 59f0c6f..8661bfd 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -195,7 +195,7 @@ test "Segmentation Word Iterator" {
195 line = line[0..final]; 195 line = line[0..final];
196 } 196 }
197 // Iterate over fields. 197 // Iterate over fields.
198 var want = std.ArrayList(Grapheme).init(allocator); 198 var want = std.ArrayList(Word).init(allocator);
199 defer want.deinit(); 199 defer want.deinit();
200 200
201 var all_bytes = std.ArrayList(u8).init(allocator); 201 var all_bytes = std.ArrayList(u8).init(allocator);
@@ -219,22 +219,40 @@ test "Segmentation Word Iterator" {
219 gc_len += len; 219 gc_len += len;
220 } 220 }
221 221
222 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); 222 try want.append(Word{ .len = gc_len, .offset = bytes_index });
223 bytes_index += cp_index; 223 bytes_index += cp_index;
224 } 224 }
225 225 {
226 var iter = wb.iterator(all_bytes.items); 226 var iter = wb.iterator(all_bytes.items);
227 227
228 // Check. 228 // Check.
229 for (want.items, 1..) |want_word, i| { 229 for (want.items, 1..) |want_word, i| {
230 const got_word = (iter.next()).?; 230 const got_word = (iter.next()).?;
231 std.testing.expectEqualStrings( 231 std.testing.expectEqualStrings(
232 want_word.bytes(all_bytes.items), 232 want_word.bytes(all_bytes.items),
233 got_word.bytes(all_bytes.items), 233 got_word.bytes(all_bytes.items),
234 ) catch |err| { 234 ) catch |err| {
235 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); 235 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
236 return err; 236 return err;
237 }; 237 };
238 }
239 }
240 {
241 var r_iter = wb.reverseIterator(all_bytes.items);
242 var idx = want.items.len - 1;
243 while (true) : (idx -= 1) {
244 const want_word = want.items[idx];
245 const got_word = r_iter.prev().?;
246 std.testing.expectEqualSlices(
247 u8,
248 want_word.bytes(all_bytes.items),
249 got_word.bytes(all_bytes.items),
250 ) catch |err| {
251 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
252 return err;
253 };
254 if (idx == 0) break;
255 }
238 } 256 }
239 } 257 }
240} 258}
@@ -277,3 +295,4 @@ const GraphemeIterator = @import("Graphemes").Iterator;
277const Normalize = @import("Normalize"); 295const Normalize = @import("Normalize");
278 296
279const WordBreak = @import("WordBreak"); 297const WordBreak = @import("WordBreak");
298const Word = WordBreak.Word;