summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sam Atman, 2025-05-13 17:19:56 -0400
committer: Sam Atman, 2025-05-15 15:32:38 -0400
commit: 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a (patch)
tree: f46287fbc0d92238644c23d0b176354567b647d1
parent: Reverse Word Iterator (diff)
download: zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.gz
zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.tar.xz
zg-5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a.zip
Hooked up break test, some bugs squashed
The handling of ignorables is really different, because they 'adhere' to the future of the iteration, not the past.
-rw-r--r-- src/WordBreak.zig 39
-rw-r--r-- src/code_point.zig 10
-rw-r--r-- src/unicode_tests.zig 49
3 files changed, 64 insertions, 34 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 37c0df9..0cab30e 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -98,11 +98,16 @@ pub fn wordAtCursor(wordbreak: *const WordBreak, string: []const u8, index: usiz
98 return this_word.?; 98 return this_word.?;
99} 99}
100 100
101/// Returns an iterator over words in `slice` 101/// Returns an iterator over words in `slice`.
102pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { 102pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
103 return Iterator.init(wordbreak, slice); 103 return Iterator.init(wordbreak, slice);
104} 104}
105 105
106/// Returns a reverse iterator over the words in `slice`.
107pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {
108 return ReverseIterator.init(wordbreak, slice);
109}
110
106pub const Iterator = struct { 111pub const Iterator = struct {
107 this: ?CodePoint = null, 112 this: ?CodePoint = null,
108 that: ?CodePoint = null, 113 that: ?CodePoint = null,
@@ -111,7 +116,7 @@ pub const Iterator = struct {
111 116
112 /// Assumes `str` is valid UTF-8. 117 /// Assumes `str` is valid UTF-8.
113 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 118 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
114 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; 119 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
115 wb_iter.advance(); 120 wb_iter.advance();
116 return wb_iter; 121 return wb_iter;
117 } 122 }
@@ -267,8 +272,8 @@ pub const ReverseIterator = struct {
267 wb: *const WordBreak, 272 wb: *const WordBreak,
268 273
269 /// Assumes `str` is valid UTF-8. 274 /// Assumes `str` is valid UTF-8.
270 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 275 pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
271 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; 276 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
272 wb_iter.advance(); 277 wb_iter.advance();
273 return wb_iter; 278 return wb_iter;
274 } 279 }
@@ -299,12 +304,19 @@ pub const ReverseIterator = struct {
299 var last_last_p: WordBreakProperty = .none; 304 var last_last_p: WordBreakProperty = .none;
300 var ri_count: usize = 0; 305 var ri_count: usize = 0;
301 306
307 // TODO: Ignorables have to be handled completely differently, unfortunately.
308 // We have to find whatever is before it, match against that, and use that
309 // decision to handle the break we're currently working on.
310 // --
311 // This is achieveable I think. Just need to use peekPast to get that, and then
312 // take it from there. Probably as long as an ignorable is an after_p we just keep
313 // going.
302 scan: while (true) : (iter.advance()) { 314 scan: while (true) : (iter.advance()) {
303 const after = iter.after.?; 315 const after = iter.after.?;
304 word_len += after.len; 316 word_len += after.len;
305 if (iter.before) |before| { 317 if (iter.before) |before| {
306 const after_p = iter.wb.breakProp(after); 318 const after_p = iter.wb.breakProp(after);
307 const before_p = iter.wb.breakProp(before); 319 var before_p = iter.wb.breakProp(before);
308 if (!isIgnorable(after_p)) { 320 if (!isIgnorable(after_p)) {
309 last_last_p = last_p; 321 last_last_p = last_p;
310 last_p = after_p; 322 last_p = after_p;
@@ -322,9 +334,18 @@ pub const ReverseIterator = struct {
322 // WB3d WSegSpace × WSegSpace 334 // WB3d WSegSpace × WSegSpace
323 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; 335 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
324 // WB4 X (Extend | Format | ZWJ)* → X 336 // WB4 X (Extend | Format | ZWJ)* → X
325 if (isIgnorable(after_p)) { 337 if (isIgnorable(before_p)) {
326 continue :scan; 338 const maybe_before = iter.peekPast();
327 } // Now we use last_p instead of after_p for ignorable's sake 339 if (maybe_before) |valid_before| {
340 before_p = iter.wb.breakProp(valid_before);
341 } else if (isIgnorable(after_p)) {
342 continue :scan;
343 // We're done
344 } else {
345 break :scan;
346 }
347 }
348 if (isIgnorable(after_p)) continue :scan;
328 // WB5 AHLetter × AHLetter 349 // WB5 AHLetter × AHLetter
329 if (isAHLetter(last_p) and isAHLetter(before_p)) { 350 if (isAHLetter(last_p) and isAHLetter(before_p)) {
330 continue :scan; 351 continue :scan;
@@ -334,7 +355,7 @@ pub const ReverseIterator = struct {
334 continue :scan; 355 continue :scan;
335 } 356 }
336 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter 357 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
337 if (isMidVal(before_p)) { 358 if (isMidVal(before_p) and isAHLetter(last_p)) {
338 const prev_val = iter.peekPast(); 359 const prev_val = iter.peekPast();
339 if (prev_val) |prev_cp| { 360 if (prev_val) |prev_cp| {
340 const prev_p = iter.wb.breakProp(prev_cp); 361 const prev_p = iter.wb.breakProp(prev_cp);
diff --git a/src/code_point.zig b/src/code_point.zig
index a5b10d4..ba0b434 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -53,22 +53,12 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
53 }; 53 };
54 // Multibyte 54 // Multibyte
55 55
56<<<<<<< HEAD
57 // Second: 56 // Second:
58 var class: u4 = @intCast(u8dfa[byte]); 57 var class: u4 = @intCast(u8dfa[byte]);
59 var st: u32 = state_dfa[class]; 58 var st: u32 = state_dfa[class];
60 if (st == RUNE_REJECT or cursor.* == bytes.len) { 59 if (st == RUNE_REJECT or cursor.* == bytes.len) {
61 @branchHint(.cold); 60 @branchHint(.cold);
62 // First one is never a truncation 61 // First one is never a truncation
63||||||| parent of ad4b046 (Various small iterator improvements)
64 // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte
65 if (cp.len > bytes.len) {
66 // Unicode replacement code point.
67=======
68 // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte.
69 if (cp.len > bytes.len) {
70 // Unicode replacement code point.
71>>>>>>> ad4b046 (Various small iterator improvements)
72 return .{ 62 return .{
73 .code = 0xfffd, 63 .code = 0xfffd,
74 .len = 1, 64 .len = 1,
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 59f0c6f..8661bfd 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -195,7 +195,7 @@ test "Segmentation Word Iterator" {
195 line = line[0..final]; 195 line = line[0..final];
196 } 196 }
197 // Iterate over fields. 197 // Iterate over fields.
198 var want = std.ArrayList(Grapheme).init(allocator); 198 var want = std.ArrayList(Word).init(allocator);
199 defer want.deinit(); 199 defer want.deinit();
200 200
201 var all_bytes = std.ArrayList(u8).init(allocator); 201 var all_bytes = std.ArrayList(u8).init(allocator);
@@ -219,22 +219,40 @@ test "Segmentation Word Iterator" {
219 gc_len += len; 219 gc_len += len;
220 } 220 }
221 221
222 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); 222 try want.append(Word{ .len = gc_len, .offset = bytes_index });
223 bytes_index += cp_index; 223 bytes_index += cp_index;
224 } 224 }
225 225 {
226 var iter = wb.iterator(all_bytes.items); 226 var iter = wb.iterator(all_bytes.items);
227 227
228 // Check. 228 // Check.
229 for (want.items, 1..) |want_word, i| { 229 for (want.items, 1..) |want_word, i| {
230 const got_word = (iter.next()).?; 230 const got_word = (iter.next()).?;
231 std.testing.expectEqualStrings( 231 std.testing.expectEqualStrings(
232 want_word.bytes(all_bytes.items), 232 want_word.bytes(all_bytes.items),
233 got_word.bytes(all_bytes.items), 233 got_word.bytes(all_bytes.items),
234 ) catch |err| { 234 ) catch |err| {
235 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); 235 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
236 return err; 236 return err;
237 }; 237 };
238 }
239 }
240 {
241 var r_iter = wb.reverseIterator(all_bytes.items);
242 var idx = want.items.len - 1;
243 while (true) : (idx -= 1) {
244 const want_word = want.items[idx];
245 const got_word = r_iter.prev().?;
246 std.testing.expectEqualSlices(
247 u8,
248 want_word.bytes(all_bytes.items),
249 got_word.bytes(all_bytes.items),
250 ) catch |err| {
251 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
252 return err;
253 };
254 if (idx == 0) break;
255 }
238 } 256 }
239 } 257 }
240} 258}
@@ -277,3 +295,4 @@ const GraphemeIterator = @import("Graphemes").Iterator;
277const Normalize = @import("Normalize"); 295const Normalize = @import("Normalize");
278 296
279const WordBreak = @import("WordBreak"); 297const WordBreak = @import("WordBreak");
298const Word = WordBreak.Word;