summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-13 16:33:02 -0400
committerGravatar Sam Atman2025-05-15 15:31:16 -0400
commit7ff729895e72fc841440ec73a44c142779fcae1e (patch)
tree8917658e78f42d14a824f2595664b0a88f018c3a /src
parentAdd wordAtCursor (diff)
downloadzg-7ff729895e72fc841440ec73a44c142779fcae1e.tar.gz
zg-7ff729895e72fc841440ec73a44c142779fcae1e.tar.xz
zg-7ff729895e72fc841440ec73a44c142779fcae1e.zip
Reverse Word Iterator
Next up I hook it to the tests.
Diffstat (limited to 'src')
-rw-r--r--src/WordBreak.zig156
-rw-r--r--src/code_point.zig2
2 files changed, 157 insertions, 1 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index f0da30d..37c0df9 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -260,6 +260,161 @@ pub const Iterator = struct {
260 } 260 }
261}; 261};
262 262
/// Iterates over the word segments of a string from its end toward its start.
///
/// `before` and `after` are the two code points on either side of the
/// boundary currently being examined, named in string order: `before` has the
/// lower offset. Both are refilled from `cp_iter` via `advance`.
pub const ReverseIterator = struct {
    after: ?CodePoint = null,
    before: ?CodePoint = null,
    cp_iter: ReverseCodepointIterator,
    wb: *const WordBreak,

    /// Creates a reverse word iterator over `str`.
    /// Assumes `str` is valid UTF-8.
    pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
        // Prime `before` with the first code point `cp_iter` yields; `after`
        // stays null until the first call to `prev`.
        wb_iter.advance();
        return wb_iter;
    }

    /// Returns the previous word segment, without advancing.
    pub fn peek(iter: *ReverseIterator) ?Word {
        // `prev` mutates `before`, `after` and `cp_iter`; snapshot the whole
        // iterator and put it back once the result has been computed.
        const saved = iter.*;
        defer iter.* = saved;
        return iter.prev();
    }

    /// Return the previous word, if any.
    /// Returns null once every code point has been consumed.
    pub fn prev(iter: *ReverseIterator) ?Word {
        iter.advance();

        // Done?
        const first_after = iter.after orelse return null;
        // Last? Nothing precedes `after`, so it is a word by itself.
        if (iter.before == null) return Word{ .len = first_after.len, .offset = 0 };

        const word_end = first_after.offset + first_after.len;
        var word_len: u32 = 0;

        // State variables.
        // `last_p` / `last_last_p` hold the properties of the two most recent
        // non-ignorable code points seen on the `after` side of the boundary.
        var last_p: WordBreakProperty = .none;
        var last_last_p: WordBreakProperty = .none;
        var ri_count: usize = 0;

        // Each pass accepts `after` into the word, then decides whether the
        // boundary between `before` and `after` is a break (`break :scan`) or
        // not (`continue :scan`, which advances one code point backward).
        scan: while (true) : (iter.advance()) {
            const after = iter.after.?;
            word_len += after.len;

            // No code point before `after`: the word reaches the start of input.
            const before = iter.before orelse break :scan;

            const after_p = iter.wb.breakProp(after);
            // NOTE(review): `before_p` is the raw property of `before`. When
            // `before` is itself Extend/Format/ZWJ, the WB5+ rules below test
            // the ignorable rather than the code point it attaches to --
            // confirm against the UAX #29 conformance tests.
            const before_p = iter.wb.breakProp(before);
            if (!isIgnorable(after_p)) {
                last_last_p = last_p;
                last_p = after_p;
            }

            // WB3  CR × LF
            if (before_p == .CR and after_p == .LF) continue :scan;
            // WB3a  (Newline | CR | LF) ÷
            if (isNewline(before_p)) break :scan;
            // WB3b  ÷ (Newline | CR | LF)
            if (isNewline(after_p)) break :scan;
            // WB3c  ZWJ × \p{Extended_Pictographic}
            if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
                continue :scan;
            }
            // WB3d  WSegSpace × WSegSpace
            if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
            // WB4  X (Extend | Format | ZWJ)* → X
            if (isIgnorable(after_p)) continue :scan;
            // From here on `last_p` stands in for `after_p`, for ignorables' sake.

            // WB5  AHLetter × AHLetter
            if (isAHLetter(last_p) and isAHLetter(before_p)) continue :scan;
            // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
            if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
                continue :scan;
            }
            // WB7  AHLetter (MidLetter | MidNumLetQ) × AHLetter
            // Both sides of the mid character must be letters: the one after
            // the boundary (`last_p`) and the one found by looking behind.
            if (isMidVal(before_p) and isAHLetter(last_p)) {
                if (iter.peekPast()) |prev_cp| {
                    if (isAHLetter(iter.wb.breakProp(prev_cp))) continue :scan;
                }
            }
            // WB7a  Hebrew_Letter × Single_Quote
            if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
            // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
            if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
                continue :scan;
            }
            // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
            if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
                if (iter.peekPast()) |prev_cp| {
                    if (iter.wb.breakProp(prev_cp) == .Hebrew_Letter) continue :scan;
                }
            }
            // WB8  Numeric × Numeric
            if (before_p == .Numeric and last_p == .Numeric) continue :scan;
            // WB9  AHLetter × Numeric
            if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
            // WB10  Numeric × AHLetter
            if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
            // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
            if (isMidNum(before_p) and last_p == .Numeric) {
                if (iter.peekPast()) |prev_cp| {
                    if (iter.wb.breakProp(prev_cp) == .Numeric) continue :scan;
                }
            }
            // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
            if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
                continue :scan;
            }
            // WB13  Katakana × Katakana
            if (before_p == .Katakana and last_p == .Katakana) continue :scan;
            // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
            if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
            // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
            if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
            // WB15, WB16  ([^RI] | sot) (RI RI)* RI × RI
            // NOTE(review): pairs are counted from the end of the run here,
            // while the rule defines parity from the start of the run --
            // confirm behaviour on odd-length Regional_Indicator runs.
            if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
                ri_count += 1;
                if (ri_count % 2 == 1) continue :scan;
            }
            // WB999  Any ÷ Any
            break :scan;
        }

        return Word{ .len = word_len, .offset = word_end - word_len };
    }

    /// Returns the nearest code point not yet consumed from `cp_iter` (that
    /// is, preceding `before`) whose property is not ignorable, or null if
    /// there is none. `cp_iter` is restored before returning.
    fn peekPast(iter: *ReverseIterator) ?CodePoint {
        const save_cp = iter.cp_iter;
        defer iter.cp_iter = save_cp;
        while (iter.cp_iter.peek()) |peeked| {
            if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
            _ = iter.cp_iter.prev();
        }
        return null;
    }

    /// Slides the window one code point backward: the old `before` becomes
    /// `after`, and a fresh `before` is pulled from `cp_iter`.
    fn advance(iter: *ReverseIterator) void {
        iter.after = iter.before;
        iter.before = iter.cp_iter.prev();
    }
};
417
263inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { 418inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
264 const decompressor = compress.flate.inflate.decompressor; 419 const decompressor = compress.flate.inflate.decompressor;
265 const in_bytes = @embedFile("wbp"); 420 const in_bytes = @embedFile("wbp");
@@ -371,6 +526,7 @@ const testing = std.testing;
371 526
372const code_point = @import("code_point"); 527const code_point = @import("code_point");
373const CodepointIterator = code_point.Iterator; 528const CodepointIterator = code_point.Iterator;
529const ReverseCodepointIterator = code_point.ReverseIterator;
374const CodePoint = code_point.CodePoint; 530const CodePoint = code_point.CodePoint;
375 531
376const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; 532const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;
diff --git a/src/code_point.zig b/src/code_point.zig
index 79ee5cd..a5b10d4 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -12,7 +12,7 @@ pub const CodePoint = struct {
12 offset: u32, 12 offset: u32,
13 13
14 /// Return the slice of this codepoint, given the original string. 14 /// Return the slice of this codepoint, given the original string.
15 pub fn bytes(cp: CodePoint, str: []const u8) []const u8 { 15 pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
16 return str[cp.offset..][0..cp.len]; 16 return str[cp.offset..][0..cp.len];
17 } 17 }
18}; 18};