summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-06-01 14:08:25 -0400
committerGravatar Sam Atman2025-06-01 14:08:25 -0400
commit8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 (patch)
tree4ec54815215a9a808be0ab9a2968159f144ba076 /src
parentDocument "fat_offset" in README (diff)
downloadzg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.gz
zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.xz
zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.zip
Add graphemeAtIndex + iterate before and after
That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do.
Diffstat (limited to 'src')
-rw-r--r--src/Graphemes.zig220
-rw-r--r--src/Words.zig4
-rw-r--r--src/code_point.zig60
-rw-r--r--src/unicode_tests.zig69
4 files changed, 266 insertions, 87 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 49fdbf3..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,15 +1,7 @@
1const std = @import("std"); 1//! Graphemes Module
2const builtin = @import("builtin"); 2//!
3const mem = std.mem; 3//! Code for handling graphemes: fragments of string which should be
4const Allocator = mem.Allocator; 4//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾
5const compress = std.compress;
6const unicode = std.unicode;
7
8const code_point = @import("code_point");
9const CodePoint = code_point.CodePoint;
10const CodePointIterator = code_point.Iterator;
11const CodePointReverseIterator = code_point.ReverseIterator;
12const uoffset = code_point.uoffset;
13 5
14s1: []u16 = undefined, 6s1: []u16 = undefined,
15s2: []u16 = undefined, 7s2: []u16 = undefined,
@@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
69 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; 61 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
70} 62}
71 63
64/// Returns an iterator over the graphemes in `string`.
72pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { 65pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
73 return Iterator.init(string, graphemes); 66 return Iterator.init(string, graphemes);
74} 67}
75 68
69/// Returns a reverse iterator over the graphemes in `string`.
76pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { 70pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
77 return ReverseIterator.init(string, graphemes); 71 return ReverseIterator.init(string, graphemes);
78} 72}
@@ -116,6 +110,96 @@ pub const Grapheme = struct {
116 } 110 }
117}; 111};
118 112
113// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage
114// of being composed of other parts, but the constant factor can _probably_ be improved
115// by a bespoke implementation using graphemes.graphemeBreak directly. There's a limit
116// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
117// reached. Perhaps you, Dear Reader, might pick up the torch?
118
119/// Returns the `Grapheme` at `string[index]`, which does not have to be a
120/// valid start of a codepoint. Asserts the string is not empty. Index must be
121/// less than `string.len`. Always returns a `Grapheme`.
122pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
123 assert(string.len != 0);
124 if (index == 0 or (index > 0 and
125 string[index] < 0x80 and
126 string[index - 1] < 0x80) and
127 (string[index - 1] != '\r' and string[index] != '\n'))
128 {
129 // There's always a grapheme break between two ASCII code points (except CRLF)
130 var iter = graphemes.iterator(string[index..]);
131 const next = iter.next().?;
132 return Grapheme{
133 .len = next.len,
134 .offset = @as(u32, @intCast(index)) + next.offset,
135 };
136 } // Otherwise it gets hairy.
137 const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
138 if (idx == string.len) {
139 var iter = graphemes.reverseIterator(string);
140 return iter.prev().?;
141 }
142 // We're on a valid codepoint boundary, we go back from here
143 var r_iter = graphemes.reverseIterAtIndex(string, idx);
144 if (r_iter.prev()) |g| {
145 if (g.offset == 0) {
146 var iter = graphemes.iterator(string);
147 while (iter.next()) |g2| {
148 if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
149 }
150 }
151 }
152 // We need to toss one, because otherwise we might not be pending when
153 // we in fact need to be.
154 _ = r_iter.prev();
155 while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
156 var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
157 while (iter.next()) |g| {
158 if (g.offset <= idx and idx < g.offset + g.len) return g;
159 }
160 unreachable;
161}
162
163/// Return a (forward) iterator of `string` after `grapheme`.
164pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
165 return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
166}
167
168/// Return a reverse iterator of `string` before `grapheme`.
169pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
170 // This bit of weirdness is because reverse iterators are "advance last",
171 // while forward iterators are "advance first". This leaves some room for
172 // further optimization, if anyone dares.
173 var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
174 _ = r_iter.prev();
175 return r_iter;
176}
177
178fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
179 var r_iter: ReverseIterator = undefined;
180 r_iter.data = graphemes;
181 var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
182 r_iter.buf[1] = rcp_iter.prev();
183 r_iter.buf[0] = rcp_iter.prev();
184 r_iter.pending = .none;
185 r_iter.cp_iter = rcp_iter;
186 return r_iter;
187}
188
189fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
190 var iter: Iterator = undefined;
191 iter.data = graphemes;
192 iter.buf[0] = first: {
193 if (idx == string.len) break :first null;
194 var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
195 break :first r_cp_iter.prev();
196 };
197 var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
198 iter.buf[1] = cp_iter.next();
199 iter.cp_iter = cp_iter;
200 return iter;
201}
202
119/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. 203/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
120pub const Iterator = struct { 204pub const Iterator = struct {
121 buf: [2]?CodePoint = .{ null, null }, 205 buf: [2]?CodePoint = .{ null, null },
@@ -150,7 +234,7 @@ pub const Iterator = struct {
150 234
151 const gc_start = self.buf[0].?.offset; 235 const gc_start = self.buf[0].?.offset;
152 var gc_len: u8 = self.buf[0].?.len; 236 var gc_len: u8 = self.buf[0].?.len;
153 var state = State{}; 237 var state = IterState{};
154 238
155 if (graphemeBreak( 239 if (graphemeBreak(
156 self.buf[0].?.code, 240 self.buf[0].?.code,
@@ -189,12 +273,13 @@ pub const Iterator = struct {
189 } 273 }
190}; 274};
191 275
276/// Iterate a string backward by Grapheme.
192pub const ReverseIterator = struct { 277pub const ReverseIterator = struct {
193 buf: [2]?CodePoint = .{ null, null }, 278 buf: [2]?CodePoint = .{ null, null },
194 cp_iter: CodePointReverseIterator, 279 cp_iter: CodePointReverseIterator,
195 data: *const Graphemes, 280 data: *const Graphemes,
196 /// Codepoint read from `cp_iter` but not returned by `prev` 281 /// Codepoint read from `cp_iter` but not returned by `prev`
197 pending: Pending = .{ .none = {} }, 282 pending: Pending = .none,
198 283
199 const Pending = union(enum) { 284 const Pending = union(enum) {
200 none: void, 285 none: void,
@@ -218,6 +303,12 @@ pub const ReverseIterator = struct {
218 self.buf[0] = self.cp_iter.prev(); 303 self.buf[0] = self.cp_iter.prev();
219 } 304 }
220 305
306 pub fn peek(self: *Self) ?Grapheme {
307 const cache = .{ self.buf, self.cp_iter, self.pending };
308 defer self.buf, self.cp_iter, self.pending = cache;
309 return self.prev();
310 }
311
221 pub fn prev(self: *Self) ?Grapheme { 312 pub fn prev(self: *Self) ?Grapheme {
222 if (self.buf[1] == null) return null; 313 if (self.buf[1] == null) return null;
223 314
@@ -255,10 +346,10 @@ pub const ReverseIterator = struct {
255 }; 346 };
256 347
257 while (self.buf[0] != null) { 348 while (self.buf[0] != null) {
258 var state: State = .{}; 349 var state: IterState = .{};
259 state.setXpic(); 350 state.xpic = true;
260 state.unsetRegional(); 351 state.regional = false;
261 state.setIndic(); 352 state.indic = true;
262 353
263 if (graphemeBreak( 354 if (graphemeBreak(
264 self.buf[0].?.code, 355 self.buf[0].?.code,
@@ -269,7 +360,7 @@ pub const ReverseIterator = struct {
269 360
270 self.advance(); 361 self.advance();
271 362
272 if (!state.hasIndic()) { 363 if (!state.indic) {
273 364
274 // BUF: [?Any, Extend | Linker] Consonant 365 // BUF: [?Any, Extend | Linker] Consonant
275 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; 366 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
@@ -296,11 +387,11 @@ pub const ReverseIterator = struct {
296 self.advance(); 387 self.advance();
297 388
298 if (self.buf[0]) |cp1| { 389 if (self.buf[0]) |cp1| {
299 state.setIndic(); 390 state.indic = true;
300 391
301 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; 392 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
302 393
303 if (!state.hasIndic()) { 394 if (!state.indic) {
304 continue :indic; 395 continue :indic;
305 } else { 396 } else {
306 break :indic; 397 break :indic;
@@ -321,7 +412,7 @@ pub const ReverseIterator = struct {
321 } 412 }
322 } 413 }
323 414
324 if (!state.hasXpic()) { 415 if (!state.xpic) {
325 // BUF: [?Any, ZWJ] Emoji 416 // BUF: [?Any, ZWJ] Emoji
326 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; 417 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
327 418
@@ -370,7 +461,7 @@ pub const ReverseIterator = struct {
370 } 461 }
371 } 462 }
372 463
373 if (state.hasRegional()) { 464 if (state.regional) {
374 var ri_count: usize = 0; 465 var ri_count: usize = 0;
375 while (self.buf[0] != null and 466 while (self.buf[0] != null and
376 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) 467 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
@@ -404,6 +495,13 @@ pub const ReverseIterator = struct {
404 } 495 }
405}; 496};
406 497
498/// Grapheme Iterator state.
499pub const IterState = packed struct(u3) {
500 xpic: bool = false,
501 regional: bool = false,
502 indic: bool = false,
503};
504
407// Predicates 505// Predicates
408fn isBreaker(cp: u21, data: *const Graphemes) bool { 506fn isBreaker(cp: u21, data: *const Graphemes) bool {
409 // Extract relevant properties. 507 // Extract relevant properties.
@@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
411 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 509 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
412} 510}
413 511
414// Grapheme break state.
415pub const State = struct {
416 bits: u3 = 0,
417
418 // Extended Pictographic (emoji)
419 fn hasXpic(self: State) bool {
420 return self.bits & 1 == 1;
421 }
422 fn setXpic(self: *State) void {
423 self.bits |= 1;
424 }
425 fn unsetXpic(self: *State) void {
426 self.bits &= ~@as(u3, 1);
427 }
428
429 // Regional Indicatior (flags)
430 fn hasRegional(self: State) bool {
431 return self.bits & 2 == 2;
432 }
433 fn setRegional(self: *State) void {
434 self.bits |= 2;
435 }
436 fn unsetRegional(self: *State) void {
437 self.bits &= ~@as(u3, 2);
438 }
439
440 // Indic Conjunct
441 fn hasIndic(self: State) bool {
442 return self.bits & 4 == 4;
443 }
444 fn setIndic(self: *State) void {
445 self.bits |= 4;
446 }
447 fn unsetIndic(self: *State) void {
448 self.bits &= ~@as(u3, 4);
449 }
450};
451
452/// `graphemeBreak` returns true only if a grapheme break point is required 512/// `graphemeBreak` returns true only if a grapheme break point is required
453/// between `cp1` and `cp2`. `state` should start out as 0. If calling 513/// between `cp1` and `cp2`. `state` should start out as 0. If calling
454/// iteratively over a sequence of code points, this function must be called 514/// iteratively over a sequence of code points, this function must be called
@@ -459,7 +519,7 @@ pub fn graphemeBreak(
459 cp1: u21, 519 cp1: u21,
460 cp2: u21, 520 cp2: u21,
461 data: *const Graphemes, 521 data: *const Graphemes,
462 state: *State, 522 state: *IterState,
463) bool { 523) bool {
464 // Extract relevant properties. 524 // Extract relevant properties.
465 const cp1_gbp_prop = data.gbp(cp1); 525 const cp1_gbp_prop = data.gbp(cp1);
@@ -471,9 +531,9 @@ pub fn graphemeBreak(
471 const cp2_is_emoji = data.isEmoji(cp2); 531 const cp2_is_emoji = data.isEmoji(cp2);
472 532
473 // GB11: Emoji Extend* ZWJ x Emoji 533 // GB11: Emoji Extend* ZWJ x Emoji
474 if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); 534 if (!state.xpic and cp1_is_emoji) state.xpic = true;
475 // GB9c: Indic Conjunct Break 535 // GB9c: Indic Conjunct Break
476 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); 536 if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
477 537
478 // GB3: CR x LF 538 // GB3: CR x LF
479 if (cp1 == '\r' and cp2 == '\n') return false; 539 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -482,11 +542,11 @@ pub fn graphemeBreak(
482 if (isBreaker(cp1, data)) return true; 542 if (isBreaker(cp1, data)) return true;
483 543
484 // GB11: Emoji Extend* ZWJ x Emoji 544 // GB11: Emoji Extend* ZWJ x Emoji
485 if (state.hasXpic() and 545 if (state.xpic and
486 cp1_gbp_prop == .ZWJ and 546 cp1_gbp_prop == .ZWJ and
487 cp2_is_emoji) 547 cp2_is_emoji)
488 { 548 {
489 state.unsetXpic(); 549 state.xpic = false;
490 return false; 550 return false;
491 } 551 }
492 552
@@ -501,11 +561,11 @@ pub fn graphemeBreak(
501 561
502 // GB12, GB13: RI x RI 562 // GB12, GB13: RI x RI
503 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 563 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
504 if (state.hasRegional()) { 564 if (state.regional) {
505 state.unsetRegional(); 565 state.regional = false;
506 return true; 566 return true;
507 } else { 567 } else {
508 state.setRegional(); 568 state.regional = true;
509 return false; 569 return false;
510 } 570 }
511 } 571 }
@@ -530,25 +590,25 @@ pub fn graphemeBreak(
530 } 590 }
531 591
532 // GB9c: Indic Conjunct Break 592 // GB9c: Indic Conjunct Break
533 if (state.hasIndic() and 593 if (state.indic and
534 cp1_indic_prop == .Consonant and 594 cp1_indic_prop == .Consonant and
535 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) 595 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
536 { 596 {
537 return false; 597 return false;
538 } 598 }
539 599
540 if (state.hasIndic() and 600 if (state.indic and
541 cp1_indic_prop == .Extend and 601 cp1_indic_prop == .Extend and
542 cp2_indic_prop == .Linker) 602 cp2_indic_prop == .Linker)
543 { 603 {
544 return false; 604 return false;
545 } 605 }
546 606
547 if (state.hasIndic() and 607 if (state.indic and
548 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and 608 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
549 cp2_indic_prop == .Consonant) 609 cp2_indic_prop == .Consonant)
550 { 610 {
551 state.unsetIndic(); 611 state.indic = false;
552 return false; 612 return false;
553 } 613 }
554 614
@@ -608,3 +668,17 @@ test "Iterator.peek" {
608 try std.testing.expectEqual(null, iter.peek()); 668 try std.testing.expectEqual(null, iter.peek());
609 try std.testing.expectEqual(iter.peek(), iter.next()); 669 try std.testing.expectEqual(iter.peek(), iter.next());
610} 670}
671
672const std = @import("std");
673const builtin = @import("builtin");
674const assert = std.debug.assert;
675const mem = std.mem;
676const Allocator = mem.Allocator;
677const compress = std.compress;
678const unicode = std.unicode;
679
680const code_point = @import("code_point");
681const CodePoint = code_point.CodePoint;
682const CodePointIterator = code_point.Iterator;
683const CodePointReverseIterator = code_point.ReverseIterator;
684const uoffset = code_point.uoffset;
diff --git a/src/Words.zig b/src/Words.zig
index 1707881..af82562 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -124,12 +124,12 @@ pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
124} 124}
125 125
126/// Returns an iterator after the `word` in `slice`. 126/// Returns an iterator after the `word` in `slice`.
127pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { 127pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
128 return forwardFromIndex(words, slice, word.offset + word.len); 128 return forwardFromIndex(words, slice, word.offset + word.len);
129} 129}
130 130
131/// Returns a reverse iterator before the `word` in `slice`. 131/// Returns a reverse iterator before the `word` in `slice`.
132pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { 132pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
133 return reverseFromIndex(words, slice, word.offset); 133 return reverseFromIndex(words, slice, word.offset);
134} 134}
135 135
diff --git a/src/code_point.zig b/src/code_point.zig
index 8bd3d5b..16648af 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
39 return null; 39 return null;
40} 40}
41 41
42/// Return the codepoint at `index`, even if `index` is in the middle
43/// of that codepoint.
44pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
45 var idx = index;
46 while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {}
47 return decodeAtIndex(bytes, idx);
48}
49
42/// Decode the CodePoint, if any, at `bytes[idx]`. 50/// Decode the CodePoint, if any, at `bytes[idx]`.
43pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { 51pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
44 var off = idx; 52 var off = index;
45 return decodeAtCursor(bytes, &off); 53 return decodeAtCursor(bytes, &off);
46} 54}
47 55
@@ -329,6 +337,54 @@ test Iterator {
329 try expectEqual(@as(?CodePoint, null), iter.next()); 337 try expectEqual(@as(?CodePoint, null), iter.next());
330} 338}
331 339
340const code_point = @This();
341
342// Keep this in sync with the README
343test "Code point iterator" {
344 const str = "Hi 😊";
345 var iter: code_point.Iterator = .init(str);
346 var i: usize = 0;
347
348 while (iter.next()) |cp| : (i += 1) {
349 // The `code` field is the actual code point scalar as a `u21`.
350 if (i == 0) try expect(cp.code == 'H');
351 if (i == 1) try expect(cp.code == 'i');
352 if (i == 2) try expect(cp.code == ' ');
353
354 if (i == 3) {
355 try expect(cp.code == '😊');
356 // The `offset` field is the byte offset in the
357 // source string.
358 try expect(cp.offset == 3);
359 try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?);
360 // The `len` field is the length in bytes of the
361 // code point in the source string.
362 try expect(cp.len == 4);
363 // There is also a 'cursor' decode, like so:
364 {
365 var cursor = cp.offset;
366 try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?);
367 // Which advances the cursor variable to the next possible
368 // offset, in this case, `str.len`. Don't forget to account
369 // for this possibility!
370 try expectEqual(cp.offset + cp.len, cursor);
371 }
372 // There's also this, for when you aren't sure if you have the
373 // correct start for a code point:
374 try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?);
375 }
376 // Reverse iteration is also an option:
377 var r_iter: code_point.ReverseIterator = .init(str);
378 // Both iterators can be peeked:
379 try expectEqual('😊', r_iter.peek().?.code);
380 try expectEqual('😊', r_iter.prev().?.code);
381 // Both kinds of iterators can be reversed:
382 var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator();
383 // This will always return the last codepoint from
384 // the prior iterator, _if_ it yielded one:
385 try expectEqual('😊', fwd_iter.next().?.code);
386 }
387}
332test "overlongs" { 388test "overlongs" {
333 // None of these should equal `/`, all should be byte-for-byte 389 // None of these should equal `/`, all should be byte-for-byte
334 // handled as replacement characters. 390 // handled as replacement characters.
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index c463dcc..ae177a9 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -162,20 +162,51 @@ test "Segmentation GraphemeIterator" {
162 bytes_index += cp_index; 162 bytes_index += cp_index;
163 } 163 }
164 164
165 const this_str = all_bytes.items;
166
165 { 167 {
166 var iter = graph.iterator(all_bytes.items); 168 var iter = graph.iterator(this_str);
167 169
168 // Check. 170 // Check.
169 for (want.items) |want_gc| { 171 for (want.items, 1..) |want_gc, idx| {
170 const got_gc = (iter.next()).?; 172 const got_gc = (iter.next()).?;
171 try std.testing.expectEqualStrings( 173 try std.testing.expectEqualStrings(
172 want_gc.bytes(all_bytes.items), 174 want_gc.bytes(this_str),
173 got_gc.bytes(all_bytes.items), 175 got_gc.bytes(this_str),
174 ); 176 );
177 for (got_gc.offset..got_gc.offset + got_gc.len) |i| {
178 const this_gc = graph.graphemeAtIndex(this_str, i);
179 std.testing.expectEqualSlices(
180 u8,
181 got_gc.bytes(this_str),
182 this_gc.bytes(this_str),
183 ) catch |err| {
184 debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
185 return err;
186 };
187 }
188 var after_iter = graph.iterateAfterGrapheme(this_str, got_gc);
189 if (after_iter.next()) |next_gc| {
190 if (iter.peek()) |next_peek| {
191 std.testing.expectEqualSlices(
192 u8,
193 next_gc.bytes(this_str),
194 next_peek.bytes(this_str),
195 ) catch |err| {
196 debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx });
197 return err;
198 };
199 } else {
200 debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx });
201 try testing.expect(false);
202 }
203 } else {
204 try testing.expectEqual(null, iter.peek());
205 }
175 } 206 }
176 } 207 }
177 { 208 {
178 var iter = graph.reverseIterator(all_bytes.items); 209 var iter = graph.reverseIterator(this_str);
179 210
180 // Check. 211 // Check.
181 var i: usize = want.items.len; 212 var i: usize = want.items.len;
@@ -190,8 +221,8 @@ test "Segmentation GraphemeIterator" {
190 return error.TestExpectedEqual; 221 return error.TestExpectedEqual;
191 }; 222 };
192 std.testing.expectEqualStrings( 223 std.testing.expectEqualStrings(
193 want_gc.bytes(all_bytes.items), 224 want_gc.bytes(this_str),
194 got_gc.bytes(all_bytes.items), 225 got_gc.bytes(this_str),
195 ) catch |err| { 226 ) catch |err| {
196 std.debug.print( 227 std.debug.print(
197 "line {d} grapheme {d}: expected {any} found {any}\n", 228 "line {d} grapheme {d}: expected {any} found {any}\n",
@@ -199,6 +230,24 @@ test "Segmentation GraphemeIterator" {
199 ); 230 );
200 return err; 231 return err;
201 }; 232 };
233 var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc);
234 if (before_iter.prev()) |prev_gc| {
235 if (iter.peek()) |prev_peek| {
236 std.testing.expectEqualSlices(
237 u8,
238 prev_gc.bytes(this_str),
239 prev_peek.bytes(this_str),
240 ) catch |err| {
241 debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i });
242 return err;
243 };
244 } else {
245 debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i });
246 try testing.expect(false);
247 }
248 } else {
249 try testing.expectEqual(null, iter.peek());
250 }
202 } 251 }
203 } 252 }
204 } 253 }
@@ -287,7 +336,7 @@ test "Segmentation Word Iterator" {
287 } else { 336 } else {
288 try testing.expect(false); 337 try testing.expect(false);
289 } 338 }
290 var peek_iter = wb.iterateAfter(this_str, got_word); 339 var peek_iter = wb.iterateAfterWord(this_str, got_word);
291 const peek_1 = peek_iter.next(); 340 const peek_1 = peek_iter.next();
292 if (peek_1) |p1| { 341 if (peek_1) |p1| {
293 const peek_2 = iter.peek(); 342 const peek_2 = iter.peek();
@@ -313,7 +362,7 @@ test "Segmentation Word Iterator" {
313 got_word.bytes(this_str), 362 got_word.bytes(this_str),
314 this_word.bytes(this_str), 363 this_word.bytes(this_str),
315 ) catch |err| { 364 ) catch |err| {
316 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); 365 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
317 return err; 366 return err;
318 }; 367 };
319 } 368 }
@@ -356,7 +405,7 @@ test "Segmentation Word Iterator" {
356 } else { 405 } else {
357 try testing.expect(false); 406 try testing.expect(false);
358 } 407 }
359 var peek_iter = wb.iterateBefore(this_str, got_word); 408 var peek_iter = wb.iterateBeforeWord(this_str, got_word);
360 const peek_1 = peek_iter.prev(); 409 const peek_1 = peek_iter.prev();
361 if (peek_1) |p1| { 410 if (peek_1) |p1| {
362 const peek_2 = r_iter.peek(); 411 const peek_2 = r_iter.peek();