summaryrefslogtreecommitdiff
path: root/src/Graphemes.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-07-08 12:15:32 -0400
committerGravatar Sam Atman2025-07-08 12:15:32 -0400
commit9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree2607c185fd8053b84d60041fadc35c05a0225d34 /src/Graphemes.zig
parentMerge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parentAdd Words.zig example to README (diff)
downloadzg-master.tar.gz
zg-master.tar.xz
zg-master.zip
Merge branch 'develop-next'HEADv0.14.1master
Diffstat (limited to 'src/Graphemes.zig')
-rw-r--r--src/Graphemes.zig479
1 files changed, 370 insertions, 109 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,12 +1,7 @@
1const std = @import("std"); 1//! Graphemes Module
2const builtin = @import("builtin"); 2//!
3const mem = std.mem; 3//! Code for handling graphemes: fragments of string which should be
4const Allocator = mem.Allocator; 4//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾
5const compress = std.compress;
6const unicode = std.unicode;
7
8const CodePoint = @import("code_point").CodePoint;
9const CodePointIterator = @import("code_point").Iterator;
10 5
11s1: []u16 = undefined, 6s1: []u16 = undefined,
12s2: []u16 = undefined, 7s2: []u16 = undefined,
@@ -66,10 +61,16 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
66 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; 61 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
67} 62}
68 63
64/// Returns an iterator over the graphemes in `string`.
69pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { 65pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
70 return Iterator.init(string, graphemes); 66 return Iterator.init(string, graphemes);
71} 67}
72 68
69/// Returns a reverse iterator over the graphemes in `string`.
70pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
71 return ReverseIterator.init(string, graphemes);
72}
73
73/// Indic syllable type. 74/// Indic syllable type.
74pub const Indic = enum { 75pub const Indic = enum {
75 none, 76 none,
@@ -99,8 +100,8 @@ pub const Gbp = enum {
99 100
100/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 101/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
101pub const Grapheme = struct { 102pub const Grapheme = struct {
102 len: u8, 103 len: uoffset,
103 offset: u32, 104 offset: uoffset,
104 105
105 /// `bytes` returns the slice of bytes that correspond to 106 /// `bytes` returns the slice of bytes that correspond to
106 /// this grapheme cluster in `src`. 107 /// this grapheme cluster in `src`.
@@ -109,6 +110,96 @@ pub const Grapheme = struct {
109 } 110 }
110}; 111};
111 112
113// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage
114// of being composed of other parts, but the constant factor can _probably_ be improved
115// by a bespoke implementation using graphemes.graphemeBreak directly. There's a limit
116// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
117// reached. Perhaps you, Dear Reader, might pick up the torch?
118
119/// Returns the `Grapheme` at `string[index]`, which does not have to be a
120/// valid start of a codepoint. Asserts the string is not empty. Index must be
121/// less than `string.len`. Always returns a `Grapheme`.
122pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
123 assert(string.len != 0);
124 if (index == 0 or (index > 0 and
125 string[index] < 0x80 and
126 string[index - 1] < 0x80) and
127 (string[index - 1] != '\r' and string[index] != '\n'))
128 {
129 // There's always a grapheme break between two ASCII code points (except CRLF)
130 var iter = graphemes.iterator(string[index..]);
131 const next = iter.next().?;
132 return Grapheme{
133 .len = next.len,
134 .offset = @as(u32, @intCast(index)) + next.offset,
135 };
136 } // Otherwise it gets hairy.
137 const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
138 if (idx == string.len) {
139 var iter = graphemes.reverseIterator(string);
140 return iter.prev().?;
141 }
142 // We're on a valid codepoint boundary, we go back from here
143 var r_iter = graphemes.reverseIterAtIndex(string, idx);
144 if (r_iter.prev()) |g| {
145 if (g.offset == 0) {
146 var iter = graphemes.iterator(string);
147 while (iter.next()) |g2| {
148 if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
149 }
150 }
151 }
152 // We need to toss one, because otherwise we might not be pending when
153 // we in fact need to be.
154 _ = r_iter.prev();
155 while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
156 var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
157 while (iter.next()) |g| {
158 if (g.offset <= idx and idx < g.offset + g.len) return g;
159 }
160 unreachable;
161}
162
163/// Return a (forward) iterator of `string` after `grapheme`.
164pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
165 return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
166}
167
168/// Return a reverse iterator of `string` before `grapheme`.
169pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
170 // This bit of weirdness is because reverse iterators are "advance last",
171 // while forward iterators are "advance first". This leaves some room for
172 // further optimization, if anyone dares.
173 var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
174 _ = r_iter.prev();
175 return r_iter;
176}
177
178fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
179 var r_iter: ReverseIterator = undefined;
180 r_iter.data = graphemes;
181 var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
182 r_iter.buf[1] = rcp_iter.prev();
183 r_iter.buf[0] = rcp_iter.prev();
184 r_iter.pending = .none;
185 r_iter.cp_iter = rcp_iter;
186 return r_iter;
187}
188
189fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
190 var iter: Iterator = undefined;
191 iter.data = graphemes;
192 iter.buf[0] = first: {
193 if (idx == string.len) break :first null;
194 var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
195 break :first r_cp_iter.prev();
196 };
197 var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
198 iter.buf[1] = cp_iter.next();
199 iter.cp_iter = cp_iter;
200 return iter;
201}
202
112/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. 203/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
113pub const Iterator = struct { 204pub const Iterator = struct {
114 buf: [2]?CodePoint = .{ null, null }, 205 buf: [2]?CodePoint = .{ null, null },
@@ -143,7 +234,7 @@ pub const Iterator = struct {
143 234
144 const gc_start = self.buf[0].?.offset; 235 const gc_start = self.buf[0].?.offset;
145 var gc_len: u8 = self.buf[0].?.len; 236 var gc_len: u8 = self.buf[0].?.len;
146 var state = State{}; 237 var state = IterState{};
147 238
148 if (graphemeBreak( 239 if (graphemeBreak(
149 self.buf[0].?.code, 240 self.buf[0].?.code,
@@ -173,72 +264,244 @@ pub const Iterator = struct {
173 const saved_cp_iter = self.cp_iter; 264 const saved_cp_iter = self.cp_iter;
174 const s0 = self.buf[0]; 265 const s0 = self.buf[0];
175 const s1 = self.buf[1]; 266 const s1 = self.buf[1];
176 267 defer {
177 self.advance();
178
179 // If no more
180 if (self.buf[0] == null) {
181 self.cp_iter = saved_cp_iter;
182 self.buf[0] = s0;
183 self.buf[1] = s1;
184 return null;
185 }
186 // If last one
187 if (self.buf[1] == null) {
188 const len = self.buf[0].?.len;
189 const offset = self.buf[0].?.offset;
190 self.cp_iter = saved_cp_iter;
191 self.buf[0] = s0;
192 self.buf[1] = s1;
193 return Grapheme{ .len = len, .offset = offset };
194 }
195 // If ASCII
196 if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
197 const len = self.buf[0].?.len;
198 const offset = self.buf[0].?.offset;
199 self.cp_iter = saved_cp_iter; 268 self.cp_iter = saved_cp_iter;
200 self.buf[0] = s0; 269 self.buf[0] = s0;
201 self.buf[1] = s1; 270 self.buf[1] = s1;
202 return Grapheme{ .len = len, .offset = offset };
203 } 271 }
272 return self.next();
273 }
274};
204 275
205 const gc_start = self.buf[0].?.offset; 276/// Iterate a string backward by Grapheme.
206 var gc_len: u8 = self.buf[0].?.len; 277pub const ReverseIterator = struct {
207 var state = State{}; 278 buf: [2]?CodePoint = .{ null, null },
279 cp_iter: CodePointReverseIterator,
280 data: *const Graphemes,
281 /// Codepoint read from `cp_iter` but not returned by `prev`
282 pending: Pending = .none,
208 283
209 if (graphemeBreak( 284 const Pending = union(enum) {
210 self.buf[0].?.code, 285 none: void,
211 self.buf[1].?.code, 286 /// Count of pending RI codepoints, it is an even number
212 self.data, 287 ri_count: usize,
213 &state, 288 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
214 )) { 289 extend_end: uoffset,
215 self.cp_iter = saved_cp_iter; 290 };
216 self.buf[0] = s0;
217 self.buf[1] = s1;
218 return Grapheme{ .len = gc_len, .offset = gc_start };
219 }
220 291
221 while (true) { 292 const Self = @This();
222 self.advance();
223 if (self.buf[0] == null) break;
224 293
225 gc_len += self.buf[0].?.len; 294 pub fn init(str: []const u8, data: *const Graphemes) Self {
295 var self: Self = .{ .cp_iter = .init(str), .data = data };
296 self.advance();
297 self.advance();
298 return self;
299 }
300
301 fn advance(self: *Self) void {
302 self.buf[1] = self.buf[0];
303 self.buf[0] = self.cp_iter.prev();
304 }
305
306 pub fn peek(self: *Self) ?Grapheme {
307 const cache = .{ self.buf, self.cp_iter, self.pending };
308 defer self.buf, self.cp_iter, self.pending = cache;
309 return self.prev();
310 }
311
312 pub fn prev(self: *Self) ?Grapheme {
313 if (self.buf[1] == null) return null;
314
315 const grapheme_end: uoffset = end: {
316 const codepoint = self.buf[1].?;
317
318 switch (self.pending) {
319 // BUF: [?Any, Any]
320 .none => break :end codepoint.offset + codepoint.len,
321 .ri_count => |ri_count| {
322 std.debug.assert(ri_count > 0);
323 std.debug.assert(ri_count % 2 == 0);
324
325 if (ri_count > 2) {
326 self.pending.ri_count -= 2;
327
328 // Use the fact that all RI have length 4 in utf8 encoding
329 // since they are in range 0x1f1e6...0x1f1ff
330 // https://en.wikipedia.org/wiki/UTF-8#Encoding
331 return Grapheme{
332 .len = 8,
333 .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
334 };
335 } else {
336 self.pending = .{ .none = {} };
337 break :end codepoint.offset + codepoint.len + 4;
338 }
339 },
340 // BUF: [?Any, Extend] Extend* ZWJ
341 .extend_end => |extend_end| {
342 self.pending = .{ .none = {} };
343 break :end extend_end;
344 },
345 }
346 };
347
348 while (self.buf[0] != null) {
349 var state: IterState = .{};
350 state.xpic = true;
351 state.regional = false;
352 state.indic = true;
226 353
227 if (graphemeBreak( 354 if (graphemeBreak(
228 self.buf[0].?.code, 355 self.buf[0].?.code,
229 if (self.buf[1]) |ncp| ncp.code else 0, 356 self.buf[1].?.code,
230 self.data, 357 self.data,
231 &state, 358 &state,
232 )) break; 359 )) break;
360
361 self.advance();
362
363 if (!state.indic) {
364
365 // BUF: [?Any, Extend | Linker] Consonant
366 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
367
368 indic: while (true) {
369 if (self.buf[0] == null) {
370 self.pending = .{ .extend_end = indic_offset };
371 return .{
372 .len = @intCast(grapheme_end - indic_offset),
373 .offset = indic_offset,
374 };
375 }
376
377 const codepoint = self.buf[0].?;
378
379 switch (self.data.indic(codepoint.code)) {
380 .Extend, .Linker => {
381 self.advance();
382 continue :indic;
383 },
384 .Consonant => {
385 // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
386 indic_offset = codepoint.offset;
387 self.advance();
388
389 if (self.buf[0]) |cp1| {
390 state.indic = true;
391
392 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
393
394 if (!state.indic) {
395 continue :indic;
396 } else {
397 break :indic;
398 }
399 } else {
400 break :indic;
401 }
402 },
403 .none => {
404 // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
405 self.pending = .{ .extend_end = indic_offset };
406 return .{
407 .len = @intCast(grapheme_end - indic_offset),
408 .offset = indic_offset,
409 };
410 },
411 }
412 }
413 }
414
415 if (!state.xpic) {
416 // BUF: [?Any, ZWJ] Emoji
417 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
418
419 // Look for previous Emoji
420 emoji: while (true) {
421 if (self.buf[0] == null) {
422 self.pending = .{ .extend_end = emoji_offset };
423 return .{
424 .len = @intCast(grapheme_end - emoji_offset),
425 .offset = emoji_offset,
426 };
427 }
428
429 const codepoint = self.buf[0].?;
430
431 if (self.data.gbp(codepoint.code) == .Extend) {
432 self.advance();
433 continue :emoji;
434 }
435
436 if (self.data.isEmoji(codepoint.code)) {
437 // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
438 emoji_offset = codepoint.offset;
439 self.advance();
440
441 if (self.buf[0] != null and
442 // ZWJ = 0x200d
443 self.buf[0].?.code == 0x200d)
444 {
445 // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
446 // Back at the beginning of the loop, "recursively" look for emoji
447 self.advance();
448 continue :emoji;
449 } else {
450 // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
451 break :emoji;
452 }
453 } else {
454 // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
455 self.pending = .{ .extend_end = emoji_offset };
456 return .{
457 .len = @intCast(grapheme_end - emoji_offset),
458 .offset = emoji_offset,
459 };
460 }
461 }
462 }
463
464 if (state.regional) {
465 var ri_count: usize = 0;
466 while (self.buf[0] != null and
467 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
468 {
469 ri_count += 1;
470 self.advance();
471 }
472
473 // Use the fact that all RI have length 4 in utf8 encoding
474 // since they are in range 0x1f1e6...0x1f1ff
475 // https://en.wikipedia.org/wiki/UTF-8#Encoding
476 if (ri_count == 0) {
477 // There are no pending RI codepoints
478 } else if (ri_count % 2 == 0) {
479 self.pending = .{ .ri_count = ri_count };
480 return .{ .len = 8, .offset = grapheme_end - 8 };
481 } else {
482 // Add one to count for the unused RI
483 self.pending = .{ .ri_count = ri_count + 1 };
484 return .{ .len = 4, .offset = grapheme_end - 4 };
485 }
486 }
233 } 487 }
234 self.cp_iter = saved_cp_iter;
235 self.buf[0] = s0;
236 self.buf[1] = s1;
237 488
238 return Grapheme{ .len = gc_len, .offset = gc_start }; 489 const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
490 self.advance();
491 return .{
492 .len = @intCast(grapheme_end - grapheme_start),
493 .offset = grapheme_start,
494 };
239 } 495 }
240}; 496};
241 497
498/// Grapheme Iterator state.
499pub const IterState = packed struct(u3) {
500 xpic: bool = false,
501 regional: bool = false,
502 indic: bool = false,
503};
504
242// Predicates 505// Predicates
243fn isBreaker(cp: u21, data: *const Graphemes) bool { 506fn isBreaker(cp: u21, data: *const Graphemes) bool {
244 // Extract relevant properties. 507 // Extract relevant properties.
@@ -246,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
246 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 509 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
247} 510}
248 511
249// Grapheme break state.
250pub const State = struct {
251 bits: u3 = 0,
252
253 // Extended Pictographic (emoji)
254 fn hasXpic(self: State) bool {
255 return self.bits & 1 == 1;
256 }
257 fn setXpic(self: *State) void {
258 self.bits |= 1;
259 }
260 fn unsetXpic(self: *State) void {
261 self.bits ^= 1;
262 }
263
264 // Regional Indicatior (flags)
265 fn hasRegional(self: State) bool {
266 return self.bits & 2 == 2;
267 }
268 fn setRegional(self: *State) void {
269 self.bits |= 2;
270 }
271 fn unsetRegional(self: *State) void {
272 self.bits ^= 2;
273 }
274
275 // Indic Conjunct
276 fn hasIndic(self: State) bool {
277 return self.bits & 4 == 4;
278 }
279 fn setIndic(self: *State) void {
280 self.bits |= 4;
281 }
282 fn unsetIndic(self: *State) void {
283 self.bits ^= 4;
284 }
285};
286
287/// `graphemeBreak` returns true only if a grapheme break point is required 512/// `graphemeBreak` returns true only if a grapheme break point is required
288/// between `cp1` and `cp2`. `state` should start out as 0. If calling 513/// between `cp1` and `cp2`. `state` should start out as 0. If calling
289/// iteratively over a sequence of code points, this function must be called 514/// iteratively over a sequence of code points, this function must be called
@@ -294,7 +519,7 @@ pub fn graphemeBreak(
294 cp1: u21, 519 cp1: u21,
295 cp2: u21, 520 cp2: u21,
296 data: *const Graphemes, 521 data: *const Graphemes,
297 state: *State, 522 state: *IterState,
298) bool { 523) bool {
299 // Extract relevant properties. 524 // Extract relevant properties.
300 const cp1_gbp_prop = data.gbp(cp1); 525 const cp1_gbp_prop = data.gbp(cp1);
@@ -306,9 +531,9 @@ pub fn graphemeBreak(
306 const cp2_is_emoji = data.isEmoji(cp2); 531 const cp2_is_emoji = data.isEmoji(cp2);
307 532
308 // GB11: Emoji Extend* ZWJ x Emoji 533 // GB11: Emoji Extend* ZWJ x Emoji
309 if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); 534 if (!state.xpic and cp1_is_emoji) state.xpic = true;
310 // GB9c: Indic Conjunct Break 535 // GB9c: Indic Conjunct Break
311 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); 536 if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
312 537
313 // GB3: CR x LF 538 // GB3: CR x LF
314 if (cp1 == '\r' and cp2 == '\n') return false; 539 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -317,11 +542,11 @@ pub fn graphemeBreak(
317 if (isBreaker(cp1, data)) return true; 542 if (isBreaker(cp1, data)) return true;
318 543
319 // GB11: Emoji Extend* ZWJ x Emoji 544 // GB11: Emoji Extend* ZWJ x Emoji
320 if (state.hasXpic() and 545 if (state.xpic and
321 cp1_gbp_prop == .ZWJ and 546 cp1_gbp_prop == .ZWJ and
322 cp2_is_emoji) 547 cp2_is_emoji)
323 { 548 {
324 state.unsetXpic(); 549 state.xpic = false;
325 return false; 550 return false;
326 } 551 }
327 552
@@ -336,11 +561,11 @@ pub fn graphemeBreak(
336 561
337 // GB12, GB13: RI x RI 562 // GB12, GB13: RI x RI
338 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 563 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
339 if (state.hasRegional()) { 564 if (state.regional) {
340 state.unsetRegional(); 565 state.regional = false;
341 return true; 566 return true;
342 } else { 567 } else {
343 state.setRegional(); 568 state.regional = true;
344 return false; 569 return false;
345 } 570 }
346 } 571 }
@@ -365,25 +590,25 @@ pub fn graphemeBreak(
365 } 590 }
366 591
367 // GB9c: Indic Conjunct Break 592 // GB9c: Indic Conjunct Break
368 if (state.hasIndic() and 593 if (state.indic and
369 cp1_indic_prop == .Consonant and 594 cp1_indic_prop == .Consonant and
370 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) 595 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
371 { 596 {
372 return false; 597 return false;
373 } 598 }
374 599
375 if (state.hasIndic() and 600 if (state.indic and
376 cp1_indic_prop == .Extend and 601 cp1_indic_prop == .Extend and
377 cp2_indic_prop == .Linker) 602 cp2_indic_prop == .Linker)
378 { 603 {
379 return false; 604 return false;
380 } 605 }
381 606
382 if (state.hasIndic() and 607 if (state.indic and
383 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and 608 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
384 cp2_indic_prop == .Consonant) 609 cp2_indic_prop == .Consonant)
385 { 610 {
386 state.unsetIndic(); 611 state.indic = false;
387 return false; 612 return false;
388 } 613 }
389 614
@@ -421,3 +646,39 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
421 try std.testing.expectEqual(@as(usize, 2), i); 646 try std.testing.expectEqual(@as(usize, 2), i);
422 } 647 }
423} 648}
649
650test "Iterator.peek" {
651 const peek_seq = "aΔ👨🏻‍🌾→";
652 const data = try Graphemes.init(std.testing.allocator);
653 defer data.deinit(std.testing.allocator);
654
655 var iter = data.iterator(peek_seq);
656 const peek_a = iter.peek().?;
657 const next_a = iter.next().?;
658 try std.testing.expectEqual(peek_a, next_a);
659 try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
660 const peek_d1 = iter.peek().?;
661 const peek_d2 = iter.peek().?;
662 try std.testing.expectEqual(peek_d1, peek_d2);
663 const next_d = iter.next().?;
664 try std.testing.expectEqual(peek_d2, next_d);
665 try std.testing.expectEqual(iter.peek(), iter.next());
666 try std.testing.expectEqual(iter.peek(), iter.next());
667 try std.testing.expectEqual(null, iter.peek());
668 try std.testing.expectEqual(null, iter.peek());
669 try std.testing.expectEqual(iter.peek(), iter.next());
670}
671
672const std = @import("std");
673const builtin = @import("builtin");
674const assert = std.debug.assert;
675const mem = std.mem;
676const Allocator = mem.Allocator;
677const compress = std.compress;
678const unicode = std.unicode;
679
680const code_point = @import("code_point");
681const CodePoint = code_point.CodePoint;
682const CodePointIterator = code_point.Iterator;
683const CodePointReverseIterator = code_point.ReverseIterator;
684const uoffset = code_point.uoffset;