summaryrefslogtreecommitdiff
path: root/src/Graphemes.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-06-01 14:08:25 -0400
committerGravatar Sam Atman2025-06-01 14:08:25 -0400
commit8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 (patch)
tree4ec54815215a9a808be0ab9a2968159f144ba076 /src/Graphemes.zig
parentDocument "fat_offset" in README (diff)
downloadzg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.gz
zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.xz
zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.zip
Add graphemeAtIndex + iterate before and after
That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do.
Diffstat (limited to 'src/Graphemes.zig')
-rw-r--r--src/Graphemes.zig220
1 files changed, 147 insertions, 73 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 49fdbf3..f1c56ed 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -1,15 +1,7 @@
1const std = @import("std"); 1//! Graphemes Module
2const builtin = @import("builtin"); 2//!
3const mem = std.mem; 3//! Code for handling graphemes: fragments of string which should be
4const Allocator = mem.Allocator; 4//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾
5const compress = std.compress;
6const unicode = std.unicode;
7
8const code_point = @import("code_point");
9const CodePoint = code_point.CodePoint;
10const CodePointIterator = code_point.Iterator;
11const CodePointReverseIterator = code_point.ReverseIterator;
12const uoffset = code_point.uoffset;
13 5
14s1: []u16 = undefined, 6s1: []u16 = undefined,
15s2: []u16 = undefined, 7s2: []u16 = undefined,
@@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
69 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; 61 return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
70} 62}
71 63
64/// Returns an iterator over the graphemes in `string`.
72pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { 65pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
73 return Iterator.init(string, graphemes); 66 return Iterator.init(string, graphemes);
74} 67}
75 68
69/// Returns a reverse iterator over the graphemes in `string`.
76pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { 70pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
77 return ReverseIterator.init(string, graphemes); 71 return ReverseIterator.init(string, graphemes);
78} 72}
@@ -116,6 +110,96 @@ pub const Grapheme = struct {
116 } 110 }
117}; 111};
118 112
113// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage
114// of being composed of other parts, but the constant factor can _probably_ be improved
115// by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit
116// to how much cycle-bumming I'm willing to do at any given moment; that limit has been
117// reached. Perhaps you, Dear Reader, might pick up the torch?
118
119/// Returns the `Grapheme` at `string[index]`, which does not have to be a
120/// valid start of a codepoint. Asserts the string is not empty. Index must be
121/// less than `string.len`. Always returns a `Grapheme`.
122pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
123 assert(string.len != 0);
124 if (index == 0 or (index > 0 and
125 string[index] < 0x80 and
126 string[index - 1] < 0x80) and
127 (string[index - 1] != '\r' and string[index] != '\n'))
128 {
129 // There's always a grapheme break between two ASCII code points (except CRLF)
130 var iter = graphemes.iterator(string[index..]);
131 const next = iter.next().?;
132 return Grapheme{
133 .len = next.len,
134 .offset = @as(u32, @intCast(index)) + next.offset,
135 };
136 } // Otherwise it gets hairy.
137 const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
138 if (idx == string.len) {
139 var iter = graphemes.reverseIterator(string);
140 return iter.prev().?;
141 }
142 // We're on a valid codepoint boundary, we go back from here
143 var r_iter = graphemes.reverseIterAtIndex(string, idx);
144 if (r_iter.prev()) |g| {
145 if (g.offset == 0) {
146 var iter = graphemes.iterator(string);
147 while (iter.next()) |g2| {
148 if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
149 }
150 }
151 }
152 // We need to toss one, because otherwise we might not be pending when
153 // we in fact need to be.
154 _ = r_iter.prev();
155 while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
156 var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
157 while (iter.next()) |g| {
158 if (g.offset <= idx and idx < g.offset + g.len) return g;
159 }
160 unreachable;
161}
162
163/// Return a (forward) iterator of `string` after `grapheme`.
164pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
165 return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
166}
167
168/// Return a reverse iterator of `string` before `grapheme`.
169pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
170 // This bit of weirdness is because reverse iterators are "advance last",
171 // while forward iterators are "advance first". This leaves some room for
172 // further optimization, if anyone dares.
173 var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
174 _ = r_iter.prev();
175 return r_iter;
176}
177
178fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
179 var r_iter: ReverseIterator = undefined;
180 r_iter.data = graphemes;
181 var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
182 r_iter.buf[1] = rcp_iter.prev();
183 r_iter.buf[0] = rcp_iter.prev();
184 r_iter.pending = .none;
185 r_iter.cp_iter = rcp_iter;
186 return r_iter;
187}
188
189fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
190 var iter: Iterator = undefined;
191 iter.data = graphemes;
192 iter.buf[0] = first: {
193 if (idx == string.len) break :first null;
194 var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
195 break :first r_cp_iter.prev();
196 };
197 var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx };
198 iter.buf[1] = cp_iter.next();
199 iter.cp_iter = cp_iter;
200 return iter;
201}
202
119/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. 203/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
120pub const Iterator = struct { 204pub const Iterator = struct {
121 buf: [2]?CodePoint = .{ null, null }, 205 buf: [2]?CodePoint = .{ null, null },
@@ -150,7 +234,7 @@ pub const Iterator = struct {
150 234
151 const gc_start = self.buf[0].?.offset; 235 const gc_start = self.buf[0].?.offset;
152 var gc_len: u8 = self.buf[0].?.len; 236 var gc_len: u8 = self.buf[0].?.len;
153 var state = State{}; 237 var state = IterState{};
154 238
155 if (graphemeBreak( 239 if (graphemeBreak(
156 self.buf[0].?.code, 240 self.buf[0].?.code,
@@ -189,12 +273,13 @@ pub const Iterator = struct {
189 } 273 }
190}; 274};
191 275
276/// Iterate a string backward by Grapheme.
192pub const ReverseIterator = struct { 277pub const ReverseIterator = struct {
193 buf: [2]?CodePoint = .{ null, null }, 278 buf: [2]?CodePoint = .{ null, null },
194 cp_iter: CodePointReverseIterator, 279 cp_iter: CodePointReverseIterator,
195 data: *const Graphemes, 280 data: *const Graphemes,
196 /// Codepoint read from `cp_iter` but not returned by `previous` 281 /// Codepoint read from `cp_iter` but not returned by `previous`
197 pending: Pending = .{ .none = {} }, 282 pending: Pending = .none,
198 283
199 const Pending = union(enum) { 284 const Pending = union(enum) {
200 none: void, 285 none: void,
@@ -218,6 +303,12 @@ pub const ReverseIterator = struct {
218 self.buf[0] = self.cp_iter.prev(); 303 self.buf[0] = self.cp_iter.prev();
219 } 304 }
220 305
306 pub fn peek(self: *Self) ?Grapheme {
307 const cache = .{ self.buf, self.cp_iter, self.pending };
308 defer self.buf, self.cp_iter, self.pending = cache;
309 return self.prev();
310 }
311
221 pub fn prev(self: *Self) ?Grapheme { 312 pub fn prev(self: *Self) ?Grapheme {
222 if (self.buf[1] == null) return null; 313 if (self.buf[1] == null) return null;
223 314
@@ -255,10 +346,10 @@ pub const ReverseIterator = struct {
255 }; 346 };
256 347
257 while (self.buf[0] != null) { 348 while (self.buf[0] != null) {
258 var state: State = .{}; 349 var state: IterState = .{};
259 state.setXpic(); 350 state.xpic = true;
260 state.unsetRegional(); 351 state.regional = false;
261 state.setIndic(); 352 state.indic = true;
262 353
263 if (graphemeBreak( 354 if (graphemeBreak(
264 self.buf[0].?.code, 355 self.buf[0].?.code,
@@ -269,7 +360,7 @@ pub const ReverseIterator = struct {
269 360
270 self.advance(); 361 self.advance();
271 362
272 if (!state.hasIndic()) { 363 if (!state.indic) {
273 364
274 // BUF: [?Any, Extend | Linker] Consonant 365 // BUF: [?Any, Extend | Linker] Consonant
275 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; 366 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
@@ -296,11 +387,11 @@ pub const ReverseIterator = struct {
296 self.advance(); 387 self.advance();
297 388
298 if (self.buf[0]) |cp1| { 389 if (self.buf[0]) |cp1| {
299 state.setIndic(); 390 state.indic = true;
300 391
301 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; 392 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
302 393
303 if (!state.hasIndic()) { 394 if (!state.indic) {
304 continue :indic; 395 continue :indic;
305 } else { 396 } else {
306 break :indic; 397 break :indic;
@@ -321,7 +412,7 @@ pub const ReverseIterator = struct {
321 } 412 }
322 } 413 }
323 414
324 if (!state.hasXpic()) { 415 if (!state.xpic) {
325 // BUF: [?Any, ZWJ] Emoji 416 // BUF: [?Any, ZWJ] Emoji
326 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; 417 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
327 418
@@ -370,7 +461,7 @@ pub const ReverseIterator = struct {
370 } 461 }
371 } 462 }
372 463
373 if (state.hasRegional()) { 464 if (state.regional) {
374 var ri_count: usize = 0; 465 var ri_count: usize = 0;
375 while (self.buf[0] != null and 466 while (self.buf[0] != null and
376 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) 467 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
@@ -404,6 +495,13 @@ pub const ReverseIterator = struct {
404 } 495 }
405}; 496};
406 497
498/// Grapheme Iterator state.
499pub const IterState = packed struct(u3) {
500 xpic: bool = false,
501 regional: bool = false,
502 indic: bool = false,
503};
504
407// Predicates 505// Predicates
408fn isBreaker(cp: u21, data: *const Graphemes) bool { 506fn isBreaker(cp: u21, data: *const Graphemes) bool {
409 // Extract relevant properties. 507 // Extract relevant properties.
@@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
411 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 509 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
412} 510}
413 511
414// Grapheme break state.
415pub const State = struct {
416 bits: u3 = 0,
417
418 // Extended Pictographic (emoji)
419 fn hasXpic(self: State) bool {
420 return self.bits & 1 == 1;
421 }
422 fn setXpic(self: *State) void {
423 self.bits |= 1;
424 }
425 fn unsetXpic(self: *State) void {
426 self.bits &= ~@as(u3, 1);
427 }
428
429 // Regional Indicatior (flags)
430 fn hasRegional(self: State) bool {
431 return self.bits & 2 == 2;
432 }
433 fn setRegional(self: *State) void {
434 self.bits |= 2;
435 }
436 fn unsetRegional(self: *State) void {
437 self.bits &= ~@as(u3, 2);
438 }
439
440 // Indic Conjunct
441 fn hasIndic(self: State) bool {
442 return self.bits & 4 == 4;
443 }
444 fn setIndic(self: *State) void {
445 self.bits |= 4;
446 }
447 fn unsetIndic(self: *State) void {
448 self.bits &= ~@as(u3, 4);
449 }
450};
451
452/// `graphemeBreak` returns true only if a grapheme break point is required 512/// `graphemeBreak` returns true only if a grapheme break point is required
453/// between `cp1` and `cp2`. `state` should start out as 0. If calling 513/// between `cp1` and `cp2`. `state` should start out as 0. If calling
454/// iteratively over a sequence of code points, this function must be called 514/// iteratively over a sequence of code points, this function must be called
@@ -459,7 +519,7 @@ pub fn graphemeBreak(
459 cp1: u21, 519 cp1: u21,
460 cp2: u21, 520 cp2: u21,
461 data: *const Graphemes, 521 data: *const Graphemes,
462 state: *State, 522 state: *IterState,
463) bool { 523) bool {
464 // Extract relevant properties. 524 // Extract relevant properties.
465 const cp1_gbp_prop = data.gbp(cp1); 525 const cp1_gbp_prop = data.gbp(cp1);
@@ -471,9 +531,9 @@ pub fn graphemeBreak(
471 const cp2_is_emoji = data.isEmoji(cp2); 531 const cp2_is_emoji = data.isEmoji(cp2);
472 532
473 // GB11: Emoji Extend* ZWJ x Emoji 533 // GB11: Emoji Extend* ZWJ x Emoji
474 if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); 534 if (!state.xpic and cp1_is_emoji) state.xpic = true;
475 // GB9c: Indic Conjunct Break 535 // GB9c: Indic Conjunct Break
476 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); 536 if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true;
477 537
478 // GB3: CR x LF 538 // GB3: CR x LF
479 if (cp1 == '\r' and cp2 == '\n') return false; 539 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -482,11 +542,11 @@ pub fn graphemeBreak(
482 if (isBreaker(cp1, data)) return true; 542 if (isBreaker(cp1, data)) return true;
483 543
484 // GB11: Emoji Extend* ZWJ x Emoji 544 // GB11: Emoji Extend* ZWJ x Emoji
485 if (state.hasXpic() and 545 if (state.xpic and
486 cp1_gbp_prop == .ZWJ and 546 cp1_gbp_prop == .ZWJ and
487 cp2_is_emoji) 547 cp2_is_emoji)
488 { 548 {
489 state.unsetXpic(); 549 state.xpic = false;
490 return false; 550 return false;
491 } 551 }
492 552
@@ -501,11 +561,11 @@ pub fn graphemeBreak(
501 561
502 // GB12, GB13: RI x RI 562 // GB12, GB13: RI x RI
503 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 563 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
504 if (state.hasRegional()) { 564 if (state.regional) {
505 state.unsetRegional(); 565 state.regional = false;
506 return true; 566 return true;
507 } else { 567 } else {
508 state.setRegional(); 568 state.regional = true;
509 return false; 569 return false;
510 } 570 }
511 } 571 }
@@ -530,25 +590,25 @@ pub fn graphemeBreak(
530 } 590 }
531 591
532 // GB9c: Indic Conjunct Break 592 // GB9c: Indic Conjunct Break
533 if (state.hasIndic() and 593 if (state.indic and
534 cp1_indic_prop == .Consonant and 594 cp1_indic_prop == .Consonant and
535 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) 595 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
536 { 596 {
537 return false; 597 return false;
538 } 598 }
539 599
540 if (state.hasIndic() and 600 if (state.indic and
541 cp1_indic_prop == .Extend and 601 cp1_indic_prop == .Extend and
542 cp2_indic_prop == .Linker) 602 cp2_indic_prop == .Linker)
543 { 603 {
544 return false; 604 return false;
545 } 605 }
546 606
547 if (state.hasIndic() and 607 if (state.indic and
548 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and 608 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
549 cp2_indic_prop == .Consonant) 609 cp2_indic_prop == .Consonant)
550 { 610 {
551 state.unsetIndic(); 611 state.indic = false;
552 return false; 612 return false;
553 } 613 }
554 614
@@ -608,3 +668,17 @@ test "Iterator.peek" {
608 try std.testing.expectEqual(null, iter.peek()); 668 try std.testing.expectEqual(null, iter.peek());
609 try std.testing.expectEqual(iter.peek(), iter.next()); 669 try std.testing.expectEqual(iter.peek(), iter.next());
610} 670}
671
672const std = @import("std");
673const builtin = @import("builtin");
674const assert = std.debug.assert;
675const mem = std.mem;
676const Allocator = mem.Allocator;
677const compress = std.compress;
678const unicode = std.unicode;
679
680const code_point = @import("code_point");
681const CodePoint = code_point.CodePoint;
682const CodePointIterator = code_point.Iterator;
683const CodePointReverseIterator = code_point.ReverseIterator;
684const uoffset = code_point.uoffset;