From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 147 insertions(+), 73 deletions(-) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 49fdbf3..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -1,15 +1,7 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const mem = std.mem; -const Allocator = mem.Allocator; -const compress = std.compress; -const unicode = std.unicode; - -const code_point = @import("code_point"); -const CodePoint = code_point.CodePoint; -const CodePointIterator = code_point.Iterator; -const CodePointReverseIterator = code_point.ReverseIterator; -const uoffset = code_point.uoffset; +//! Graphemes Module +//! +//! Code for handling graphemes: fragments of string which should be +//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 s1: []u16 = undefined, s2: []u16 = undefined, @@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; } +/// Returns an iterator over the graphemes in `string`. pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +/// Returns a reverse iterator over the graphemes in `string`. pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { return ReverseIterator.init(string, graphemes); } @@ -116,6 +110,96 @@ pub const Grapheme = struct { } }; +// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage +// of being composed of other parts, but the constant factor can _probably_ be improved +// by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit +// to how much cycle-bumming I'm willing to do at any given moment; that limit has been +// reached. Perhaps you, Dear Reader, might pick up the torch? + +/// Returns the `Grapheme` at `string[index]`, which does not have to be a +/// valid start of a codepoint. Asserts the string is not empty. Index must be +/// less than `string.len`. Always returns a `Grapheme`. +pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { + assert(string.len != 0); + if (index == 0 or (index > 0 and + string[index] < 0x80 and + string[index - 1] < 0x80) and + (string[index - 1] != '\r' and string[index] != '\n')) + { + // There's always a grapheme break between two ASCII code points (except CRLF) + var iter = graphemes.iterator(string[index..]); + const next = iter.next().?; + return Grapheme{ + .len = next.len, + .offset = @as(u32, @intCast(index)) + next.offset, + }; + } // Otherwise it gets hairy. + const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; + if (idx == string.len) { + var iter = graphemes.reverseIterator(string); + return iter.prev().?; + } + // We're on a valid codepoint boundary, we go back from here + var r_iter = graphemes.reverseIterAtIndex(string, idx); + if (r_iter.prev()) |g| { + if (g.offset == 0) { + var iter = graphemes.iterator(string); + while (iter.next()) |g2| { + if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; + } + } + } + // We need to toss one, because otherwise we might not be pending when + // we in fact need to be. + _ = r_iter.prev(); + while (r_iter.pending != .none) : (_ = r_iter.prev()) {} + var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); + while (iter.next()) |g| { + if (g.offset <= idx and idx < g.offset + g.len) return g; + } + unreachable; +} + +/// Return a (forward) iterator of `string` after `grapheme`. +pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { + return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); +} + +/// Return a reverse iterator of `string` before `grapheme`. +pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { + // This bit of weirdness is because reverse iterators are "advance last", + // while forward iterators are "advance first". This leaves some room for + // further optimization, if anyone dares. + var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); + _ = r_iter.prev(); + return r_iter; +} + +fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.data = graphemes; + var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + r_iter.buf[1] = rcp_iter.prev(); + r_iter.buf[0] = rcp_iter.prev(); + r_iter.pending = .none; + r_iter.cp_iter = rcp_iter; + return r_iter; +} + +fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { + var iter: Iterator = undefined; + iter.data = graphemes; + iter.buf[0] = first: { + if (idx == string.len) break :first null; + var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + break :first r_cp_iter.prev(); + }; + var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; + iter.buf[1] = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, @@ -150,7 +234,7 @@ pub const Iterator = struct { const gc_start = self.buf[0].?.offset; var gc_len: u8 = self.buf[0].?.len; - var state = State{}; + var state = IterState{}; if (graphemeBreak( self.buf[0].?.code, @@ -189,12 +273,13 @@ pub const Iterator = struct { } }; +/// Iterate a string backward by Grapheme. pub const ReverseIterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointReverseIterator, data: *const Graphemes, /// Codepoint read from `cp_iter` but not returned by `previous` - pending: Pending = .{ .none = {} }, + pending: Pending = .none, const Pending = union(enum) { none: void, @@ -218,6 +303,12 @@ pub const ReverseIterator = struct { self.buf[0] = self.cp_iter.prev(); } + pub fn peek(self: *Self) ?Grapheme { + const cache = .{ self.buf, self.cp_iter, self.pending }; + defer self.buf, self.cp_iter, self.pending = cache; + return self.prev(); + } + pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; @@ -255,10 +346,10 @@ pub const ReverseIterator = struct { }; while (self.buf[0] != null) { - var state: State = .{}; - state.setXpic(); - state.unsetRegional(); - state.setIndic(); + var state: IterState = .{}; + state.xpic = true; + state.regional = false; + state.indic = true; if (graphemeBreak( self.buf[0].?.code, @@ -269,7 +360,7 @@ pub const ReverseIterator = struct { self.advance(); - if (!state.hasIndic()) { + if (!state.indic) { // BUF: [?Any, Extend | Linker] Consonant var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -296,11 +387,11 @@ pub const ReverseIterator = struct { self.advance(); if (self.buf[0]) |cp1| { - state.setIndic(); + state.indic = true; if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; - if (!state.hasIndic()) { + if (!state.indic) { continue :indic; } else { break :indic; @@ -321,7 +412,7 @@ pub const ReverseIterator = struct { } } - if (!state.hasXpic()) { + if (!state.xpic) { // BUF: [?Any, ZWJ] Emoji var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -370,7 +461,7 @@ pub const ReverseIterator = struct { } } - if (state.hasRegional()) { + if (state.regional) { var ri_count: usize = 0; while (self.buf[0] != null and self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) @@ -404,6 +495,13 @@ pub const ReverseIterator = struct { } }; +/// Grapheme Iterator state. +pub const IterState = packed struct(u3) { + xpic: bool = false, + regional: bool = false, + indic: bool = false, +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. @@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } -// Grapheme break state. -pub const State = struct { - bits: u3 = 0, - - // Extended Pictographic (emoji) - fn hasXpic(self: State) bool { - return self.bits & 1 == 1; - } - fn setXpic(self: *State) void { - self.bits |= 1; - } - fn unsetXpic(self: *State) void { - self.bits &= ~@as(u3, 1); - } - - // Regional Indicatior (flags) - fn hasRegional(self: State) bool { - return self.bits & 2 == 2; - } - fn setRegional(self: *State) void { - self.bits |= 2; - } - fn unsetRegional(self: *State) void { - self.bits &= ~@as(u3, 2); - } - - // Indic Conjunct - fn hasIndic(self: State) bool { - return self.bits & 4 == 4; - } - fn setIndic(self: *State) void { - self.bits |= 4; - } - fn unsetIndic(self: *State) void { - self.bits &= ~@as(u3, 4); - } -}; - /// `graphemeBreak` returns true only if a grapheme break point is required /// between `cp1` and `cp2`. `state` should start out as 0. If calling /// iteratively over a sequence of code points, this function must be called @@ -459,7 +519,7 @@ pub fn graphemeBreak( cp1: u21, cp2: u21, data: *const Graphemes, - state: *State, + state: *IterState, ) bool { // Extract relevant properties. const cp1_gbp_prop = data.gbp(cp1); @@ -471,9 +531,9 @@ pub fn graphemeBreak( const cp2_is_emoji = data.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji - if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); + if (!state.xpic and cp1_is_emoji) state.xpic = true; // GB9c: Indic Conjunct Break - if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); + if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; // GB3: CR x LF if (cp1 == '\r' and cp2 == '\n') return false; @@ -482,11 +542,11 @@ pub fn graphemeBreak( if (isBreaker(cp1, data)) return true; // GB11: Emoji Extend* ZWJ x Emoji - if (state.hasXpic() and + if (state.xpic and cp1_gbp_prop == .ZWJ and cp2_is_emoji) { - state.unsetXpic(); + state.xpic = false; return false; } @@ -501,11 +561,11 @@ pub fn graphemeBreak( // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { - if (state.hasRegional()) { - state.unsetRegional(); + if (state.regional) { + state.regional = false; return true; } else { - state.setRegional(); + state.regional = true; return false; } } @@ -530,25 +590,25 @@ pub fn graphemeBreak( } // GB9c: Indic Conjunct Break - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Consonant and (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) { return false; } - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Extend and cp2_indic_prop == .Linker) { return false; } - if (state.hasIndic() and + if (state.indic and (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and cp2_indic_prop == .Consonant) { - state.unsetIndic(); + state.indic = false; return false; } @@ -608,3 +668,17 @@ test "Iterator.peek" { try std.testing.expectEqual(null, iter.peek()); try std.testing.expectEqual(iter.peek(), iter.next()); } + +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const mem = std.mem; +const Allocator = mem.Allocator; +const compress = std.compress; +const unicode = std.unicode; + +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; -- cgit v1.2.3