From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/Graphemes.zig | 220 +++++++++++++++++++++++++++++++++----------------- src/Words.zig | 4 +- src/code_point.zig | 60 +++++++++++++- src/unicode_tests.zig | 69 +++++++++++++--- 4 files changed, 266 insertions(+), 87 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 49fdbf3..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -1,15 +1,7 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const mem = std.mem; -const Allocator = mem.Allocator; -const compress = std.compress; -const unicode = std.unicode; - -const code_point = @import("code_point"); -const CodePoint = code_point.CodePoint; -const CodePointIterator = code_point.Iterator; -const CodePointReverseIterator = code_point.ReverseIterator; -const uoffset = code_point.uoffset; +//! Graphemes Module +//! +//! Code for handling graphemes: fragments of string which should be +//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 s1: []u16 = undefined, s2: []u16 = undefined, @@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; } +/// Returns an iterator over the graphemes in `string`. pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +/// Returns a reverse iterator over the graphemes in `string`. pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { return ReverseIterator.init(string, graphemes); } @@ -116,6 +110,96 @@ pub const Grapheme = struct { } }; +// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage +// of being composed of other parts, but the constant factor can _probably_ be improved +// by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit +// to how much cycle-bumming I'm willing to do at any given moment; that limit has been +// reached. Perhaps you, Dear Reader, might pick up the torch? + +/// Returns the `Grapheme` at `string[index]`, which does not have to be a +/// valid start of a codepoint. Asserts the string is not empty. Index must be +/// less than `string.len`. Always returns a `Grapheme`. +pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { + assert(string.len != 0); + if (index == 0 or (index > 0 and + string[index] < 0x80 and + string[index - 1] < 0x80) and + (string[index - 1] != '\r' and string[index] != '\n')) + { + // There's always a grapheme break between two ASCII code points (except CRLF) + var iter = graphemes.iterator(string[index..]); + const next = iter.next().?; + return Grapheme{ + .len = next.len, + .offset = @as(u32, @intCast(index)) + next.offset, + }; + } // Otherwise it gets hairy. + const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; + if (idx == string.len) { + var iter = graphemes.reverseIterator(string); + return iter.prev().?; + } + // We're on a valid codepoint boundary, we go back from here + var r_iter = graphemes.reverseIterAtIndex(string, idx); + if (r_iter.prev()) |g| { + if (g.offset == 0) { + var iter = graphemes.iterator(string); + while (iter.next()) |g2| { + if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; + } + } + } + // We need to toss one, because otherwise we might not be pending when + // we in fact need to be. + _ = r_iter.prev(); + while (r_iter.pending != .none) : (_ = r_iter.prev()) {} + var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); + while (iter.next()) |g| { + if (g.offset <= idx and idx < g.offset + g.len) return g; + } + unreachable; +} + +/// Return a (forward) iterator of `string` after `grapheme`. +pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { + return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); +} + +/// Return a reverse iterator of `string` before `grapheme`. +pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { + // This bit of weirdness is because reverse iterators are "advance last", + // while forward iterators are "advance first". This leaves some room for + // further optimization, if anyone dares. + var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); + _ = r_iter.prev(); + return r_iter; +} + +fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.data = graphemes; + var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + r_iter.buf[1] = rcp_iter.prev(); + r_iter.buf[0] = rcp_iter.prev(); + r_iter.pending = .none; + r_iter.cp_iter = rcp_iter; + return r_iter; +} + +fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { + var iter: Iterator = undefined; + iter.data = graphemes; + iter.buf[0] = first: { + if (idx == string.len) break :first null; + var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + break :first r_cp_iter.prev(); + }; + var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; + iter.buf[1] = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, @@ -150,7 +234,7 @@ pub const Iterator = struct { const gc_start = self.buf[0].?.offset; var gc_len: u8 = self.buf[0].?.len; - var state = State{}; + var state = IterState{}; if (graphemeBreak( self.buf[0].?.code, @@ -189,12 +273,13 @@ pub const Iterator = struct { } }; +/// Iterate a string backward by Grapheme. pub const ReverseIterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointReverseIterator, data: *const Graphemes, /// Codepoint read from `cp_iter` but not returned by `previous` - pending: Pending = .{ .none = {} }, + pending: Pending = .none, const Pending = union(enum) { none: void, @@ -218,6 +303,12 @@ pub const ReverseIterator = struct { self.buf[0] = self.cp_iter.prev(); } + pub fn peek(self: *Self) ?Grapheme { + const cache = .{ self.buf, self.cp_iter, self.pending }; + defer self.buf, self.cp_iter, self.pending = cache; + return self.prev(); + } + pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; @@ -255,10 +346,10 @@ pub const ReverseIterator = struct { }; while (self.buf[0] != null) { - var state: State = .{}; - state.setXpic(); - state.unsetRegional(); - state.setIndic(); + var state: IterState = .{}; + state.xpic = true; + state.regional = false; + state.indic = true; if (graphemeBreak( self.buf[0].?.code, @@ -269,7 +360,7 @@ pub const ReverseIterator = struct { self.advance(); - if (!state.hasIndic()) { + if (!state.indic) { // BUF: [?Any, Extend | Linker] Consonant var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -296,11 +387,11 @@ pub const ReverseIterator = struct { self.advance(); if (self.buf[0]) |cp1| { - state.setIndic(); + state.indic = true; if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; - if (!state.hasIndic()) { + if (!state.indic) { continue :indic; } else { break :indic; @@ -321,7 +412,7 @@ pub const ReverseIterator = struct { } } - if (!state.hasXpic()) { + if (!state.xpic) { // BUF: [?Any, ZWJ] Emoji var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -370,7 +461,7 @@ pub const ReverseIterator = struct { } } - if (state.hasRegional()) { + if (state.regional) { var ri_count: usize = 0; while (self.buf[0] != null and self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) @@ -404,6 +495,13 @@ pub const ReverseIterator = struct { } }; +/// Grapheme Iterator state. +pub const IterState = packed struct(u3) { + xpic: bool = false, + regional: bool = false, + indic: bool = false, +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. @@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } -// Grapheme break state. -pub const State = struct { - bits: u3 = 0, - - // Extended Pictographic (emoji) - fn hasXpic(self: State) bool { - return self.bits & 1 == 1; - } - fn setXpic(self: *State) void { - self.bits |= 1; - } - fn unsetXpic(self: *State) void { - self.bits &= ~@as(u3, 1); - } - - // Regional Indicatior (flags) - fn hasRegional(self: State) bool { - return self.bits & 2 == 2; - } - fn setRegional(self: *State) void { - self.bits |= 2; - } - fn unsetRegional(self: *State) void { - self.bits &= ~@as(u3, 2); - } - - // Indic Conjunct - fn hasIndic(self: State) bool { - return self.bits & 4 == 4; - } - fn setIndic(self: *State) void { - self.bits |= 4; - } - fn unsetIndic(self: *State) void { - self.bits &= ~@as(u3, 4); - } -}; - /// `graphemeBreak` returns true only if a grapheme break point is required /// between `cp1` and `cp2`. `state` should start out as 0. If calling /// iteratively over a sequence of code points, this function must be called @@ -459,7 +519,7 @@ pub fn graphemeBreak( cp1: u21, cp2: u21, data: *const Graphemes, - state: *State, + state: *IterState, ) bool { // Extract relevant properties. const cp1_gbp_prop = data.gbp(cp1); @@ -471,9 +531,9 @@ pub fn graphemeBreak( const cp2_is_emoji = data.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji - if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); + if (!state.xpic and cp1_is_emoji) state.xpic = true; // GB9c: Indic Conjunct Break - if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); + if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; // GB3: CR x LF if (cp1 == '\r' and cp2 == '\n') return false; @@ -482,11 +542,11 @@ pub fn graphemeBreak( if (isBreaker(cp1, data)) return true; // GB11: Emoji Extend* ZWJ x Emoji - if (state.hasXpic() and + if (state.xpic and cp1_gbp_prop == .ZWJ and cp2_is_emoji) { - state.unsetXpic(); + state.xpic = false; return false; } @@ -501,11 +561,11 @@ pub fn graphemeBreak( // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { - if (state.hasRegional()) { - state.unsetRegional(); + if (state.regional) { + state.regional = false; return true; } else { - state.setRegional(); + state.regional = true; return false; } } @@ -530,25 +590,25 @@ pub fn graphemeBreak( } // GB9c: Indic Conjunct Break - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Consonant and (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) { return false; } - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Extend and cp2_indic_prop == .Linker) { return false; } - if (state.hasIndic() and + if (state.indic and (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and cp2_indic_prop == .Consonant) { - state.unsetIndic(); + state.indic = false; return false; } @@ -608,3 +668,17 @@ test "Iterator.peek" { try std.testing.expectEqual(null, iter.peek()); try std.testing.expectEqual(iter.peek(), iter.next()); } + +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const mem = std.mem; +const Allocator = mem.Allocator; +const compress = std.compress; +const unicode = std.unicode; + +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; diff --git a/src/Words.zig b/src/Words.zig index 1707881..af82562 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -124,12 +124,12 @@ pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { } /// Returns an iterator after the `word` in `slice`. -pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { +pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { return forwardFromIndex(words, slice, word.offset + word.len); } /// Returns a reverse iterator before the `word` in `slice`. -pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { +pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { return reverseFromIndex(words, slice, word.offset); } diff --git a/src/code_point.zig b/src/code_point.zig index 8bd3d5b..16648af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { return null; } +/// Return the codepoint at `index`, even if `index` is in the middle +/// of that codepoint. +pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { + var idx = index; + while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} + return decodeAtIndex(bytes, idx); +} + /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { - var off = idx; +pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { + var off = index; return decodeAtCursor(bytes, &off); } @@ -329,6 +337,54 @@ test Iterator { try expectEqual(@as(?CodePoint, null), iter.next()); } +const code_point = @This(); + +// Keep this in sync with the README +test "Code point iterator" { + const str = "Hi 😊"; + var iter: code_point.Iterator = .init(str); + var i: usize = 0; + + while (iter.next()) |cp| : (i += 1) { + // The `code` field is the actual code point scalar as a `u21`. + if (i == 0) try expect(cp.code == 'H'); + if (i == 1) try expect(cp.code == 'i'); + if (i == 2) try expect(cp.code == ' '); + + if (i == 3) { + try expect(cp.code == '😊'); + // The `offset` field is the byte offset in the + // source string. + try expect(cp.offset == 3); + try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); + // The `len` field is the length in bytes of the + // code point in the source string. + try expect(cp.len == 4); + // There is also a 'cursor' decode, like so: + { + var cursor = cp.offset; + try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); + // Which advances the cursor variable to the next possible + // offset, in this case, `str.len`. Don't forget to account + // for this possibility! + try expectEqual(cp.offset + cp.len, cursor); + } + // There's also this, for when you aren't sure if you have the + // correct start for a code point: + try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); + } + // Reverse iteration is also an option: + var r_iter: code_point.ReverseIterator = .init(str); + // Both iterators can be peeked: + try expectEqual('😊', r_iter.peek().?.code); + try expectEqual('😊', r_iter.prev().?.code); + // Both kinds of iterators can be reversed: + var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); + // This will always return the last codepoint from + // the prior iterator, _if_ it yielded one: + try expectEqual('😊', fwd_iter.next().?.code); + } +} test "overlongs" { // None of these should equal `/`, all should be byte-for-byte // handled as replacement characters. diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index c463dcc..ae177a9 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -162,20 +162,51 @@ test "Segmentation GraphemeIterator" { bytes_index += cp_index; } + const this_str = all_bytes.items; + { - var iter = graph.iterator(all_bytes.items); + var iter = graph.iterator(this_str); // Check. - for (want.items) |want_gc| { + for (want.items, 1..) |want_gc, idx| { const got_gc = (iter.next()).?; try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), + want_gc.bytes(this_str), + got_gc.bytes(this_str), ); + for (got_gc.offset..got_gc.offset + got_gc.len) |i| { + const this_gc = graph.graphemeAtIndex(this_str, i); + std.testing.expectEqualSlices( + u8, + got_gc.bytes(this_str), + this_gc.bytes(this_str), + ) catch |err| { + debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); + return err; + }; + } + var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); + if (after_iter.next()) |next_gc| { + if (iter.peek()) |next_peek| { + std.testing.expectEqualSlices( + u8, + next_gc.bytes(this_str), + next_peek.bytes(this_str), + ) catch |err| { + debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx }); + return err; + }; + } else { + debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx }); + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } } } { - var iter = graph.reverseIterator(all_bytes.items); + var iter = graph.reverseIterator(this_str); // Check. var i: usize = want.items.len; @@ -190,8 +221,8 @@ test "Segmentation GraphemeIterator" { return error.TestExpectedEqual; }; std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), + want_gc.bytes(this_str), + got_gc.bytes(this_str), ) catch |err| { std.debug.print( "line {d} grapheme {d}: expected {any} found {any}\n", @@ -199,6 +230,24 @@ test "Segmentation GraphemeIterator" { ); return err; }; + var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); + if (before_iter.prev()) |prev_gc| { + if (iter.peek()) |prev_peek| { + std.testing.expectEqualSlices( + u8, + prev_gc.bytes(this_str), + prev_peek.bytes(this_str), + ) catch |err| { + debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i }); + return err; + }; + } else { + debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i }); + try testing.expect(false); + } + } else { + try testing.expectEqual(null, iter.peek()); + } } } } @@ -287,7 +336,7 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } - var peek_iter = wb.iterateAfter(this_str, got_word); + var peek_iter = wb.iterateAfterWord(this_str, got_word); const peek_1 = peek_iter.next(); if (peek_1) |p1| { const peek_2 = iter.peek(); @@ -313,7 +362,7 @@ test "Segmentation Word Iterator" { got_word.bytes(this_str), this_word.bytes(this_str), ) catch |err| { - debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); + debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); return err; }; } @@ -356,7 +405,7 @@ test "Segmentation Word Iterator" { } else { try testing.expect(false); } - var peek_iter = wb.iterateBefore(this_str, got_word); + var peek_iter = wb.iterateBeforeWord(this_str, got_word); const peek_1 = peek_iter.prev(); if (peek_1) |p1| { const peek_2 = r_iter.peek(); -- cgit v1.2.3