From 94b1f37474c7444d8129445ae7984f922cb9c283 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:14 +0200 Subject: fix: State.unset* did toggle the bit instead of unsetting it --- src/Graphemes.zig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..5780ed4 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -258,7 +258,7 @@ pub const State = struct { self.bits |= 1; } fn unsetXpic(self: *State) void { - self.bits ^= 1; + self.bits &= ~@as(u3, 1); } // Regional Indicatior (flags) @@ -269,7 +269,7 @@ pub const State = struct { self.bits |= 2; } fn unsetRegional(self: *State) void { - self.bits ^= 2; + self.bits &= ~@as(u3, 2); } // Indic Conjunct @@ -280,7 +280,7 @@ pub const State = struct { self.bits |= 4; } fn unsetIndic(self: *State) void { - self.bits ^= 4; + self.bits &= ~@as(u3, 4); } }; -- cgit v1.2.3 From 890370f5479299940f505e1247c408064f789bd5 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:30 +0200 Subject: feat: add reverse grapheme iterator Closes #53 --- src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 5780ed4..3bff18d 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -7,6 +7,7 @@ const unicode = std.unicode; const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; +const CodePointReverseIterator = @import("code_point").ReverseIterator; s1: []u16 = undefined, s2: []u16 = undefined, @@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { + return ReverseIterator.init(string, graphemes); +} + /// Indic syllable type. pub const Indic = enum { none, @@ -239,6 +244,221 @@ pub const Iterator = struct { } }; +pub const ReverseIterator = struct { + buf: [2]?CodePoint = .{ null, null }, + cp_iter: CodePointReverseIterator, + data: *const Graphemes, + /// Codepoint read from `cp_iter` but not returned by `previous` + pending: Pending = .{ .none = {} }, + + const Pending = union(enum) { + none: void, + /// Count of pending RI codepoints, it is an even number + ri_count: usize, + /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji + extend_end: u32, + }; + + const Self = @This(); + + pub fn init(str: []const u8, data: *const Graphemes) Self { + var self: Self = .{ .cp_iter = .init(str), .data = data }; + self.advance(); + self.advance(); + return self; + } + + fn advance(self: *Self) void { + self.buf[1] = self.buf[0]; + self.buf[0] = self.cp_iter.prev(); + } + + pub fn prev(self: *Self) ?Grapheme { + if (self.buf[1] == null) return null; + + const grapheme_end: u32 = end: { + const codepoint = self.buf[1].?; + + switch (self.pending) { + // BUF: [?Any, Any] + .none => break :end codepoint.offset + codepoint.len, + .ri_count => |ri_count| { + std.debug.assert(ri_count > 0); + std.debug.assert(ri_count % 2 == 0); + + if (ri_count > 2) { + self.pending.ri_count -= 2; + + // Use the fact that all RI have length 4 in utf8 encoding + // since they are in range 0x1f1e6...0x1f1ff + // https://en.wikipedia.org/wiki/UTF-8#Encoding + return Grapheme{ + .len = 8, + .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), + }; + } else { + self.pending = .{ .none = {} }; + break :end codepoint.offset + codepoint.len + 4; + } + }, + // BUF: [?Any, Extend] Extend* ZWJ + .extend_end => |extend_end| { + self.pending = .{ .none = {} }; + break :end extend_end; + }, + } + }; + + while (self.buf[0] != null) { + var state: State = .{}; + state.setXpic(); + state.unsetRegional(); + state.setIndic(); + + if (graphemeBreak( + self.buf[0].?.code, + self.buf[1].?.code, + self.data, + &state, + )) break; + + self.advance(); + + if (!state.hasIndic()) { + + // BUF: [?Any, Extend | Linker] Consonant + var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + + indic: while (true) { + if (self.buf[0] == null) { + self.pending = .{ .extend_end = indic_offset }; + return .{ + .len = @intCast(grapheme_end - indic_offset), + .offset = indic_offset, + }; + } + + const codepoint = self.buf[0].?; + + switch (self.data.indic(codepoint.code)) { + .Extend, .Linker => { + self.advance(); + continue :indic; + }, + .Consonant => { + // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant + indic_offset = codepoint.offset; + self.advance(); + + if (self.buf[0]) |cp1| { + state.setIndic(); + + if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; + + if (!state.hasIndic()) { + continue :indic; + } else { + break :indic; + } + } else { + break :indic; + } + }, + .none => { + // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant + self.pending = .{ .extend_end = indic_offset }; + return .{ + .len = @intCast(grapheme_end - indic_offset), + .offset = indic_offset, + }; + }, + } + } + } + + if (!state.hasXpic()) { + // BUF: [?Any, ZWJ] Emoji + var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + + // Look for previous Emoji + emoji: while (true) { + if (self.buf[0] == null) { + self.pending = .{ .extend_end = emoji_offset }; + return .{ + .len = @intCast(grapheme_end - emoji_offset), + .offset = emoji_offset, + }; + } + + const codepoint = self.buf[0].?; + + if (self.data.gbp(codepoint.code) == .Extend) { + self.advance(); + continue :emoji; + } + + if (self.data.isEmoji(codepoint.code)) { + // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* + emoji_offset = codepoint.offset; + self.advance(); + + if (self.buf[0] != null and + // ZWJ = 0x200d + self.buf[0].?.code == 0x200d) + { + // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* + // Back at the beginning of the loop, "recursively" look for emoji + self.advance(); + continue :emoji; + } else { + // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* + break :emoji; + } + } else { + // BUF: [Any, Extend] (Extend* ZWJ Emoji)* + self.pending = .{ .extend_end = emoji_offset }; + return .{ + .len = @intCast(grapheme_end - emoji_offset), + .offset = emoji_offset, + }; + } + } + } + + if (state.hasRegional()) { + var ri_count: usize = 0; + while (self.buf[0] != null and + self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) + { + ri_count += 1; + self.advance(); + } + + // Use the fact that all RI have length 4 in utf8 encoding + // since they are in range 0x1f1e6...0x1f1ff + // https://en.wikipedia.org/wiki/UTF-8#Encoding + if (ri_count == 0) { + // There are no pending RI codepoints + } else if (ri_count % 2 == 0) { + self.pending = .{ .ri_count = ri_count }; + return .{ .len = 8, .offset = grapheme_end - 8 }; + } else { + // Add one to count for the unused RI + self.pending = .{ .ri_count = ri_count + 1 }; + return .{ .len = 4, .offset = grapheme_end - 4 }; + } + } + } + + const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; + self.advance(); + return .{ + .len = @intCast(grapheme_end - grapheme_start), + .offset = grapheme_start, + }; + } +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. -- cgit v1.2.3 From 04123c2280088acbe4501bbe4c314ca64ff27dab Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 12:57:04 -0400 Subject: Vastly simplify peek() Idiomatic Zig takes awhile, what can I say (yes I wrote the first one). --- src/Graphemes.zig | 63 +++---------------------------------------------------- 1 file changed, 3 insertions(+), 60 deletions(-) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..1ce1ea6 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -99,7 +99,7 @@ pub const Gbp = enum { /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { - len: u8, + len: u32, offset: u32, /// `bytes` returns the slice of bytes that correspond to @@ -173,69 +173,12 @@ pub const Iterator = struct { const saved_cp_iter = self.cp_iter; const s0 = self.buf[0]; const s1 = self.buf[1]; - - self.advance(); - - // If no more - if (self.buf[0] == null) { - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - return null; - } - // If last one - if (self.buf[1] == null) { - const len = self.buf[0].?.len; - const offset = self.buf[0].?.offset; + defer { self.cp_iter = saved_cp_iter; self.buf[0] = s0; self.buf[1] = s1; - return Grapheme{ .len = len, .offset = offset }; } - // If ASCII - if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { - const len = self.buf[0].?.len; - const offset = self.buf[0].?.offset; - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - return Grapheme{ .len = len, .offset = offset }; - } - - const gc_start = self.buf[0].?.offset; - var gc_len: u8 = self.buf[0].?.len; - var state = State{}; - - if (graphemeBreak( - self.buf[0].?.code, - self.buf[1].?.code, - self.data, - &state, - )) { - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - return Grapheme{ .len = gc_len, .offset = gc_start }; - } - - while (true) { - self.advance(); - if (self.buf[0] == null) break; - - gc_len += self.buf[0].?.len; - - if (graphemeBreak( - self.buf[0].?.code, - if (self.buf[1]) |ncp| ncp.code else 0, - self.data, - &state, - )) break; - } - self.cp_iter = saved_cp_iter; - self.buf[0] = s0; - self.buf[1] = s1; - - return Grapheme{ .len = gc_len, .offset = gc_start }; + return self.next(); } }; -- cgit v1.2.3 From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 15:22:37 -0400 Subject: Begin conformance test I'm not sure the details of this strategy can actually be made to work. But, something can. --- src/Graphemes.zig | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 1ce1ea6..1f67fc6 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -364,3 +364,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { try std.testing.expectEqual(@as(usize, 2), i); } } + +test "Iterator.peek" { + const peek_seq = "aΔ👨🏻‍🌾→"; + const data = try Graphemes.init(std.testing.allocator); + defer data.deinit(std.testing.allocator); + + var iter = data.iterator(peek_seq); + const peek_a = iter.peek().?; + const next_a = iter.next().?; + try std.testing.expectEqual(peek_a, next_a); + try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); + const peek_d1 = iter.peek().?; + const peek_d2 = iter.peek().?; + try std.testing.expectEqual(peek_d1, peek_d2); + const next_d = iter.next().?; + try std.testing.expectEqual(peek_d2, next_d); + try std.testing.expectEqual(iter.peek(), iter.next()); + try std.testing.expectEqual(iter.peek(), iter.next()); + try std.testing.expectEqual(null, iter.peek()); + try std.testing.expectEqual(null, iter.peek()); + try std.testing.expectEqual(iter.peek(), iter.next()); +} -- cgit v1.2.3 From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- src/Graphemes.zig | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 0338c04..49fdbf3 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -5,9 +5,11 @@ const Allocator = mem.Allocator; const compress = std.compress; const unicode = std.unicode; -const CodePoint = @import("code_point").CodePoint; -const CodePointIterator = @import("code_point").Iterator; -const CodePointReverseIterator = @import("code_point").ReverseIterator; +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; s1: []u16 = undefined, s2: []u16 = undefined, @@ -104,8 +106,8 @@ pub const Gbp = enum { /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { - len: u32, - offset: u32, + len: uoffset, + offset: uoffset, /// `bytes` returns the slice of bytes that correspond to /// this grapheme cluster in `src`. @@ -199,7 +201,7 @@ pub const ReverseIterator = struct { /// Count of pending RI codepoints, it is an even number ri_count: usize, /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji - extend_end: u32, + extend_end: uoffset, }; const Self = @This(); @@ -219,7 +221,7 @@ pub const ReverseIterator = struct { pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; - const grapheme_end: u32 = end: { + const grapheme_end: uoffset = end: { const codepoint = self.buf[1].?; switch (self.pending) { @@ -270,7 +272,7 @@ pub const ReverseIterator = struct { if (!state.hasIndic()) { // BUF: [?Any, Extend | Linker] Consonant - var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; indic: while (true) { if (self.buf[0] == null) { @@ -321,7 +323,7 @@ pub const ReverseIterator = struct { if (!state.hasXpic()) { // BUF: [?Any, ZWJ] Emoji - var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; // Look for previous Emoji emoji: while (true) { -- cgit v1.2.3 From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 147 insertions(+), 73 deletions(-) (limited to 'src/Graphemes.zig') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 49fdbf3..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -1,15 +1,7 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const mem = std.mem; -const Allocator = mem.Allocator; -const compress = std.compress; -const unicode = std.unicode; - -const code_point = @import("code_point"); -const CodePoint = code_point.CodePoint; -const CodePointIterator = code_point.Iterator; -const CodePointReverseIterator = code_point.ReverseIterator; -const uoffset = code_point.uoffset; +//! Graphemes Module +//! +//! Code for handling graphemes: fragments of string which should be +//! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 s1: []u16 = undefined, s2: []u16 = undefined, @@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; } +/// Returns an iterator over the graphemes in `string`. pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +/// Returns a reverse iterator over the graphemes in `string`. pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { return ReverseIterator.init(string, graphemes); } @@ -116,6 +110,96 @@ pub const Grapheme = struct { } }; +// NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage +// of being composed of other parts, but the constant factor can _probably_ be improved +// by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit +// to how much cycle-bumming I'm willing to do at any given moment; that limit has been +// reached. Perhaps you, Dear Reader, might pick up the torch? + +/// Returns the `Grapheme` at `string[index]`, which does not have to be a +/// valid start of a codepoint. Asserts the string is not empty. Index must be +/// less than `string.len`. Always returns a `Grapheme`. +pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { + assert(string.len != 0); + if (index == 0 or (index > 0 and + string[index] < 0x80 and + string[index - 1] < 0x80) and + (string[index - 1] != '\r' and string[index] != '\n')) + { + // There's always a grapheme break between two ASCII code points (except CRLF) + var iter = graphemes.iterator(string[index..]); + const next = iter.next().?; + return Grapheme{ + .len = next.len, + .offset = @as(u32, @intCast(index)) + next.offset, + }; + } // Otherwise it gets hairy. + const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; + if (idx == string.len) { + var iter = graphemes.reverseIterator(string); + return iter.prev().?; + } + // We're on a valid codepoint boundary, we go back from here + var r_iter = graphemes.reverseIterAtIndex(string, idx); + if (r_iter.prev()) |g| { + if (g.offset == 0) { + var iter = graphemes.iterator(string); + while (iter.next()) |g2| { + if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; + } + } + } + // We need to toss one, because otherwise we might not be pending when + // we in fact need to be. + _ = r_iter.prev(); + while (r_iter.pending != .none) : (_ = r_iter.prev()) {} + var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); + while (iter.next()) |g| { + if (g.offset <= idx and idx < g.offset + g.len) return g; + } + unreachable; +} + +/// Return a (forward) iterator of `string` after `grapheme`. +pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { + return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); +} + +/// Return a reverse iterator of `string` before `grapheme`. +pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { + // This bit of weirdness is because reverse iterators are "advance last", + // while forward iterators are "advance first". This leaves some room for + // further optimization, if anyone dares. + var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); + _ = r_iter.prev(); + return r_iter; +} + +fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.data = graphemes; + var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + r_iter.buf[1] = rcp_iter.prev(); + r_iter.buf[0] = rcp_iter.prev(); + r_iter.pending = .none; + r_iter.cp_iter = rcp_iter; + return r_iter; +} + +fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { + var iter: Iterator = undefined; + iter.data = graphemes; + iter.buf[0] = first: { + if (idx == string.len) break :first null; + var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; + break :first r_cp_iter.prev(); + }; + var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; + iter.buf[1] = cp_iter.next(); + iter.cp_iter = cp_iter; + return iter; +} + /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, @@ -150,7 +234,7 @@ pub const Iterator = struct { const gc_start = self.buf[0].?.offset; var gc_len: u8 = self.buf[0].?.len; - var state = State{}; + var state = IterState{}; if (graphemeBreak( self.buf[0].?.code, @@ -189,12 +273,13 @@ pub const Iterator = struct { } }; +/// Iterate a string backward by Grapheme. pub const ReverseIterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointReverseIterator, data: *const Graphemes, /// Codepoint read from `cp_iter` but not returned by `previous` - pending: Pending = .{ .none = {} }, + pending: Pending = .none, const Pending = union(enum) { none: void, @@ -218,6 +303,12 @@ pub const ReverseIterator = struct { self.buf[0] = self.cp_iter.prev(); } + pub fn peek(self: *Self) ?Grapheme { + const cache = .{ self.buf, self.cp_iter, self.pending }; + defer self.buf, self.cp_iter, self.pending = cache; + return self.prev(); + } + pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; @@ -255,10 +346,10 @@ pub const ReverseIterator = struct { }; while (self.buf[0] != null) { - var state: State = .{}; - state.setXpic(); - state.unsetRegional(); - state.setIndic(); + var state: IterState = .{}; + state.xpic = true; + state.regional = false; + state.indic = true; if (graphemeBreak( self.buf[0].?.code, @@ -269,7 +360,7 @@ pub const ReverseIterator = struct { self.advance(); - if (!state.hasIndic()) { + if (!state.indic) { // BUF: [?Any, Extend | Linker] Consonant var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -296,11 +387,11 @@ pub const ReverseIterator = struct { self.advance(); if (self.buf[0]) |cp1| { - state.setIndic(); + state.indic = true; if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; - if (!state.hasIndic()) { + if (!state.indic) { continue :indic; } else { break :indic; @@ -321,7 +412,7 @@ pub const ReverseIterator = struct { } } - if (!state.hasXpic()) { + if (!state.xpic) { // BUF: [?Any, ZWJ] Emoji var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; @@ -370,7 +461,7 @@ pub const ReverseIterator = struct { } } - if (state.hasRegional()) { + if (state.regional) { var ri_count: usize = 0; while (self.buf[0] != null and self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) @@ -404,6 +495,13 @@ pub const ReverseIterator = struct { } }; +/// Grapheme Iterator state. +pub const IterState = packed struct(u3) { + xpic: bool = false, + regional: bool = false, + indic: bool = false, +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. @@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } -// Grapheme break state. -pub const State = struct { - bits: u3 = 0, - - // Extended Pictographic (emoji) - fn hasXpic(self: State) bool { - return self.bits & 1 == 1; - } - fn setXpic(self: *State) void { - self.bits |= 1; - } - fn unsetXpic(self: *State) void { - self.bits &= ~@as(u3, 1); - } - - // Regional Indicatior (flags) - fn hasRegional(self: State) bool { - return self.bits & 2 == 2; - } - fn setRegional(self: *State) void { - self.bits |= 2; - } - fn unsetRegional(self: *State) void { - self.bits &= ~@as(u3, 2); - } - - // Indic Conjunct - fn hasIndic(self: State) bool { - return self.bits & 4 == 4; - } - fn setIndic(self: *State) void { - self.bits |= 4; - } - fn unsetIndic(self: *State) void { - self.bits &= ~@as(u3, 4); - } -}; - /// `graphemeBreak` returns true only if a grapheme break point is required /// between `cp1` and `cp2`. `state` should start out as 0. If calling /// iteratively over a sequence of code points, this function must be called @@ -459,7 +519,7 @@ pub fn graphemeBreak( cp1: u21, cp2: u21, data: *const Graphemes, - state: *State, + state: *IterState, ) bool { // Extract relevant properties. const cp1_gbp_prop = data.gbp(cp1); @@ -471,9 +531,9 @@ pub fn graphemeBreak( const cp2_is_emoji = data.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji - if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); + if (!state.xpic and cp1_is_emoji) state.xpic = true; // GB9c: Indic Conjunct Break - if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); + if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; // GB3: CR x LF if (cp1 == '\r' and cp2 == '\n') return false; @@ -482,11 +542,11 @@ pub fn graphemeBreak( if (isBreaker(cp1, data)) return true; // GB11: Emoji Extend* ZWJ x Emoji - if (state.hasXpic() and + if (state.xpic and cp1_gbp_prop == .ZWJ and cp2_is_emoji) { - state.unsetXpic(); + state.xpic = false; return false; } @@ -501,11 +561,11 @@ pub fn graphemeBreak( // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { - if (state.hasRegional()) { - state.unsetRegional(); + if (state.regional) { + state.regional = false; return true; } else { - state.setRegional(); + state.regional = true; return false; } } @@ -530,25 +590,25 @@ pub fn graphemeBreak( } // GB9c: Indic Conjunct Break - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Consonant and (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) { return false; } - if (state.hasIndic() and + if (state.indic and cp1_indic_prop == .Extend and cp2_indic_prop == .Linker) { return false; } - if (state.hasIndic() and + if (state.indic and (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and cp2_indic_prop == .Consonant) { - state.unsetIndic(); + state.indic = false; return false; } @@ -608,3 +668,17 @@ test "Iterator.peek" { try std.testing.expectEqual(null, iter.peek()); try std.testing.expectEqual(iter.peek(), iter.next()); } + +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const mem = std.mem; +const Allocator = mem.Allocator; +const compress = std.compress; +const unicode = std.unicode; + +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; -- cgit v1.2.3