From bf319e504e5476d9b0d2dec3e2f8d81ef6439ce4 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Sat, 2 Nov 2024 10:31:28 -0400
Subject: Add peek() to Grapheme.Iterator

This does the expected thing: returns the next ?Grapheme without
mutation of the iteration state.
---
 src/grapheme.zig      | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/unicode_tests.zig | 26 +++++++++++++++++++
 2 files changed, 95 insertions(+)

(limited to 'src')

diff --git a/src/grapheme.zig b/src/grapheme.zig
index 911c856..7538f5b 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -77,6 +77,75 @@ pub const Iterator = struct {
 
         return Grapheme{ .len = gc_len, .offset = gc_start };
     }
+
+    pub fn peek(self: *Self) ?Grapheme {
+        const saved_cp_iter = self.cp_iter;
+        const s0 = self.buf[0];
+        const s1 = self.buf[1];
+
+        self.advance();
+
+        // If no more
+        if (self.buf[0] == null) {
+            self.cp_iter = saved_cp_iter;
+            self.buf[0] = s0;
+            self.buf[1] = s1;
+            return null;
+        }
+        // If last one
+        if (self.buf[1] == null) {
+            const len = self.buf[0].?.len;
+            const offset = self.buf[0].?.offset;
+            self.cp_iter = saved_cp_iter;
+            self.buf[0] = s0;
+            self.buf[1] = s1;
+            return Grapheme{ .len = len, .offset = offset };
+        }
+        // If ASCII
+        if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
+            const len = self.buf[0].?.len;
+            const offset = self.buf[0].?.offset;
+            self.cp_iter = saved_cp_iter;
+            self.buf[0] = s0;
+            self.buf[1] = s1;
+            return Grapheme{ .len = len, .offset = offset };
+        }
+
+        const gc_start = self.buf[0].?.offset;
+        var gc_len: u8 = self.buf[0].?.len;
+        var state = State{};
+
+        if (graphemeBreak(
+            self.buf[0].?.code,
+            self.buf[1].?.code,
+            self.data,
+            &state,
+        )) {
+            self.cp_iter = saved_cp_iter;
+            self.buf[0] = s0;
+            self.buf[1] = s1;
+            return Grapheme{ .len = gc_len, .offset = gc_start };
+        }
+
+        while (true) {
+            self.advance();
+            if (self.buf[0] == null) break;
+
+            gc_len += self.buf[0].?.len;
+
+            if (graphemeBreak(
+                self.buf[0].?.code,
+                if (self.buf[1]) |ncp| ncp.code else 0,
+                self.data,
+                &state,
+            )) break;
+        }
+        self.cp_iter = saved_cp_iter;
+        self.buf[0] = s0;
+        self.buf[1] = s1;
+
+        return Grapheme{ .len = gc_len, .offset = gc_start };
+    }
 };
 
 // Predicates
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 691ccfb..245c03f 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -7,11 +7,37 @@ const mem = std.mem;
 const testing = std.testing;
 const unicode = std.unicode;
 
+const grapheme = @import("grapheme");
 const Grapheme = @import("grapheme").Grapheme;
 const GraphemeData = @import("grapheme").GraphemeData;
 const GraphemeIterator = @import("grapheme").Iterator;
 const Normalize = @import("Normalize");
 
+comptime {
+    testing.refAllDecls(grapheme);
+}
+test "Iterator.peek" {
+    const peek_seq = "aΔ👨🏻‍🌾→";
+    const data = try GraphemeData.init(std.testing.allocator);
+    defer data.deinit();
+
+    var iter = grapheme.Iterator.init(peek_seq, &data);
+    const peek_a = iter.peek().?;
+    const next_a = iter.next().?;
+    try std.testing.expectEqual(peek_a, next_a);
+    try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
+    const peek_d1 = iter.peek().?;
+    const peek_d2 = iter.peek().?;
+    try std.testing.expectEqual(peek_d1, peek_d2);
+    const next_d = iter.next().?;
+    try std.testing.expectEqual(peek_d2, next_d);
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(null, iter.peek());
+    try std.testing.expectEqual(iter.peek(), iter.next());
+}
+
 test "Unicode normalization tests" {
     var arena = heap.ArenaAllocator.init(testing.allocator);
     defer arena.deinit();
-- 
cgit v1.2.3