From 8e24f0ac6a0c9052381001688cf425f2a4017b30 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 9 May 2025 12:58:23 -0400 Subject: Add reverse CodePoint iterator --- src/code_point.zig | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index 13e38bf..ed5fede 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -94,6 +94,45 @@ pub const Iterator = struct { } }; +pub const ReverseIterator = struct { + bytes: []const u8, + i: ?u32, + + pub fn init(str: []const u8) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.bytes = str; + r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); + return r_iter; + } + + pub fn prev(iter: *ReverseIterator) ?CodePoint { + if (iter.i == null) return null; + var i_prev = iter.i.?; + + while (i_prev > 0) : (i_prev -= 1) { + if (!followbyte(iter.bytes[i_prev])) break; + if (i_prev == 0) break; + } + + if (i_prev > 0) + iter.i = i_prev - 1 + else + iter.i = null; + + return decode(iter.bytes[i_prev..], i_prev); + } + + pub fn peek(iter: *ReverseIterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.prev(); + } +}; + +inline fn followbyte(b: u8) bool { + return 0x80 <= b and b <= 0xbf; +} + test "decode" { const bytes = "🌩️"; const res = decode(bytes, 0); @@ -107,12 +146,42 @@ test "decode" { } } -test "peek" { +test Iterator { var iter = Iterator{ .bytes = "Hi" }; - try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); - try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); - try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); + try testing.expectEqual(@as(u21, 'H'), iter.next().?.code); + try testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); + try testing.expectEqual(@as(u21, 'i'), iter.next().?.code); + try testing.expectEqual(@as(?CodePoint, null), iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), iter.next()); + try testing.expectEqual(@as(?CodePoint, null), iter.next()); } + +test ReverseIterator { + { + var r_iter: ReverseIterator = .init("ABC"); + try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } + { + var r_iter: ReverseIterator = .init("∅δq🦾ă"); + try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } +} + +const testing = std.testing; -- cgit v1.2.3 From 94b1f37474c7444d8129445ae7984f922cb9c283 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:14 +0200 Subject: fix: State.unset* did toggle the bit instead of unsetting it --- src/Graphemes.zig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..5780ed4 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -258,7 +258,7 @@ pub const State = struct { self.bits |= 1; } fn unsetXpic(self: *State) void { - self.bits ^= 1; + self.bits &= ~@as(u3, 1); } // Regional Indicatior (flags) @@ -269,7 +269,7 @@ pub const State = struct { self.bits |= 2; } fn unsetRegional(self: *State) void { - self.bits ^= 2; + self.bits &= ~@as(u3, 2); } // Indic Conjunct @@ -280,7 +280,7 @@ pub const State = struct { self.bits |= 4; } fn unsetIndic(self: *State) void { - self.bits ^= 4; + self.bits &= ~@as(u3, 4); } }; -- cgit v1.2.3 From ad4b046c36953b01ae31d109197ee9db5c16d5ea Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 13:38:28 -0400 Subject: Various small iterator improvements --- src/code_point.zig | 55 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/code_point.zig b/src/code_point.zig index ed5fede..5aa12b7 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -42,7 +42,7 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { .offset = offset, }; - // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte + // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. if (cp.len > bytes.len) { // Unicode replacement code point. return .{ @@ -76,21 +76,30 @@ pub const Iterator = struct { bytes: []const u8, i: u32 = 0, - pub fn next(self: *Iterator) ?CodePoint { - if (self.i >= self.bytes.len) return null; + pub fn next(iter: *Iterator) ?CodePoint { + if (iter.i >= iter.bytes.len) return null; - const res = decode(self.bytes[self.i..], self.i); + const res = decode(iter.bytes[iter.i..], iter.i); if (res) |cp| { - self.i += cp.len; + iter.i += cp.len; } return res; } - pub fn peek(self: *Iterator) ?CodePoint { - const saved_i = self.i; - defer self.i = saved_i; - return self.next(); + pub fn peek(iter: *Iterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.next(); + } + + /// Create a backward iterator at this point. It will repeat + /// the last CodePoint seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + if (iter.i == iter.bytes.len) { + return .init(iter.bytes); + } + return .{ .i = iter.i, .bytes = iter.bytes }; } }; @@ -127,6 +136,17 @@ pub const ReverseIterator = struct { defer iter.i = saved_i; return iter.prev(); } + + /// Create a forward iterator at this point. It will repeat the + /// last CodePoint seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + if (iter.i) |i| { + var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; + _ = fwd.next(); + return fwd; + } + return .{ .i = 0, .bytes = iter.bytes }; + } }; inline fn followbyte(b: u8) bool { @@ -182,6 +202,23 @@ test ReverseIterator { try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); } + { + var r_iter: ReverseIterator = .init("123"); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); + var iter = r_iter.forwardIterator(); + try testing.expectEqual(@as(u21, '1'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '3'), iter.next().?.code); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + iter = r_iter.forwardIterator(); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + } } const testing = std.testing; -- cgit v1.2.3 From 890370f5479299940f505e1247c408064f789bd5 Mon Sep 17 00:00:00 2001 From: Matteo Romano Date: Mon, 12 May 2025 12:14:30 +0200 Subject: feat: add reverse grapheme iterator Closes #53 --- src/Graphemes.zig | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/unicode_tests.zig | 74 +++++++++++++++++ 2 files changed, 294 insertions(+) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 5780ed4..3bff18d 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -7,6 +7,7 @@ const unicode = std.unicode; const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; +const CodePointReverseIterator = @import("code_point").ReverseIterator; s1: []u16 = undefined, s2: []u16 = undefined, @@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { return Iterator.init(string, graphemes); } +pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { + return ReverseIterator.init(string, graphemes); +} + /// Indic syllable type. pub const Indic = enum { none, @@ -239,6 +244,221 @@ pub const Iterator = struct { } }; +pub const ReverseIterator = struct { + buf: [2]?CodePoint = .{ null, null }, + cp_iter: CodePointReverseIterator, + data: *const Graphemes, + /// Codepoint read from `cp_iter` but not returned by `previous` + pending: Pending = .{ .none = {} }, + + const Pending = union(enum) { + none: void, + /// Count of pending RI codepoints, it is an even number + ri_count: usize, + /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji + extend_end: u32, + }; + + const Self = @This(); + + pub fn init(str: []const u8, data: *const Graphemes) Self { + var self: Self = .{ .cp_iter = .init(str), .data = data }; + self.advance(); + self.advance(); + return self; + } + + fn advance(self: *Self) void { + self.buf[1] = self.buf[0]; + self.buf[0] = self.cp_iter.prev(); + } + + pub fn prev(self: *Self) ?Grapheme { + if (self.buf[1] == null) return null; + + const grapheme_end: u32 = end: { + const codepoint = self.buf[1].?; + + switch (self.pending) { + // BUF: [?Any, Any] + .none => break :end codepoint.offset + codepoint.len, + .ri_count => |ri_count| { + std.debug.assert(ri_count > 0); + std.debug.assert(ri_count % 2 == 0); + + if (ri_count > 2) { + self.pending.ri_count -= 2; + + // Use the fact that all RI have length 4 in utf8 encoding + // since they are in range 0x1f1e6...0x1f1ff + // https://en.wikipedia.org/wiki/UTF-8#Encoding + return Grapheme{ + .len = 8, + .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), + }; + } else { + self.pending = .{ .none = {} }; + break :end codepoint.offset + codepoint.len + 4; + } + }, + // BUF: [?Any, Extend] Extend* ZWJ + .extend_end => |extend_end| { + self.pending = .{ .none = {} }; + break :end extend_end; + }, + } + }; + + while (self.buf[0] != null) { + var state: State = .{}; + state.setXpic(); + state.unsetRegional(); + state.setIndic(); + + if (graphemeBreak( + self.buf[0].?.code, + self.buf[1].?.code, + self.data, + &state, + )) break; + + self.advance(); + + if (!state.hasIndic()) { + + // BUF: [?Any, Extend | Linker] Consonant + var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + + indic: while (true) { + if (self.buf[0] == null) { + self.pending = .{ .extend_end = indic_offset }; + return .{ + .len = @intCast(grapheme_end - indic_offset), + .offset = indic_offset, + }; + } + + const codepoint = self.buf[0].?; + + switch (self.data.indic(codepoint.code)) { + .Extend, .Linker => { + self.advance(); + continue :indic; + }, + .Consonant => { + // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant + indic_offset = codepoint.offset; + self.advance(); + + if (self.buf[0]) |cp1| { + state.setIndic(); + + if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; + + if (!state.hasIndic()) { + continue :indic; + } else { + break :indic; + } + } else { + break :indic; + } + }, + .none => { + // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant + self.pending = .{ .extend_end = indic_offset }; + return .{ + .len = @intCast(grapheme_end - indic_offset), + .offset = indic_offset, + }; + }, + } + } + } + + if (!state.hasXpic()) { + // BUF: [?Any, ZWJ] Emoji + var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + + // Look for previous Emoji + emoji: while (true) { + if (self.buf[0] == null) { + self.pending = .{ .extend_end = emoji_offset }; + return .{ + .len = @intCast(grapheme_end - emoji_offset), + .offset = emoji_offset, + }; + } + + const codepoint = self.buf[0].?; + + if (self.data.gbp(codepoint.code) == .Extend) { + self.advance(); + continue :emoji; + } + + if (self.data.isEmoji(codepoint.code)) { + // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* + emoji_offset = codepoint.offset; + self.advance(); + + if (self.buf[0] != null and + // ZWJ = 0x200d + self.buf[0].?.code == 0x200d) + { + // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* + // Back at the beginning of the loop, "recursively" look for emoji + self.advance(); + continue :emoji; + } else { + // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* + break :emoji; + } + } else { + // BUF: [Any, Extend] (Extend* ZWJ Emoji)* + self.pending = .{ .extend_end = emoji_offset }; + return .{ + .len = @intCast(grapheme_end - emoji_offset), + .offset = emoji_offset, + }; + } + } + } + + if (state.hasRegional()) { + var ri_count: usize = 0; + while (self.buf[0] != null and + self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) + { + ri_count += 1; + self.advance(); + } + + // Use the fact that all RI have length 4 in utf8 encoding + // since they are in range 0x1f1e6...0x1f1ff + // https://en.wikipedia.org/wiki/UTF-8#Encoding + if (ri_count == 0) { + // There are no pending RI codepoints + } else if (ri_count % 2 == 0) { + self.pending = .{ .ri_count = ri_count }; + return .{ .len = 8, .offset = grapheme_end - 8 }; + } else { + // Add one to count for the unused RI + self.pending = .{ .ri_count = ri_count + 1 }; + return .{ .len = 4, .offset = grapheme_end - 4 }; + } + } + } + + const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; + self.advance(); + return .{ + .len = @intCast(grapheme_end - grapheme_start), + .offset = grapheme_start, + }; + } +}; + // Predicates fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..828559a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -219,3 +219,77 @@ test "Segmentation GraphemeIterator" { } } } + +test "Segmentation ReverseGraphemeIterator" { + const allocator = std.testing.allocator; + var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); + defer file.close(); + var buf_reader = std.io.bufferedReader(file.reader()); + var input_stream = buf_reader.reader(); + + const data = try Graphemes.init(allocator); + defer data.deinit(allocator); + + var buf: [4096]u8 = undefined; + var line_no: usize = 1; + + while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { + // Skip comments or empty lines. + if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; + + // Clean up. + var line = std.mem.trimLeft(u8, raw, "÷ "); + if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { + line = line[0..octo]; + } + // Iterate over fields. + var want = std.ArrayList(Grapheme).init(allocator); + defer want.deinit(); + + var all_bytes = std.ArrayList(u8).init(allocator); + defer all_bytes.deinit(); + + var graphemes = std.mem.splitSequence(u8, line, " ÷ "); + var bytes_index: u32 = 0; + + while (graphemes.next()) |field| { + var code_points = std.mem.splitScalar(u8, field, ' '); + var cp_buf: [4]u8 = undefined; + var cp_index: u32 = 0; + var gc_len: u8 = 0; + + while (code_points.next()) |code_point| { + if (std.mem.eql(u8, code_point, "×")) continue; + const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); + const len = try unicode.utf8Encode(cp, &cp_buf); + try all_bytes.appendSlice(cp_buf[0..len]); + cp_index += len; + gc_len += len; + } + + try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + bytes_index += cp_index; + } + + // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); + var iter = data.reverseIterator(all_bytes.items); + + // Check. + var i: usize = want.items.len; + while (i > 0) { + i -= 1; + const want_gc = want.items[i]; + const got_gc = iter.prev() orelse { + std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc }); + return error.TestExpectedEqual; + }; + std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ) catch |err| { + std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc }); + return err; + }; + } + } +} -- cgit v1.2.3