From 75dedc6aec86c1a4f43d0d7cd120abf68b5baeb1 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 9 May 2025 12:58:23 -0400 Subject: Add reverse CodePoint iterator --- src/code_point.zig | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index fe7ad6e..a319d36 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -233,6 +233,45 @@ const class_mask: [12]u8 = .{ 0, }; +pub const ReverseIterator = struct { + bytes: []const u8, + i: ?u32, + + pub fn init(str: []const u8) ReverseIterator { + var r_iter: ReverseIterator = undefined; + r_iter.bytes = str; + r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); + return r_iter; + } + + pub fn prev(iter: *ReverseIterator) ?CodePoint { + if (iter.i == null) return null; + var i_prev = iter.i.?; + + while (i_prev > 0) : (i_prev -= 1) { + if (!followbyte(iter.bytes[i_prev])) break; + if (i_prev == 0) break; + } + + if (i_prev > 0) + iter.i = i_prev - 1 + else + iter.i = null; + + return decode(iter.bytes[i_prev..], i_prev); + } + + pub fn peek(iter: *ReverseIterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.prev(); + } +}; + +inline fn followbyte(b: u8) bool { + return 0x80 <= b and b <= 0xbf; +} + test "decode" { const bytes = "๐ŸŒฉ๏ธ"; const res = decode(bytes, 0); @@ -246,7 +285,7 @@ test "decode" { } } -test "peek" { +test Iterator { var iter = Iterator{ .bytes = "Hi" }; try expectEqual(@as(u21, 'H'), iter.next().?.code); @@ -346,6 +385,33 @@ test "truncation" { } } +test ReverseIterator { + { + var r_iter: ReverseIterator = .init("ABC"); + try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } + { + var r_iter: ReverseIterator = .init("โˆ…ฮดq๐Ÿฆพฤƒ"); + try testing.expectEqual(@as(u21, 'ฤƒ'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '๐Ÿฆพ'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'ฮด'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'ฮด'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, 'โˆ…'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'โˆ…'), r_iter.peek().?.code); + try testing.expectEqual(@as(u21, 'โˆ…'), r_iter.prev().?.code); + try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); + } +} + const std = @import("std"); const testing = std.testing; const expect = testing.expect; -- cgit v1.2.3 From 7e4cc4641e966a1aa32fa021a9476af3b9813485 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 13:38:28 -0400 Subject: Various small iterator improvements --- src/code_point.zig | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index a319d36..b3c9aea 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -48,12 +48,22 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { }; // Multibyte +<<<<<<< HEAD // Second: var class: u4 = @intCast(u8dfa[byte]); var st: u32 = state_dfa[class]; if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // First one is never a truncation +||||||| parent of ad4b046 (Various small iterator improvements) + // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte + if (cp.len > bytes.len) { + // Unicode replacement code point. +======= + // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. + if (cp.len > bytes.len) { + // Unicode replacement code point. +>>>>>>> ad4b046 (Various small iterator improvements) return .{ .code = 0xfffd, .len = 1, @@ -158,10 +168,19 @@ pub const Iterator = struct { return decodeAtCursor(self.bytes, &self.i); } - pub fn peek(self: *Iterator) ?CodePoint { - const saved_i = self.i; - defer self.i = saved_i; - return self.next(); + pub fn peek(iter: *Iterator) ?CodePoint { + const saved_i = iter.i; + defer iter.i = saved_i; + return iter.next(); + } + + /// Create a backward iterator at this point. It will repeat + /// the last CodePoint seen. + pub fn reverseIterator(iter: *Iterator) ReverseIterator { + if (iter.i == iter.bytes.len) { + return .init(iter.bytes); + } + return .{ .i = iter.i, .bytes = iter.bytes }; } }; @@ -266,6 +285,17 @@ pub const ReverseIterator = struct { defer iter.i = saved_i; return iter.prev(); } + + /// Create a forward iterator at this point. It will repeat the + /// last CodePoint seen. + pub fn forwardIterator(iter: *ReverseIterator) Iterator { + if (iter.i) |i| { + var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; + _ = fwd.next(); + return fwd; + } + return .{ .i = 0, .bytes = iter.bytes }; + } }; inline fn followbyte(b: u8) bool { @@ -410,6 +440,23 @@ test ReverseIterator { try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); } + { + var r_iter: ReverseIterator = .init("123"); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); + var iter = r_iter.forwardIterator(); + try testing.expectEqual(@as(u21, '1'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '3'), iter.next().?.code); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + iter = r_iter.forwardIterator(); + r_iter = iter.reverseIterator(); + try testing.expectEqual(@as(u21, '2'), iter.next().?.code); + try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); + } } const std = @import("std"); -- cgit v1.2.3 From cf8d8fe5d640511f6c4134fdaa36e930232ca7da Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Mon, 12 May 2025 15:22:37 -0400 Subject: Begin conformance test I'm not sure the details of this strategy can actually be made to work. But, something can. --- src/code_point.zig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index b3c9aea..79ee5cd 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -10,6 +10,11 @@ pub const CodePoint = struct { code: u21, len: u3, offset: u32, + + /// Return the slice of this codepoint, given the original string. + pub fn bytes(cp: CodePoint, str: []const u8) []const u8 { + return str[cp.offset..][0..cp.len]; + } }; /// This function is deprecated and will be removed in a later release. -- cgit v1.2.3 From 7ff729895e72fc841440ec73a44c142779fcae1e Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 16:33:02 -0400 Subject: Reverse Word Iterator Next up I hook it to the tests. --- src/code_point.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index 79ee5cd..a5b10d4 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -12,7 +12,7 @@ pub const CodePoint = struct { offset: u32, /// Return the slice of this codepoint, given the original string. - pub fn bytes(cp: CodePoint, str: []const u8) []const u8 { + pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { return str[cp.offset..][0..cp.len]; } }; -- cgit v1.2.3 From 5cc8c1875a21bfb398e6685b03a29d6ba1cbf74a Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 13 May 2025 17:19:56 -0400 Subject: Hooked up break test, some bugs squashed The handling of ignorables is really different, because they 'adhere' to the future of the iteration, not the past. --- src/code_point.zig | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index a5b10d4..ba0b434 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -53,22 +53,12 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { }; // Multibyte -<<<<<<< HEAD // Second: var class: u4 = @intCast(u8dfa[byte]); var st: u32 = state_dfa[class]; if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // First one is never a truncation -||||||| parent of ad4b046 (Various small iterator improvements) - // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte - if (cp.len > bytes.len) { - // Unicode replacement code point. -======= - // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte. - if (cp.len > bytes.len) { - // Unicode replacement code point. ->>>>>>> ad4b046 (Various small iterator improvements) return .{ .code = 0xfffd, .len = 1, -- cgit v1.2.3 From 2b31e66b68216b876ddaf793f4008e652fcd7686 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 14 May 2025 13:15:14 -0400 Subject: Add format for CodePoint --- src/code_point.zig | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index ba0b434..43b38d2 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -15,6 +15,14 @@ pub const CodePoint = struct { pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { return str[cp.offset..][0..cp.len]; } + + pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + try writer.print("CodePoint '{u}' .{{ ", .{cp.code}); + try writer.print( + ".code = 0x{x}, .offset = {d}, .len = {d} }}", + .{ cp.code, cp.offset, cp.len }, + ); + } }; /// This function is deprecated and will be removed in a later release. @@ -171,7 +179,7 @@ pub const Iterator = struct { /// Create a backward iterator at this point. It will repeat /// the last CodePoint seen. - pub fn reverseIterator(iter: *Iterator) ReverseIterator { + pub fn reverseIterator(iter: *const Iterator) ReverseIterator { if (iter.i == iter.bytes.len) { return .init(iter.bytes); } @@ -283,7 +291,7 @@ pub const ReverseIterator = struct { /// Create a forward iterator at this point. It will repeat the /// last CodePoint seen. - pub fn forwardIterator(iter: *ReverseIterator) Iterator { + pub fn forwardIterator(iter: *const ReverseIterator) Iterator { if (iter.i) |i| { var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; _ = fwd.next(); -- cgit v1.2.3 From 736b4ccce2384c8f96e63d9c49ab4d6aee1d65a5 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 15 May 2025 10:57:33 -0400 Subject: wordAtIndex passes conformance I removed the initAtIndex functions from the public vocabulary, because the last couple of days of sweat and blood prove that it's hard to use correctly. That's probably it for WordBreak, now to fix the overlong bug on v0.14 and get this integrated with the new reverse grapheme iterator. --- src/code_point.zig | 1 - 1 file changed, 1 deletion(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index 43b38d2..9a84080 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -272,7 +272,6 @@ pub const ReverseIterator = struct { while (i_prev > 0) : (i_prev -= 1) { if (!followbyte(iter.bytes[i_prev])) break; - if (i_prev == 0) break; } if (i_prev > 0) -- cgit v1.2.3 From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- src/code_point.zig | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index 9a84080..8bd3d5b 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -4,12 +4,14 @@ //! Represents invalid data according to the Replacement of Maximal //! Subparts algorithm. +pub const uoffset = if (@import("config").fat_offset) u64 else u32; + /// `CodePoint` represents a Unicode code point by its code, /// length, and offset in the source bytes. pub const CodePoint = struct { code: u21, len: u3, - offset: u32, + offset: uoffset, /// Return the slice of this codepoint, given the original string. pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { @@ -27,8 +29,8 @@ pub const CodePoint = struct { /// This function is deprecated and will be removed in a later release. /// Use `decodeAtIndex` or `decodeAtCursor`. -pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { - var off: u32 = 0; +pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { + var off: uoffset = 0; var maybe_code = decodeAtCursor(bytes, &off); if (maybe_code) |*code| { code.offset = offset; @@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { } /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { +pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { var off = idx; return decodeAtCursor(bytes, &off); } /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the /// cursor will point at the next potential codepoint index. -pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { +pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { // EOS if (cursor.* >= bytes.len) return null; @@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { /// `Iterator` iterates a string one `CodePoint` at-a-time. pub const Iterator = struct { bytes: []const u8, - i: u32 = 0, + i: uoffset = 0, pub fn init(bytes: []const u8) Iterator { return .{ .bytes = bytes, .i = 0 }; @@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{ pub const ReverseIterator = struct { bytes: []const u8, - i: ?u32, + i: ?uoffset, pub fn init(str: []const u8) ReverseIterator { var r_iter: ReverseIterator = undefined; -- cgit v1.2.3 From 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 1 Jun 2025 14:08:25 -0400 Subject: Add graphemeAtIndex + iterate before and after That completes the set. I do think it's possible to bum a few more cycles from the implementation, but, I'm not going to. It passes the acceptance suite and that's what it needs to do. --- src/code_point.zig | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index 8bd3d5b..16648af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { return null; } +/// Return the codepoint at `index`, even if `index` is in the middle +/// of that codepoint. +pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { + var idx = index; + while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} + return decodeAtIndex(bytes, idx); +} + /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { - var off = idx; +pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { + var off = index; return decodeAtCursor(bytes, &off); } @@ -329,6 +337,54 @@ test Iterator { try expectEqual(@as(?CodePoint, null), iter.next()); } +const code_point = @This(); + +// Keep this in sync with the README +test "Code point iterator" { + const str = "Hi ๐Ÿ˜Š"; + var iter: code_point.Iterator = .init(str); + var i: usize = 0; + + while (iter.next()) |cp| : (i += 1) { + // The `code` field is the actual code point scalar as a `u21`. + if (i == 0) try expect(cp.code == 'H'); + if (i == 1) try expect(cp.code == 'i'); + if (i == 2) try expect(cp.code == ' '); + + if (i == 3) { + try expect(cp.code == '๐Ÿ˜Š'); + // The `offset` field is the byte offset in the + // source string. + try expect(cp.offset == 3); + try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); + // The `len` field is the length in bytes of the + // code point in the source string. + try expect(cp.len == 4); + // There is also a 'cursor' decode, like so: + { + var cursor = cp.offset; + try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); + // Which advances the cursor variable to the next possible + // offset, in this case, `str.len`. Don't forget to account + // for this possibility! + try expectEqual(cp.offset + cp.len, cursor); + } + // There's also this, for when you aren't sure if you have the + // correct start for a code point: + try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); + } + // Reverse iteration is also an option: + var r_iter: code_point.ReverseIterator = .init(str); + // Both iterators can be peeked: + try expectEqual('๐Ÿ˜Š', r_iter.peek().?.code); + try expectEqual('๐Ÿ˜Š', r_iter.prev().?.code); + // Both kinds of iterators can be reversed: + var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); + // This will always return the last codepoint from + // the prior iterator, _if_ it yielded one: + try expectEqual('๐Ÿ˜Š', fwd_iter.next().?.code); + } +} test "overlongs" { // None of these should equal `/`, all should be byte-for-byte // handled as replacement characters. -- cgit v1.2.3 From e3082e64b3ab8a8aa0777d63be69eb8b6d50a654 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 8 Jul 2025 12:12:20 -0400 Subject: Add Words.zig example to README --- src/code_point.zig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index 16648af..7a638af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -121,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { } if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); + // This, and the branch below, detect truncation, the + // only invalid state handled differently by the Maximal + // Subparts algorithm. if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { cursor.* -= 2; // +1 return .{ -- cgit v1.2.3