// Origin: commit 6c7da0b526959840240177c0defb680e76fecad6
// Author: Jose Colon Rodriguez
// Date:   Sun, 18 Feb 2024 11:14:43 -0400
// Subject: Testing Ghostty's Utf8Decoder. A bit slower
// File:   src/cp2.zig (new file; recovered from a cgit v1.2.3 patch view)

const std = @import("std");

const Utf8Decoder = @import("Utf8Decoder.zig");

/// `CodePoint` represents a Unicode code point by its code,
/// length, and offset in the source bytes.
pub const CodePoint = struct {
    /// The decoded code point value.
    code: u21,
    /// Number of source bytes this code point spans.
    len: u3,
    /// Index of the code point's first byte within the source bytes.
    offset: u32,
};

/// `Iterator` iterates a string one `CodePoint` at-a-time.
pub const Iterator = struct {
    /// The bytes being iterated. Not copied; the caller keeps them alive.
    bytes: []const u8,
    /// Decoder used for every non-ASCII byte.
    decoder: Utf8Decoder = .{},
    /// Index into `bytes` of the next byte to decode.
    i: u32 = 0,

    /// Returns the code point starting at the current position and advances
    /// past it, or returns `null` once every byte has been visited.
    ///
    /// Bytes below 128 are returned directly without touching the decoder.
    /// If the input ends in the middle of a multi-byte sequence, U+FFFD is
    /// returned for the leftover bytes instead of invoking undefined behaviour.
    pub fn next(self: *Iterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const start = self.i;
        const first = self.bytes[start];

        if (first < 128) {
            // ASCII fast path
            self.i += 1;
            return .{
                .code = first,
                .len = 1,
                .offset = start,
            };
        }

        for (self.bytes[start..], 1..) |b, len| {
            var consumed = false;
            while (!consumed) {
                const res = self.decoder.next(b);
                consumed = res[1];

                if (res[0]) |code| {
                    // NOTE(review): res[1] is read here as "the decoder
                    // consumed this byte" -- confirm against Utf8Decoder.
                    // When a code point is emitted without consuming `b`,
                    // `b` belongs to the following code point and must not
                    // be skipped. Always advance at least one byte so the
                    // iterator cannot stall.
                    const used: usize = if (consumed) len else @max(len - 1, 1);
                    self.i += @intCast(used);

                    return .{
                        .code = code,
                        .len = @intCast(used),
                        .offset = start,
                    };
                }
            }
        }

        // The input ended before the decoder emitted anything: the trailing
        // sequence is truncated. Drop the partial decoder state and report a
        // single replacement character for the leftover bytes. The span is
        // capped to what `CodePoint.len` can represent.
        self.decoder = .{};
        const rest: usize = @min(self.bytes.len - start, std.math.maxInt(u3));
        self.i += @intCast(rest);

        return .{
            .code = 0xFFFD,
            .len = @intCast(rest),
            .offset = start,
        };
    }

    /// Returns what `next` would return, without advancing the iterator.
    /// Both the position and the decoder state are restored afterwards.
    pub fn peek(self: *Iterator) ?CodePoint {
        const saved_i = self.i;
        const saved_decoder = self.decoder;
        defer {
            self.i = saved_i;
            self.decoder = saved_decoder;
        }
        return self.next();
    }
};

test "peek" {
    const src = "Hi";
    var iter = Iterator{ .bytes = src };

    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
}

test "truncated multi-byte sequence yields U+FFFD" {
    // 0xC3 opens a two-byte sequence whose continuation byte is missing.
    var iter = Iterator{ .bytes = "\xC3" };

    const cp = iter.next().?;
    try std.testing.expectEqual(@as(u21, 0xFFFD), cp.code);
    try std.testing.expectEqual(@as(u3, 1), cp.len);
    try std.testing.expectEqual(@as(u32, 0), cp.offset);
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
}