From 6c7da0b526959840240177c0defb680e76fecad6 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sun, 18 Feb 2024 11:14:43 -0400 Subject: Testing Ghostty's Utf8Decoder. A bit slower --- src/Grapheme.zig | 16 +----- src/Utf8Decoder.zig | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/cp2.zig | 69 ++++++++++++++++++++++++ src/display_width.zig | 7 ++- 4 files changed, 216 insertions(+), 18 deletions(-) create mode 100644 src/Utf8Decoder.zig create mode 100644 src/cp2.zig (limited to 'src') diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 6981753..f013aba 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -1,6 +1,7 @@ const std = @import("std"); const unicode = std.unicode; +const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; const gbp = @import("gbp"); @@ -16,13 +17,6 @@ pub const Grapheme = struct { } }; -// We need the code as a u21. -const CodePoint = struct { - code: u21, - len: u3, - offset: u32, -}; - /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, @@ -39,13 +33,7 @@ pub const Iterator = struct { fn advance(self: *Self) void { self.buf[0] = self.buf[1]; - - const maybe_cp = self.cp_iter.next(); - self.buf[1] = if (maybe_cp) |cp| .{ - .code = cp.code(self.cp_iter.bytes), - .len = cp.len, - .offset = cp.offset, - } else null; + self.buf[1] = self.cp_iter.next(); } pub fn next(self: *Self) ?Grapheme { diff --git a/src/Utf8Decoder.zig b/src/Utf8Decoder.zig new file mode 100644 index 0000000..6bb0d98 --- /dev/null +++ b/src/Utf8Decoder.zig @@ -0,0 +1,142 @@ +//! DFA-based non-allocating error-replacing UTF-8 decoder. +//! +//! This implementation is based largely on the excellent work of +//! Bjoern Hoehrmann, with slight modifications to support error- +//! replacement. +//! +//! For details on Bjoern's DFA-based UTF-8 decoder, see +//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed) +const UTF8Decoder = @This(); + +const std = @import("std"); +const testing = std.testing; + +const log = std.log.scoped(.utf8decoder); + +// zig fmt: off +const char_classes = [_]u4{ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, +}; + +const transitions = [_]u8 { + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; +// zig fmt: on + +// DFA states +const ACCEPT_STATE = 0; +const REJECT_STATE = 12; + +// This is where we accumulate our current codepoint. +accumulator: u21 = 0, +// The internal state of the DFA. +state: u8 = ACCEPT_STATE, + +/// Takes the next byte in the utf-8 sequence and emits a tuple of +/// - The codepoint that was generated, if there is one. +/// - A boolean that indicates whether the provided byte was consumed. +/// +/// The only case where the byte is not consumed is if an ill-formed +/// sequence is reached, in which case a replacement character will be +/// emitted and the byte will not be consumed. +/// +/// If the byte is not consumed, the caller is responsible for calling +/// again with the same byte before continuing. +pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } { + const char_class = char_classes[byte]; + + const initial_state = self.state; + + if (self.state != ACCEPT_STATE) { + self.accumulator <<= 6; + self.accumulator |= (byte & 0x3F); + } else { + self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte); + } + + self.state = transitions[self.state + char_class]; + + if (self.state == ACCEPT_STATE) { + defer self.accumulator = 0; + + // Emit the fully decoded codepoint. + return .{ self.accumulator, true }; + } else if (self.state == REJECT_STATE) { + self.accumulator = 0; + self.state = ACCEPT_STATE; + // Emit a replacement character. If we rejected the first byte + // in a sequence, then it was consumed, otherwise it was not. + return .{ 0xFFFD, initial_state == ACCEPT_STATE }; + } else { + // Emit nothing, we're in the middle of a sequence. + return .{ null, true }; + } +} + +test "ASCII" { + var d: UTF8Decoder = .{}; + var out: [13]u8 = undefined; + for ("Hello, World!", 0..) |byte, i| { + const res = d.next(byte); + try testing.expect(res[1]); + if (res[0]) |codepoint| { + out[i] = @intCast(codepoint); + } + } + + try testing.expect(std.mem.eql(u8, &out, "Hello, World!")); +} + +test "Well formed utf-8" { + var d: UTF8Decoder = .{}; + var out: [4]u21 = undefined; + var i: usize = 0; + // 4 bytes, 3 bytes, 2 bytes, 1 byte + for ("๐Ÿ˜„โœครA") |byte| { + var consumed = false; + while (!consumed) { + const res = d.next(byte); + consumed = res[1]; + // There are no errors in this sequence, so + // every byte should be consumed first try. + try testing.expect(consumed == true); + if (res[0]) |codepoint| { + out[i] = codepoint; + i += 1; + } + } + } + + try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 })); +} + +test "Partially invalid utf-8" { + var d: UTF8Decoder = .{}; + var out: [5]u21 = undefined; + var i: usize = 0; + // Illegally terminated sequence, valid sequence, illegal surrogate pair. + for ("\xF0\x9F๐Ÿ˜„\xED\xA0\x80") |byte| { + var consumed = false; + while (!consumed) { + const res = d.next(byte); + consumed = res[1]; + if (res[0]) |codepoint| { + out[i] = codepoint; + i += 1; + } + } + } + + try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD })); +} diff --git a/src/cp2.zig b/src/cp2.zig new file mode 100644 index 0000000..ae0f9da --- /dev/null +++ b/src/cp2.zig @@ -0,0 +1,69 @@ +const std = @import("std"); + +const Utf8Decoder = @import("Utf8Decoder.zig"); + +/// `CodePoint` represents a Unicode code point by its code, +/// length, and offset in the source bytes. +pub const CodePoint = struct { + code: u21, + len: u3, + offset: u32, +}; + +/// `Iterator` iterates a string one `CodePoint` at-a-time. +pub const Iterator = struct { + bytes: []const u8, + decoder: Utf8Decoder = .{}, + i: u32 = 0, + + pub fn next(self: *Iterator) ?CodePoint { + if (self.i >= self.bytes.len) return null; + + if (self.bytes[self.i] < 128) { + // ASCII fast path + defer self.i += 1; + return .{ + .code = self.bytes[self.i], + .len = 1, + .offset = self.i, + }; + } + + for (self.bytes[self.i..], 1..) |b, len| { + var consumed = false; + while (!consumed) { + const res = self.decoder.next(b); + consumed = res[1]; + + if (res[0]) |code| { + defer self.i += @intCast(len); + + return .{ + .code = code, + .len = @intCast(len), + .offset = self.i, + }; + } + } + } + + unreachable; + } + + pub fn peek(self: *Iterator) ?CodePoint { + const saved_i = self.i; + defer self.i = saved_i; + return self.next(); + } +}; + +test "peek" { + const src = "Hi"; + var iter = Iterator{ .bytes = src }; + + try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); + try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); + try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); + try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); + try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); +} diff --git a/src/display_width.zig b/src/display_width.zig index 7f39566..71483ca 100644 --- a/src/display_width.zig +++ b/src/display_width.zig @@ -52,18 +52,17 @@ pub fn strWidth(str: []const u8) usize { var giter = GraphemeIterator.init(str); while (giter.next()) |gc| { - const gc_bytes = gc.bytes(str); - var cp_iter = CodePointIterator{ .bytes = gc_bytes }; + var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; var gc_total: isize = 0; while (cp_iter.next()) |cp| { - var w = codePointWidth(cp.code(gc_bytes)); + var w = codePointWidth(cp.code); if (w != 0) { // Handle text emoji sequence. if (cp_iter.next()) |ncp| { // emoji text sequence. - if (ncp.code(gc_bytes) == 0xFE0E) w = 1; + if (ncp.code == 0xFE0E) w = 1; } // Only adding width of first non-zero-width code point. -- cgit v1.2.3