From 08be45bfeb85bc809a492b9d0147052a028dd8ec Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sun, 18 Feb 2024 11:21:49 -0400 Subject: Back to zg code_point. 4ms faster than Ghostty's Utf8Decoder --- src/Utf8Decoder.zig | 142 ---------------------------------------------------- src/code_point.zig | 68 ++++++++++++++----------- src/cp2.zig | 69 ------------------------- 3 files changed, 39 insertions(+), 240 deletions(-) delete mode 100644 src/Utf8Decoder.zig delete mode 100644 src/cp2.zig (limited to 'src') diff --git a/src/Utf8Decoder.zig b/src/Utf8Decoder.zig deleted file mode 100644 index 6bb0d98..0000000 --- a/src/Utf8Decoder.zig +++ /dev/null @@ -1,142 +0,0 @@ -//! DFA-based non-allocating error-replacing UTF-8 decoder. -//! -//! This implementation is based largely on the excellent work of -//! Bjoern Hoehrmann, with slight modifications to support error- -//! replacement. -//! -//! For details on Bjoern's DFA-based UTF-8 decoder, see -//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed) -const UTF8Decoder = @This(); - -const std = @import("std"); -const testing = std.testing; - -const log = std.log.scoped(.utf8decoder); - -// zig fmt: off -const char_classes = [_]u4{ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, -}; - -const transitions = [_]u8 { - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, -}; -// zig fmt: on - -// DFA states -const ACCEPT_STATE = 0; -const REJECT_STATE = 12; - -// This is where we accumulate our current codepoint. -accumulator: u21 = 0, -// The internal state of the DFA. -state: u8 = ACCEPT_STATE, - -/// Takes the next byte in the utf-8 sequence and emits a tuple of -/// - The codepoint that was generated, if there is one. -/// - A boolean that indicates whether the provided byte was consumed. -/// -/// The only case where the byte is not consumed is if an ill-formed -/// sequence is reached, in which case a replacement character will be -/// emitted and the byte will not be consumed. -/// -/// If the byte is not consumed, the caller is responsible for calling -/// again with the same byte before continuing. -pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } { - const char_class = char_classes[byte]; - - const initial_state = self.state; - - if (self.state != ACCEPT_STATE) { - self.accumulator <<= 6; - self.accumulator |= (byte & 0x3F); - } else { - self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte); - } - - self.state = transitions[self.state + char_class]; - - if (self.state == ACCEPT_STATE) { - defer self.accumulator = 0; - - // Emit the fully decoded codepoint. - return .{ self.accumulator, true }; - } else if (self.state == REJECT_STATE) { - self.accumulator = 0; - self.state = ACCEPT_STATE; - // Emit a replacement character. If we rejected the first byte - // in a sequence, then it was consumed, otherwise it was not. - return .{ 0xFFFD, initial_state == ACCEPT_STATE }; - } else { - // Emit nothing, we're in the middle of a sequence. - return .{ null, true }; - } -} - -test "ASCII" { - var d: UTF8Decoder = .{}; - var out: [13]u8 = undefined; - for ("Hello, World!", 0..) |byte, i| { - const res = d.next(byte); - try testing.expect(res[1]); - if (res[0]) |codepoint| { - out[i] = @intCast(codepoint); - } - } - - try testing.expect(std.mem.eql(u8, &out, "Hello, World!")); -} - -test "Well formed utf-8" { - var d: UTF8Decoder = .{}; - var out: [4]u21 = undefined; - var i: usize = 0; - // 4 bytes, 3 bytes, 2 bytes, 1 byte - for ("๐Ÿ˜„โœครA") |byte| { - var consumed = false; - while (!consumed) { - const res = d.next(byte); - consumed = res[1]; - // There are no errors in this sequence, so - // every byte should be consumed first try. - try testing.expect(consumed == true); - if (res[0]) |codepoint| { - out[i] = codepoint; - i += 1; - } - } - } - - try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 })); -} - -test "Partially invalid utf-8" { - var d: UTF8Decoder = .{}; - var out: [5]u21 = undefined; - var i: usize = 0; - // Illegally terminated sequence, valid sequence, illegal surrogate pair. - for ("\xF0\x9F๐Ÿ˜„\xED\xA0\x80") |byte| { - var consumed = false; - while (!consumed) { - const res = d.next(byte); - consumed = res[1]; - if (res[0]) |codepoint| { - out[i] = codepoint; - i += 1; - } - } - } - - try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD })); -} diff --git a/src/code_point.zig b/src/code_point.zig index 098e635..ac37562 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -3,29 +3,9 @@ const std = @import("std"); /// `CodePoint` represents a Unicode code point by its code, /// length, and offset in the source bytes. pub const CodePoint = struct { + code: u21, len: u3, offset: u32, - - pub fn code(self: CodePoint, src: []const u8) u21 { - const cp_bytes = src[self.offset..][0..self.len]; - - return switch (self.len) { - 1 => cp_bytes[0], - - 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), - - 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | - (cp_bytes[1] & 0b00111111)) << 6) | - (cp_bytes[2] & 0b00111111), - - 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | - (cp_bytes[1] & 0b00111111)) << 6) | - (cp_bytes[2] & 0b00111111)) << 6) | - (cp_bytes[3] & 0b00111111), - - else => @panic("code_point.CodePoint.code: Invalid code point length."), - }; - } }; /// `Iterator` iterates a string one `CodePoint` at-a-time. @@ -39,20 +19,51 @@ pub const Iterator = struct { if (self.bytes[self.i] < 128) { // ASCII fast path defer self.i += 1; - return .{ .len = 1, .offset = self.i }; + + return .{ + .code = self.bytes[self.i], + .len = 1, + .offset = self.i, + }; } - const cp = CodePoint{ + var cp = CodePoint{ + .code = undefined, .len = switch (self.bytes[self.i]) { 0b1100_0000...0b1101_1111 => 2, 0b1110_0000...0b1110_1111 => 3, 0b1111_0000...0b1111_0111 => 4, - else => @panic("code_point.Iterator.next: Invalid start byte."), + else => { + defer self.i += 1; + // Unicode replacement code point. + return .{ + .code = 0xfffd, + .len = 1, + .offset = self.i, + }; + }, }, .offset = self.i, }; + const cp_bytes = self.bytes[self.i..][0..cp.len]; self.i += cp.len; + + cp.code = switch (cp.len) { + 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), + + 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | + (cp_bytes[1] & 0b00111111)) << 6) | + (cp_bytes[2] & 0b00111111), + + 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | + (cp_bytes[1] & 0b00111111)) << 6) | + (cp_bytes[2] & 0b00111111)) << 6) | + (cp_bytes[3] & 0b00111111), + + else => @panic("CodePointIterator.next invalid code point length."), + }; + return cp; } @@ -64,12 +75,11 @@ pub const Iterator = struct { }; test "peek" { - const src = "Hi"; - var iter = Iterator{ .bytes = src }; + var iter = Iterator{ .bytes = "Hi" }; - try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src)); - try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src)); - try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src)); + try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); + try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); + try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); } diff --git a/src/cp2.zig b/src/cp2.zig deleted file mode 100644 index ae0f9da..0000000 --- a/src/cp2.zig +++ /dev/null @@ -1,69 +0,0 @@ -const std = @import("std"); - -const Utf8Decoder = @import("Utf8Decoder.zig"); - -/// `CodePoint` represents a Unicode code point by its code, -/// length, and offset in the source bytes. -pub const CodePoint = struct { - code: u21, - len: u3, - offset: u32, -}; - -/// `Iterator` iterates a string one `CodePoint` at-a-time. -pub const Iterator = struct { - bytes: []const u8, - decoder: Utf8Decoder = .{}, - i: u32 = 0, - - pub fn next(self: *Iterator) ?CodePoint { - if (self.i >= self.bytes.len) return null; - - if (self.bytes[self.i] < 128) { - // ASCII fast path - defer self.i += 1; - return .{ - .code = self.bytes[self.i], - .len = 1, - .offset = self.i, - }; - } - - for (self.bytes[self.i..], 1..) |b, len| { - var consumed = false; - while (!consumed) { - const res = self.decoder.next(b); - consumed = res[1]; - - if (res[0]) |code| { - defer self.i += @intCast(len); - - return .{ - .code = code, - .len = @intCast(len), - .offset = self.i, - }; - } - } - } - - unreachable; - } - - pub fn peek(self: *Iterator) ?CodePoint { - const saved_i = self.i; - defer self.i = saved_i; - return self.next(); - } -}; - -test "peek" { - const src = "Hi"; - var iter = Iterator{ .bytes = src }; - - try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); - try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); - try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); -} -- cgit v1.2.3