diff options
| author | 2024-02-18 11:21:49 -0400 | |
|---|---|---|
| committer | 2024-02-18 11:21:49 -0400 | |
| commit | 08be45bfeb85bc809a492b9d0147052a028dd8ec (patch) | |
| tree | e91eda437902090e3bde5fafbdfb92f2db369b7c | |
| parent | Testing Ghostty's Utf8Decoder. A bit slower (diff) | |
| download | zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.gz zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.xz zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.zip | |
Back to zg code_point. 4ms faster than Ghostty's Utf8Decoder
| -rw-r--r-- | build.zig | 2 | ||||
| -rw-r--r-- | src/Utf8Decoder.zig | 142 | ||||
| -rw-r--r-- | src/code_point.zig | 68 | ||||
| -rw-r--r-- | src/cp2.zig | 69 |
4 files changed, 40 insertions, 241 deletions
| @@ -28,7 +28,7 @@ pub fn build(b: *std.Build) void { | |||
| 28 | 28 | ||
| 29 | // Modules we provide | 29 | // Modules we provide |
| 30 | const code_point = b.addModule("code_point", .{ | 30 | const code_point = b.addModule("code_point", .{ |
| 31 | .root_source_file = .{ .path = "src/cp2.zig" }, | 31 | .root_source_file = .{ .path = "src/code_point.zig" }, |
| 32 | .target = target, | 32 | .target = target, |
| 33 | .optimize = optimize, | 33 | .optimize = optimize, |
| 34 | }); | 34 | }); |
diff --git a/src/Utf8Decoder.zig b/src/Utf8Decoder.zig deleted file mode 100644 index 6bb0d98..0000000 --- a/src/Utf8Decoder.zig +++ /dev/null | |||
| @@ -1,142 +0,0 @@ | |||
| 1 | //! DFA-based non-allocating error-replacing UTF-8 decoder. | ||
| 2 | //! | ||
| 3 | //! This implementation is based largely on the excellent work of | ||
| 4 | //! Bjoern Hoehrmann, with slight modifications to support error- | ||
| 5 | //! replacement. | ||
| 6 | //! | ||
| 7 | //! For details on Bjoern's DFA-based UTF-8 decoder, see | ||
| 8 | //! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed) | ||
| 9 | const UTF8Decoder = @This(); | ||
| 10 | |||
| 11 | const std = @import("std"); | ||
| 12 | const testing = std.testing; | ||
| 13 | |||
| 14 | const log = std.log.scoped(.utf8decoder); | ||
| 15 | |||
| 16 | // zig fmt: off | ||
| 17 | const char_classes = [_]u4{ | ||
| 18 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
| 19 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
| 20 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
| 21 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
| 22 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | ||
| 23 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | ||
| 24 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | ||
| 25 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | ||
| 26 | }; | ||
| 27 | |||
| 28 | const transitions = [_]u8 { | ||
| 29 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | ||
| 30 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | ||
| 31 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | ||
| 32 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | ||
| 33 | 12,36,12,12,12,12,12,12,12,12,12,12, | ||
| 34 | }; | ||
| 35 | // zig fmt: on | ||
| 36 | |||
| 37 | // DFA states | ||
| 38 | const ACCEPT_STATE = 0; | ||
| 39 | const REJECT_STATE = 12; | ||
| 40 | |||
| 41 | // This is where we accumulate our current codepoint. | ||
| 42 | accumulator: u21 = 0, | ||
| 43 | // The internal state of the DFA. | ||
| 44 | state: u8 = ACCEPT_STATE, | ||
| 45 | |||
| 46 | /// Takes the next byte in the utf-8 sequence and emits a tuple of | ||
| 47 | /// - The codepoint that was generated, if there is one. | ||
| 48 | /// - A boolean that indicates whether the provided byte was consumed. | ||
| 49 | /// | ||
| 50 | /// The only case where the byte is not consumed is if an ill-formed | ||
| 51 | /// sequence is reached, in which case a replacement character will be | ||
| 52 | /// emitted and the byte will not be consumed. | ||
| 53 | /// | ||
| 54 | /// If the byte is not consumed, the caller is responsible for calling | ||
| 55 | /// again with the same byte before continuing. | ||
| 56 | pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } { | ||
| 57 | const char_class = char_classes[byte]; | ||
| 58 | |||
| 59 | const initial_state = self.state; | ||
| 60 | |||
| 61 | if (self.state != ACCEPT_STATE) { | ||
| 62 | self.accumulator <<= 6; | ||
| 63 | self.accumulator |= (byte & 0x3F); | ||
| 64 | } else { | ||
| 65 | self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte); | ||
| 66 | } | ||
| 67 | |||
| 68 | self.state = transitions[self.state + char_class]; | ||
| 69 | |||
| 70 | if (self.state == ACCEPT_STATE) { | ||
| 71 | defer self.accumulator = 0; | ||
| 72 | |||
| 73 | // Emit the fully decoded codepoint. | ||
| 74 | return .{ self.accumulator, true }; | ||
| 75 | } else if (self.state == REJECT_STATE) { | ||
| 76 | self.accumulator = 0; | ||
| 77 | self.state = ACCEPT_STATE; | ||
| 78 | // Emit a replacement character. If we rejected the first byte | ||
| 79 | // in a sequence, then it was consumed, otherwise it was not. | ||
| 80 | return .{ 0xFFFD, initial_state == ACCEPT_STATE }; | ||
| 81 | } else { | ||
| 82 | // Emit nothing, we're in the middle of a sequence. | ||
| 83 | return .{ null, true }; | ||
| 84 | } | ||
| 85 | } | ||
| 86 | |||
| 87 | test "ASCII" { | ||
| 88 | var d: UTF8Decoder = .{}; | ||
| 89 | var out: [13]u8 = undefined; | ||
| 90 | for ("Hello, World!", 0..) |byte, i| { | ||
| 91 | const res = d.next(byte); | ||
| 92 | try testing.expect(res[1]); | ||
| 93 | if (res[0]) |codepoint| { | ||
| 94 | out[i] = @intCast(codepoint); | ||
| 95 | } | ||
| 96 | } | ||
| 97 | |||
| 98 | try testing.expect(std.mem.eql(u8, &out, "Hello, World!")); | ||
| 99 | } | ||
| 100 | |||
| 101 | test "Well formed utf-8" { | ||
| 102 | var d: UTF8Decoder = .{}; | ||
| 103 | var out: [4]u21 = undefined; | ||
| 104 | var i: usize = 0; | ||
| 105 | // 4 bytes, 3 bytes, 2 bytes, 1 byte | ||
| 106 | for ("๐โครA") |byte| { | ||
| 107 | var consumed = false; | ||
| 108 | while (!consumed) { | ||
| 109 | const res = d.next(byte); | ||
| 110 | consumed = res[1]; | ||
| 111 | // There are no errors in this sequence, so | ||
| 112 | // every byte should be consumed first try. | ||
| 113 | try testing.expect(consumed == true); | ||
| 114 | if (res[0]) |codepoint| { | ||
| 115 | out[i] = codepoint; | ||
| 116 | i += 1; | ||
| 117 | } | ||
| 118 | } | ||
| 119 | } | ||
| 120 | |||
| 121 | try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 })); | ||
| 122 | } | ||
| 123 | |||
| 124 | test "Partially invalid utf-8" { | ||
| 125 | var d: UTF8Decoder = .{}; | ||
| 126 | var out: [5]u21 = undefined; | ||
| 127 | var i: usize = 0; | ||
| 128 | // Illegally terminated sequence, valid sequence, illegal surrogate pair. | ||
| 129 | for ("\xF0\x9F๐\xED\xA0\x80") |byte| { | ||
| 130 | var consumed = false; | ||
| 131 | while (!consumed) { | ||
| 132 | const res = d.next(byte); | ||
| 133 | consumed = res[1]; | ||
| 134 | if (res[0]) |codepoint| { | ||
| 135 | out[i] = codepoint; | ||
| 136 | i += 1; | ||
| 137 | } | ||
| 138 | } | ||
| 139 | } | ||
| 140 | |||
| 141 | try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD })); | ||
| 142 | } | ||
diff --git a/src/code_point.zig b/src/code_point.zig index 098e635..ac37562 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -3,29 +3,9 @@ const std = @import("std"); | |||
| 3 | /// `CodePoint` represents a Unicode code point by its code, | 3 | /// `CodePoint` represents a Unicode code point by its code, |
| 4 | /// length, and offset in the source bytes. | 4 | /// length, and offset in the source bytes. |
| 5 | pub const CodePoint = struct { | 5 | pub const CodePoint = struct { |
| 6 | code: u21, | ||
| 6 | len: u3, | 7 | len: u3, |
| 7 | offset: u32, | 8 | offset: u32, |
| 8 | |||
| 9 | pub fn code(self: CodePoint, src: []const u8) u21 { | ||
| 10 | const cp_bytes = src[self.offset..][0..self.len]; | ||
| 11 | |||
| 12 | return switch (self.len) { | ||
| 13 | 1 => cp_bytes[0], | ||
| 14 | |||
| 15 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), | ||
| 16 | |||
| 17 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | | ||
| 18 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 19 | (cp_bytes[2] & 0b00111111), | ||
| 20 | |||
| 21 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | | ||
| 22 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 23 | (cp_bytes[2] & 0b00111111)) << 6) | | ||
| 24 | (cp_bytes[3] & 0b00111111), | ||
| 25 | |||
| 26 | else => @panic("code_point.CodePoint.code: Invalid code point length."), | ||
| 27 | }; | ||
| 28 | } | ||
| 29 | }; | 9 | }; |
| 30 | 10 | ||
| 31 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | 11 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| @@ -39,20 +19,51 @@ pub const Iterator = struct { | |||
| 39 | if (self.bytes[self.i] < 128) { | 19 | if (self.bytes[self.i] < 128) { |
| 40 | // ASCII fast path | 20 | // ASCII fast path |
| 41 | defer self.i += 1; | 21 | defer self.i += 1; |
| 42 | return .{ .len = 1, .offset = self.i }; | 22 | |
| 23 | return .{ | ||
| 24 | .code = self.bytes[self.i], | ||
| 25 | .len = 1, | ||
| 26 | .offset = self.i, | ||
| 27 | }; | ||
| 43 | } | 28 | } |
| 44 | 29 | ||
| 45 | const cp = CodePoint{ | 30 | var cp = CodePoint{ |
| 31 | .code = undefined, | ||
| 46 | .len = switch (self.bytes[self.i]) { | 32 | .len = switch (self.bytes[self.i]) { |
| 47 | 0b1100_0000...0b1101_1111 => 2, | 33 | 0b1100_0000...0b1101_1111 => 2, |
| 48 | 0b1110_0000...0b1110_1111 => 3, | 34 | 0b1110_0000...0b1110_1111 => 3, |
| 49 | 0b1111_0000...0b1111_0111 => 4, | 35 | 0b1111_0000...0b1111_0111 => 4, |
| 50 | else => @panic("code_point.Iterator.next: Invalid start byte."), | 36 | else => { |
| 37 | defer self.i += 1; | ||
| 38 | // Unicode replacement code point. | ||
| 39 | return .{ | ||
| 40 | .code = 0xfffd, | ||
| 41 | .len = 1, | ||
| 42 | .offset = self.i, | ||
| 43 | }; | ||
| 44 | }, | ||
| 51 | }, | 45 | }, |
| 52 | .offset = self.i, | 46 | .offset = self.i, |
| 53 | }; | 47 | }; |
| 54 | 48 | ||
| 49 | const cp_bytes = self.bytes[self.i..][0..cp.len]; | ||
| 55 | self.i += cp.len; | 50 | self.i += cp.len; |
| 51 | |||
| 52 | cp.code = switch (cp.len) { | ||
| 53 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), | ||
| 54 | |||
| 55 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | | ||
| 56 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 57 | (cp_bytes[2] & 0b00111111), | ||
| 58 | |||
| 59 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | | ||
| 60 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 61 | (cp_bytes[2] & 0b00111111)) << 6) | | ||
| 62 | (cp_bytes[3] & 0b00111111), | ||
| 63 | |||
| 64 | else => @panic("CodePointIterator.next invalid code point length."), | ||
| 65 | }; | ||
| 66 | |||
| 56 | return cp; | 67 | return cp; |
| 57 | } | 68 | } |
| 58 | 69 | ||
| @@ -64,12 +75,11 @@ pub const Iterator = struct { | |||
| 64 | }; | 75 | }; |
| 65 | 76 | ||
| 66 | test "peek" { | 77 | test "peek" { |
| 67 | const src = "Hi"; | 78 | var iter = Iterator{ .bytes = "Hi" }; |
| 68 | var iter = Iterator{ .bytes = src }; | ||
| 69 | 79 | ||
| 70 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src)); | 80 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); |
| 71 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src)); | 81 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); |
| 72 | try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src)); | 82 | try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); |
| 73 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); | 83 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); |
| 74 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); | 84 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); |
| 75 | } | 85 | } |
diff --git a/src/cp2.zig b/src/cp2.zig deleted file mode 100644 index ae0f9da..0000000 --- a/src/cp2.zig +++ /dev/null | |||
| @@ -1,69 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | |||
| 3 | const Utf8Decoder = @import("Utf8Decoder.zig"); | ||
| 4 | |||
| 5 | /// `CodePoint` represents a Unicode code point by its code, | ||
| 6 | /// length, and offset in the source bytes. | ||
| 7 | pub const CodePoint = struct { | ||
| 8 | code: u21, | ||
| 9 | len: u3, | ||
| 10 | offset: u32, | ||
| 11 | }; | ||
| 12 | |||
| 13 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | ||
| 14 | pub const Iterator = struct { | ||
| 15 | bytes: []const u8, | ||
| 16 | decoder: Utf8Decoder = .{}, | ||
| 17 | i: u32 = 0, | ||
| 18 | |||
| 19 | pub fn next(self: *Iterator) ?CodePoint { | ||
| 20 | if (self.i >= self.bytes.len) return null; | ||
| 21 | |||
| 22 | if (self.bytes[self.i] < 128) { | ||
| 23 | // ASCII fast path | ||
| 24 | defer self.i += 1; | ||
| 25 | return .{ | ||
| 26 | .code = self.bytes[self.i], | ||
| 27 | .len = 1, | ||
| 28 | .offset = self.i, | ||
| 29 | }; | ||
| 30 | } | ||
| 31 | |||
| 32 | for (self.bytes[self.i..], 1..) |b, len| { | ||
| 33 | var consumed = false; | ||
| 34 | while (!consumed) { | ||
| 35 | const res = self.decoder.next(b); | ||
| 36 | consumed = res[1]; | ||
| 37 | |||
| 38 | if (res[0]) |code| { | ||
| 39 | defer self.i += @intCast(len); | ||
| 40 | |||
| 41 | return .{ | ||
| 42 | .code = code, | ||
| 43 | .len = @intCast(len), | ||
| 44 | .offset = self.i, | ||
| 45 | }; | ||
| 46 | } | ||
| 47 | } | ||
| 48 | } | ||
| 49 | |||
| 50 | unreachable; | ||
| 51 | } | ||
| 52 | |||
| 53 | pub fn peek(self: *Iterator) ?CodePoint { | ||
| 54 | const saved_i = self.i; | ||
| 55 | defer self.i = saved_i; | ||
| 56 | return self.next(); | ||
| 57 | } | ||
| 58 | }; | ||
| 59 | |||
| 60 | test "peek" { | ||
| 61 | const src = "Hi"; | ||
| 62 | var iter = Iterator{ .bytes = src }; | ||
| 63 | |||
| 64 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); | ||
| 65 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); | ||
| 66 | try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); | ||
| 67 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); | ||
| 68 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); | ||
| 69 | } | ||