summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-18 11:21:49 -0400
committerGravatar Jose Colon Rodriguez2024-02-18 11:21:49 -0400
commit08be45bfeb85bc809a492b9d0147052a028dd8ec (patch)
treee91eda437902090e3bde5fafbdfb92f2db369b7c
parentTesting Ghostty's Utf8Decoder. A bit slower (diff)
downloadzg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.gz
zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.xz
zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.zip
Back to zg code_point. 4ms faster than Ghostty's Utf8Decoder
-rw-r--r--build.zig2
-rw-r--r--src/Utf8Decoder.zig142
-rw-r--r--src/code_point.zig68
-rw-r--r--src/cp2.zig69
4 files changed, 40 insertions, 241 deletions
diff --git a/build.zig b/build.zig
index 4098948..68e01e3 100644
--- a/build.zig
+++ b/build.zig
@@ -28,7 +28,7 @@ pub fn build(b: *std.Build) void {
28 28
29 // Modules we provide 29 // Modules we provide
30 const code_point = b.addModule("code_point", .{ 30 const code_point = b.addModule("code_point", .{
31 .root_source_file = .{ .path = "src/cp2.zig" }, 31 .root_source_file = .{ .path = "src/code_point.zig" },
32 .target = target, 32 .target = target,
33 .optimize = optimize, 33 .optimize = optimize,
34 }); 34 });
diff --git a/src/Utf8Decoder.zig b/src/Utf8Decoder.zig
deleted file mode 100644
index 6bb0d98..0000000
--- a/src/Utf8Decoder.zig
+++ /dev/null
@@ -1,142 +0,0 @@
1//! DFA-based non-allocating error-replacing UTF-8 decoder.
2//!
3//! This implementation is based largely on the excellent work of
4//! Bjoern Hoehrmann, with slight modifications to support error-
5//! replacement.
6//!
7//! For details on Bjoern's DFA-based UTF-8 decoder, see
8//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed)
9const UTF8Decoder = @This();
10
11const std = @import("std");
12const testing = std.testing;
13
14const log = std.log.scoped(.utf8decoder);
15
16// zig fmt: off
17const char_classes = [_]u4{
18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
23 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
24 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
25 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
26};
27
28const transitions = [_]u8 {
29 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
30 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
31 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
32 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
33 12,36,12,12,12,12,12,12,12,12,12,12,
34};
35// zig fmt: on
36
37// DFA states
38const ACCEPT_STATE = 0;
39const REJECT_STATE = 12;
40
41// This is where we accumulate our current codepoint.
42accumulator: u21 = 0,
43// The internal state of the DFA.
44state: u8 = ACCEPT_STATE,
45
46/// Takes the next byte in the utf-8 sequence and emits a tuple of
47/// - The codepoint that was generated, if there is one.
48/// - A boolean that indicates whether the provided byte was consumed.
49///
50/// The only case where the byte is not consumed is if an ill-formed
51/// sequence is reached, in which case a replacement character will be
52/// emitted and the byte will not be consumed.
53///
54/// If the byte is not consumed, the caller is responsible for calling
55/// again with the same byte before continuing.
56pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } {
57 const char_class = char_classes[byte];
58
59 const initial_state = self.state;
60
61 if (self.state != ACCEPT_STATE) {
62 self.accumulator <<= 6;
63 self.accumulator |= (byte & 0x3F);
64 } else {
65 self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
66 }
67
68 self.state = transitions[self.state + char_class];
69
70 if (self.state == ACCEPT_STATE) {
71 defer self.accumulator = 0;
72
73 // Emit the fully decoded codepoint.
74 return .{ self.accumulator, true };
75 } else if (self.state == REJECT_STATE) {
76 self.accumulator = 0;
77 self.state = ACCEPT_STATE;
78 // Emit a replacement character. If we rejected the first byte
79 // in a sequence, then it was consumed, otherwise it was not.
80 return .{ 0xFFFD, initial_state == ACCEPT_STATE };
81 } else {
82 // Emit nothing, we're in the middle of a sequence.
83 return .{ null, true };
84 }
85}
86
87test "ASCII" {
88 var d: UTF8Decoder = .{};
89 var out: [13]u8 = undefined;
90 for ("Hello, World!", 0..) |byte, i| {
91 const res = d.next(byte);
92 try testing.expect(res[1]);
93 if (res[0]) |codepoint| {
94 out[i] = @intCast(codepoint);
95 }
96 }
97
98 try testing.expect(std.mem.eql(u8, &out, "Hello, World!"));
99}
100
101test "Well formed utf-8" {
102 var d: UTF8Decoder = .{};
103 var out: [4]u21 = undefined;
104 var i: usize = 0;
105 // 4 bytes, 3 bytes, 2 bytes, 1 byte
106 for ("๐Ÿ˜„โœครA") |byte| {
107 var consumed = false;
108 while (!consumed) {
109 const res = d.next(byte);
110 consumed = res[1];
111 // There are no errors in this sequence, so
112 // every byte should be consumed first try.
113 try testing.expect(consumed == true);
114 if (res[0]) |codepoint| {
115 out[i] = codepoint;
116 i += 1;
117 }
118 }
119 }
120
121 try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 }));
122}
123
124test "Partially invalid utf-8" {
125 var d: UTF8Decoder = .{};
126 var out: [5]u21 = undefined;
127 var i: usize = 0;
128 // Illegally terminated sequence, valid sequence, illegal surrogate pair.
129 for ("\xF0\x9F๐Ÿ˜„\xED\xA0\x80") |byte| {
130 var consumed = false;
131 while (!consumed) {
132 const res = d.next(byte);
133 consumed = res[1];
134 if (res[0]) |codepoint| {
135 out[i] = codepoint;
136 i += 1;
137 }
138 }
139 }
140
141 try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD }));
142}
diff --git a/src/code_point.zig b/src/code_point.zig
index 098e635..ac37562 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -3,29 +3,9 @@ const std = @import("std");
3/// `CodePoint` represents a Unicode code point by its code, 3/// `CodePoint` represents a Unicode code point by its code,
4/// length, and offset in the source bytes. 4/// length, and offset in the source bytes.
5pub const CodePoint = struct { 5pub const CodePoint = struct {
6 code: u21,
6 len: u3, 7 len: u3,
7 offset: u32, 8 offset: u32,
8
9 pub fn code(self: CodePoint, src: []const u8) u21 {
10 const cp_bytes = src[self.offset..][0..self.len];
11
12 return switch (self.len) {
13 1 => cp_bytes[0],
14
15 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
16
17 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
18 (cp_bytes[1] & 0b00111111)) << 6) |
19 (cp_bytes[2] & 0b00111111),
20
21 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
22 (cp_bytes[1] & 0b00111111)) << 6) |
23 (cp_bytes[2] & 0b00111111)) << 6) |
24 (cp_bytes[3] & 0b00111111),
25
26 else => @panic("code_point.CodePoint.code: Invalid code point length."),
27 };
28 }
29}; 9};
30 10
31/// `Iterator` iterates a string one `CodePoint` at-a-time. 11/// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -39,20 +19,51 @@ pub const Iterator = struct {
39 if (self.bytes[self.i] < 128) { 19 if (self.bytes[self.i] < 128) {
40 // ASCII fast path 20 // ASCII fast path
41 defer self.i += 1; 21 defer self.i += 1;
42 return .{ .len = 1, .offset = self.i }; 22
23 return .{
24 .code = self.bytes[self.i],
25 .len = 1,
26 .offset = self.i,
27 };
43 } 28 }
44 29
45 const cp = CodePoint{ 30 var cp = CodePoint{
31 .code = undefined,
46 .len = switch (self.bytes[self.i]) { 32 .len = switch (self.bytes[self.i]) {
47 0b1100_0000...0b1101_1111 => 2, 33 0b1100_0000...0b1101_1111 => 2,
48 0b1110_0000...0b1110_1111 => 3, 34 0b1110_0000...0b1110_1111 => 3,
49 0b1111_0000...0b1111_0111 => 4, 35 0b1111_0000...0b1111_0111 => 4,
50 else => @panic("code_point.Iterator.next: Invalid start byte."), 36 else => {
37 defer self.i += 1;
38 // Unicode replacement code point.
39 return .{
40 .code = 0xfffd,
41 .len = 1,
42 .offset = self.i,
43 };
44 },
51 }, 45 },
52 .offset = self.i, 46 .offset = self.i,
53 }; 47 };
54 48
49 const cp_bytes = self.bytes[self.i..][0..cp.len];
55 self.i += cp.len; 50 self.i += cp.len;
51
52 cp.code = switch (cp.len) {
53 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
54
55 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
56 (cp_bytes[1] & 0b00111111)) << 6) |
57 (cp_bytes[2] & 0b00111111),
58
59 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
60 (cp_bytes[1] & 0b00111111)) << 6) |
61 (cp_bytes[2] & 0b00111111)) << 6) |
62 (cp_bytes[3] & 0b00111111),
63
64 else => @panic("CodePointIterator.next invalid code point length."),
65 };
66
56 return cp; 67 return cp;
57 } 68 }
58 69
@@ -64,12 +75,11 @@ pub const Iterator = struct {
64}; 75};
65 76
66test "peek" { 77test "peek" {
67 const src = "Hi"; 78 var iter = Iterator{ .bytes = "Hi" };
68 var iter = Iterator{ .bytes = src };
69 79
70 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src)); 80 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
71 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src)); 81 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
72 try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src)); 82 try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
73 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); 83 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
74 try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); 84 try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
75} 85}
diff --git a/src/cp2.zig b/src/cp2.zig
deleted file mode 100644
index ae0f9da..0000000
--- a/src/cp2.zig
+++ /dev/null
@@ -1,69 +0,0 @@
1const std = @import("std");
2
3const Utf8Decoder = @import("Utf8Decoder.zig");
4
5/// `CodePoint` represents a Unicode code point by its code,
6/// length, and offset in the source bytes.
7pub const CodePoint = struct {
8 code: u21,
9 len: u3,
10 offset: u32,
11};
12
13/// `Iterator` iterates a string one `CodePoint` at-a-time.
14pub const Iterator = struct {
15 bytes: []const u8,
16 decoder: Utf8Decoder = .{},
17 i: u32 = 0,
18
19 pub fn next(self: *Iterator) ?CodePoint {
20 if (self.i >= self.bytes.len) return null;
21
22 if (self.bytes[self.i] < 128) {
23 // ASCII fast path
24 defer self.i += 1;
25 return .{
26 .code = self.bytes[self.i],
27 .len = 1,
28 .offset = self.i,
29 };
30 }
31
32 for (self.bytes[self.i..], 1..) |b, len| {
33 var consumed = false;
34 while (!consumed) {
35 const res = self.decoder.next(b);
36 consumed = res[1];
37
38 if (res[0]) |code| {
39 defer self.i += @intCast(len);
40
41 return .{
42 .code = code,
43 .len = @intCast(len),
44 .offset = self.i,
45 };
46 }
47 }
48 }
49
50 unreachable;
51 }
52
53 pub fn peek(self: *Iterator) ?CodePoint {
54 const saved_i = self.i;
55 defer self.i = saved_i;
56 return self.next();
57 }
58};
59
60test "peek" {
61 const src = "Hi";
62 var iter = Iterator{ .bytes = src };
63
64 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
65 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
66 try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
67 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
68 try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
69}