Back to zg code_point. 4ms faster than Ghostty's Utf8Decoder

author: Jose Colon Rodriguez 2024-02-18 11:21:49 -0400
committer: Jose Colon Rodriguez 2024-02-18 11:21:49 -0400
commit: 08be45bfeb85bc809a492b9d0147052a028dd8ec (patch)
tree: e91eda437902090e3bde5fafbdfb92f2db369b7c
parent: Testing Ghostty's Utf8Decoder. A bit slower (diff)
download: zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.gz
zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.xz
zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.zip
4 files changed, 40 insertions, 241 deletions
diff --git a/build.zig b/build.zig
index 4098948..68e01e3 100644
--- a/build.zig
+++ b/build.zig
@@ -28,7 +28,7 @@ pub fn build(b: *std.Build) void {
    // Modules we provide
    const code_point = b.addModule("code_point", .{
-        .root_source_file = .{ .path = "src/cp2.zig" },
+        .root_source_file = .{ .path = "src/code_point.zig" },
        .target = target,
        .optimize = optimize,
    });
diff --git a/src/Utf8Decoder.zig b/src/Utf8Decoder.zig
deleted file mode 100644
index 6bb0d98..0000000
--- a/src/Utf8Decoder.zig
+++ /dev/null
@@ -1,142 +0,0 @@
-//! DFA-based non-allocating error-replacing UTF-8 decoder.
-//!
-//! This implementation is based largely on the excellent work of
-//! Bjoern Hoehrmann, with slight modifications to support error-
-//! replacement.
-//!
-//! For details on Bjoern's DFA-based UTF-8 decoder, see
-//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed)
-const UTF8Decoder = @This();
-const std = @import("std");
-const testing = std.testing;
-const log = std.log.scoped(.utf8decoder);
-// zig fmt: off
-const char_classes = [_]u4{
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
-};
-const transitions = [_]u8 {
-   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
-  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
-  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
-  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
-  12,36,12,12,12,12,12,12,12,12,12,12,
-};
-// zig fmt: on
-// DFA states
-const ACCEPT_STATE = 0;
-const REJECT_STATE = 12;
-// This is where we accumulate our current codepoint.
-accumulator: u21 = 0,
-// The internal state of the DFA.
-state: u8 = ACCEPT_STATE,
-/// Takes the next byte in the utf-8 sequence and emits a tuple of
-/// - The codepoint that was generated, if there is one.
-/// - A boolean that indicates whether the provided byte was consumed.
-///
-/// The only case where the byte is not consumed is if an ill-formed
-/// sequence is reached, in which case a replacement character will be
-/// emitted and the byte will not be consumed.
-///
-/// If the byte is not consumed, the caller is responsible for calling
-/// again with the same byte before continuing.
-pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } {
-    const char_class = char_classes[byte];
-    const initial_state = self.state;
-    if (self.state != ACCEPT_STATE) {
-        self.accumulator <<= 6;
-        self.accumulator |= (byte & 0x3F);
-    } else {
-        self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
-    }
-    self.state = transitions[self.state + char_class];
-    if (self.state == ACCEPT_STATE) {
-        defer self.accumulator = 0;
-        // Emit the fully decoded codepoint.
-        return .{ self.accumulator, true };
-    } else if (self.state == REJECT_STATE) {
-        self.accumulator = 0;
-        self.state = ACCEPT_STATE;
-        // Emit a replacement character. If we rejected the first byte
-        // in a sequence, then it was consumed, otherwise it was not.
-        return .{ 0xFFFD, initial_state == ACCEPT_STATE };
-    } else {
-        // Emit nothing, we're in the middle of a sequence.
-        return .{ null, true };
-    }
-}
-test "ASCII" {
-    var d: UTF8Decoder = .{};
-    var out: [13]u8 = undefined;
-    for ("Hello, World!", 0..) |byte, i| {
-        const res = d.next(byte);
-        try testing.expect(res[1]);
-        if (res[0]) |codepoint| {
-            out[i] = @intCast(codepoint);
-        }
-    }
-    try testing.expect(std.mem.eql(u8, &out, "Hello, World!"));
-}
-test "Well formed utf-8" {
-    var d: UTF8Decoder = .{};
-    var out: [4]u21 = undefined;
-    var i: usize = 0;
-    // 4 bytes, 3 bytes, 2 bytes, 1 byte
-    for ("😄✤ÁA") |byte| {
-        var consumed = false;
-        while (!consumed) {
-            const res = d.next(byte);
-            consumed = res[1];
-            // There are no errors in this sequence, so
-            // every byte should be consumed first try.
-            try testing.expect(consumed == true);
-            if (res[0]) |codepoint| {
-                out[i] = codepoint;
-                i += 1;
-            }
-        }
-    }
-    try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 }));
-}
-test "Partially invalid utf-8" {
-    var d: UTF8Decoder = .{};
-    var out: [5]u21 = undefined;
-    var i: usize = 0;
-    // Illegally terminated sequence, valid sequence, illegal surrogate pair.
-    for ("\xF0\x9F😄\xED\xA0\x80") |byte| {
-        var consumed = false;
-        while (!consumed) {
-            const res = d.next(byte);
-            consumed = res[1];
-            if (res[0]) |codepoint| {
-                out[i] = codepoint;
-                i += 1;
-            }
-        }
-    }
-    try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD }));
-}
diff --git a/src/code_point.zig b/src/code_point.zig
index 098e635..ac37562 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -3,29 +3,9 @@ const std = @import("std");
 /// `CodePoint` represents a Unicode code point by its code,
 /// length, and offset in the source bytes.
 pub const CodePoint = struct {
+    code: u21,
    len: u3,
    offset: u32,
-    pub fn code(self: CodePoint, src: []const u8) u21 {
-        const cp_bytes = src[self.offset..][0..self.len];
-        return switch (self.len) {
-            1 => cp_bytes[0],
-            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
-            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
-                (cp_bytes[1] & 0b00111111)) << 6) |
-                (cp_bytes[2] & 0b00111111),
-            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
-                (cp_bytes[1] & 0b00111111)) << 6) |
-                (cp_bytes[2] & 0b00111111)) << 6) |
-                (cp_bytes[3] & 0b00111111),
-            else => @panic("code_point.CodePoint.code: Invalid code point length."),
-        };
-    }
 };
 /// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -39,20 +19,51 @@ pub const Iterator = struct {
        if (self.bytes[self.i] < 128) {
            // ASCII fast path
            defer self.i += 1;
-            return .{ .len = 1, .offset = self.i };
+            return .{
+                .code = self.bytes[self.i],
+                .len = 1,
+                .offset = self.i,
+            };
        }
-        const cp = CodePoint{
+        var cp = CodePoint{
+            .code = undefined,
            .len = switch (self.bytes[self.i]) {
                0b1100_0000...0b1101_1111 => 2,
                0b1110_0000...0b1110_1111 => 3,
                0b1111_0000...0b1111_0111 => 4,
-                else => @panic("code_point.Iterator.next: Invalid start byte."),
+                else => {
+                    defer self.i += 1;
+                    // Unicode replacement code point.
+                    return .{
+                        .code = 0xfffd,
+                        .len = 1,
+                        .offset = self.i,
+                    };
+                },
            },
            .offset = self.i,
        };
+        const cp_bytes = self.bytes[self.i..][0..cp.len];
        self.i += cp.len;
+        cp.code = switch (cp.len) {
+            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
+            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
+                (cp_bytes[1] & 0b00111111)) << 6) |
+                (cp_bytes[2] & 0b00111111),
+            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
+                (cp_bytes[1] & 0b00111111)) << 6) |
+                (cp_bytes[2] & 0b00111111)) << 6) |
+                (cp_bytes[3] & 0b00111111),
+            else => @panic("CodePointIterator.next invalid code point length."),
+        };
        return cp;
    }
@@ -64,12 +75,11 @@ pub const Iterator = struct {
 };
 test "peek" {
-    const src = "Hi";
+    var iter = Iterator{ .bytes = "Hi" };
-    var iter = Iterator{ .bytes = src };
-    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src));
+    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
-    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src));
+    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
-    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src));
+    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
 }
diff --git a/src/cp2.zig b/src/cp2.zig
deleted file mode 100644
index ae0f9da..0000000
--- a/src/cp2.zig
+++ /dev/null
@@ -1,69 +0,0 @@
-const std = @import("std");
-const Utf8Decoder = @import("Utf8Decoder.zig");
-/// `CodePoint` represents a Unicode code point by its code,
-/// length, and offset in the source bytes.
-pub const CodePoint = struct {
-    code: u21,
-    len: u3,
-    offset: u32,
-};
-/// `Iterator` iterates a string one `CodePoint` at-a-time.
-pub const Iterator = struct {
-    bytes: []const u8,
-    decoder: Utf8Decoder = .{},
-    i: u32 = 0,
-    pub fn next(self: *Iterator) ?CodePoint {
-        if (self.i >= self.bytes.len) return null;
-        if (self.bytes[self.i] < 128) {
-            // ASCII fast path
-            defer self.i += 1;
-            return .{
-                .code = self.bytes[self.i],
-                .len = 1,
-                .offset = self.i,
-            };
-        }
-        for (self.bytes[self.i..], 1..) |b, len| {
-            var consumed = false;
-            while (!consumed) {
-                const res = self.decoder.next(b);
-                consumed = res[1];
-                if (res[0]) |code| {
-                    defer self.i += @intCast(len);
-                    return .{
-                        .code = code,
-                        .len = @intCast(len),
-                        .offset = self.i,
-                    };
-                }
-            }
-        }
-        unreachable;
-    }
-    pub fn peek(self: *Iterator) ?CodePoint {
-        const saved_i = self.i;
-        defer self.i = saved_i;
-        return self.next();
-    }
-};
-test "peek" {
-    const src = "Hi";
-    var iter = Iterator{ .bytes = src };
-    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
-    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
-    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
-    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
-    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
-}
author	Jose Colon Rodriguez	2024-02-18 11:21:49 -0400
committer	Jose Colon Rodriguez	2024-02-18 11:21:49 -0400
commit	08be45bfeb85bc809a492b9d0147052a028dd8ec (patch)
tree	e91eda437902090e3bde5fafbdfb92f2db369b7c
parent	Testing Ghostty's Utf8Decoder. A bit slower (diff)
download	zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.gz zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.tar.xz zg-08be45bfeb85bc809a492b9d0147052a028dd8ec.zip

diff --git a/build.zig b/build.zig index 4098948..68e01e3 100644 --- a/build.zig +++ b/build.zig
@@ -28,7 +28,7 @@ pub fn build(b: *std.Build) void {
28		28
29	// Modules we provide	29	// Modules we provide
30	const code_point = b.addModule("code_point", .{	30	const code_point = b.addModule("code_point", .{
31	.root_source_file = .{ .path = "src/cp2.zig" },	31	.root_source_file = .{ .path = "src/code_point.zig" },
32	.target = target,	32	.target = target,
33	.optimize = optimize,	33	.optimize = optimize,
34	});	34	});


diff --git a/src/Utf8Decoder.zig b/src/Utf8Decoder.zig deleted file mode 100644 index 6bb0d98..0000000 --- a/src/Utf8Decoder.zig +++ /dev/null
@@ -1,142 +0,0 @@
1	//! DFA-based non-allocating error-replacing UTF-8 decoder.
2	//!
3	//! This implementation is based largely on the excellent work of
4	//! Bjoern Hoehrmann, with slight modifications to support error-
5	//! replacement.
6	//!
7	//! For details on Bjoern's DFA-based UTF-8 decoder, see
8	//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed)
9	const UTF8Decoder = @This();
10
11	const std = @import("std");
12	const testing = std.testing;
13
14	const log = std.log.scoped(.utf8decoder);
15
16	// zig fmt: off
17	const char_classes = [_]u4{
18	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
23	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
24	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
25	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
26	};
27
28	const transitions = [_]u8 {
29	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
30	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
31	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
32	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
33	12,36,12,12,12,12,12,12,12,12,12,12,
34	};
35	// zig fmt: on
36
37	// DFA states
38	const ACCEPT_STATE = 0;
39	const REJECT_STATE = 12;
40
41	// This is where we accumulate our current codepoint.
42	accumulator: u21 = 0,
43	// The internal state of the DFA.
44	state: u8 = ACCEPT_STATE,
45
46	/// Takes the next byte in the utf-8 sequence and emits a tuple of
47	/// - The codepoint that was generated, if there is one.
48	/// - A boolean that indicates whether the provided byte was consumed.
49	///
50	/// The only case where the byte is not consumed is if an ill-formed
51	/// sequence is reached, in which case a replacement character will be
52	/// emitted and the byte will not be consumed.
53	///
54	/// If the byte is not consumed, the caller is responsible for calling
55	/// again with the same byte before continuing.
56	pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } {
57	const char_class = char_classes[byte];
58
59	const initial_state = self.state;
60
61	if (self.state != ACCEPT_STATE) {
62	self.accumulator <<= 6;
63	self.accumulator \|= (byte & 0x3F);
64	} else {
65	self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
66	}
67
68	self.state = transitions[self.state + char_class];
69
70	if (self.state == ACCEPT_STATE) {
71	defer self.accumulator = 0;
72
73	// Emit the fully decoded codepoint.
74	return .{ self.accumulator, true };
75	} else if (self.state == REJECT_STATE) {
76	self.accumulator = 0;
77	self.state = ACCEPT_STATE;
78	// Emit a replacement character. If we rejected the first byte
79	// in a sequence, then it was consumed, otherwise it was not.
80	return .{ 0xFFFD, initial_state == ACCEPT_STATE };
81	} else {
82	// Emit nothing, we're in the middle of a sequence.
83	return .{ null, true };
84	}
85	}
86
87	test "ASCII" {
88	var d: UTF8Decoder = .{};
89	var out: [13]u8 = undefined;
90	for ("Hello, World!", 0..) \|byte, i\| {
91	const res = d.next(byte);
92	try testing.expect(res[1]);
93	if (res[0]) \|codepoint\| {
94	out[i] = @intCast(codepoint);
95	}
96	}
97
98	try testing.expect(std.mem.eql(u8, &out, "Hello, World!"));
99	}
100
101	test "Well formed utf-8" {
102	var d: UTF8Decoder = .{};
103	var out: [4]u21 = undefined;
104	var i: usize = 0;
105	// 4 bytes, 3 bytes, 2 bytes, 1 byte
106	for ("😄✤ÁA") \|byte\| {
107	var consumed = false;
108	while (!consumed) {
109	const res = d.next(byte);
110	consumed = res[1];
111	// There are no errors in this sequence, so
112	// every byte should be consumed first try.
113	try testing.expect(consumed == true);
114	if (res[0]) \|codepoint\| {
115	out[i] = codepoint;
116	i += 1;
117	}
118	}
119	}
120
121	try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 }));
122	}
123
124	test "Partially invalid utf-8" {
125	var d: UTF8Decoder = .{};
126	var out: [5]u21 = undefined;
127	var i: usize = 0;
128	// Illegally terminated sequence, valid sequence, illegal surrogate pair.
129	for ("\xF0\x9F😄\xED\xA0\x80") \|byte\| {
130	var consumed = false;
131	while (!consumed) {
132	const res = d.next(byte);
133	consumed = res[1];
134	if (res[0]) \|codepoint\| {
135	out[i] = codepoint;
136	i += 1;
137	}
138	}
139	}
140
141	try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD }));
142	}


diff --git a/src/code_point.zig b/src/code_point.zig index 098e635..ac37562 100644 --- a/src/code_point.zig +++ b/src/code_point.zig
@@ -3,29 +3,9 @@ const std = @import("std");
3	/// `CodePoint` represents a Unicode code point by its code,	3	/// `CodePoint` represents a Unicode code point by its code,
4	/// length, and offset in the source bytes.	4	/// length, and offset in the source bytes.
5	pub const CodePoint = struct {	5	pub const CodePoint = struct {
		6	code: u21,
6	len: u3,	7	len: u3,
7	offset: u32,	8	offset: u32,
8
9	pub fn code(self: CodePoint, src: []const u8) u21 {
10	const cp_bytes = src[self.offset..][0..self.len];
11
12	return switch (self.len) {
13	1 => cp_bytes[0],
14
15	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),
16
17	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|
18	(cp_bytes[1] & 0b00111111)) << 6) \|
19	(cp_bytes[2] & 0b00111111),
20
21	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|
22	(cp_bytes[1] & 0b00111111)) << 6) \|
23	(cp_bytes[2] & 0b00111111)) << 6) \|
24	(cp_bytes[3] & 0b00111111),
25
26	else => @panic("code_point.CodePoint.code: Invalid code point length."),
27	};
28	}
29	};	9	};
30		10
31	/// `Iterator` iterates a string one `CodePoint` at-a-time.	11	/// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -39,20 +19,51 @@ pub const Iterator = struct {
39	if (self.bytes[self.i] < 128) {	19	if (self.bytes[self.i] < 128) {
40	// ASCII fast path	20	// ASCII fast path
41	defer self.i += 1;	21	defer self.i += 1;
42	return .{ .len = 1, .offset = self.i };	22
		23	return .{
		24	.code = self.bytes[self.i],
		25	.len = 1,
		26	.offset = self.i,
		27	};
43	}	28	}
44		29
45	const cp = CodePoint{	30	var cp = CodePoint{
		31	.code = undefined,
46	.len = switch (self.bytes[self.i]) {	32	.len = switch (self.bytes[self.i]) {
47	0b1100_0000...0b1101_1111 => 2,	33	0b1100_0000...0b1101_1111 => 2,
48	0b1110_0000...0b1110_1111 => 3,	34	0b1110_0000...0b1110_1111 => 3,
49	0b1111_0000...0b1111_0111 => 4,	35	0b1111_0000...0b1111_0111 => 4,
50	else => @panic("code_point.Iterator.next: Invalid start byte."),	36	else => {
		37	defer self.i += 1;
		38	// Unicode replacement code point.
		39	return .{
		40	.code = 0xfffd,
		41	.len = 1,
		42	.offset = self.i,
		43	};
		44	},
51	},	45	},
52	.offset = self.i,	46	.offset = self.i,
53	};	47	};
54		48
		49	const cp_bytes = self.bytes[self.i..][0..cp.len];
55	self.i += cp.len;	50	self.i += cp.len;
		51
		52	cp.code = switch (cp.len) {
		53	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),
		54
		55	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|
		56	(cp_bytes[1] & 0b00111111)) << 6) \|
		57	(cp_bytes[2] & 0b00111111),
		58
		59	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|
		60	(cp_bytes[1] & 0b00111111)) << 6) \|
		61	(cp_bytes[2] & 0b00111111)) << 6) \|
		62	(cp_bytes[3] & 0b00111111),
		63
		64	else => @panic("CodePointIterator.next invalid code point length."),
		65	};
		66
56	return cp;	67	return cp;
57	}	68	}
58		69
@@ -64,12 +75,11 @@ pub const Iterator = struct {
64	};	75	};
65		76
66	test "peek" {	77	test "peek" {
67	const src = "Hi";	78	var iter = Iterator{ .bytes = "Hi" };
68	var iter = Iterator{ .bytes = src };
69		79
70	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src));	80	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
71	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src));	81	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
72	try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src));	82	try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
73	try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());	83	try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
74	try std.testing.expectEqual(@as(?CodePoint, null), iter.next());	84	try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
75	}	85	}


diff --git a/src/cp2.zig b/src/cp2.zig deleted file mode 100644 index ae0f9da..0000000 --- a/src/cp2.zig +++ /dev/null
@@ -1,69 +0,0 @@
1	const std = @import("std");
2
3	const Utf8Decoder = @import("Utf8Decoder.zig");
4
5	/// `CodePoint` represents a Unicode code point by its code,
6	/// length, and offset in the source bytes.
7	pub const CodePoint = struct {
8	code: u21,
9	len: u3,
10	offset: u32,
11	};
12
13	/// `Iterator` iterates a string one `CodePoint` at-a-time.
14	pub const Iterator = struct {
15	bytes: []const u8,
16	decoder: Utf8Decoder = .{},
17	i: u32 = 0,
18
19	pub fn next(self: *Iterator) ?CodePoint {
20	if (self.i >= self.bytes.len) return null;
21
22	if (self.bytes[self.i] < 128) {
23	// ASCII fast path
24	defer self.i += 1;
25	return .{
26	.code = self.bytes[self.i],
27	.len = 1,
28	.offset = self.i,
29	};
30	}
31
32	for (self.bytes[self.i..], 1..) \|b, len\| {
33	var consumed = false;
34	while (!consumed) {
35	const res = self.decoder.next(b);
36	consumed = res[1];
37
38	if (res[0]) \|code\| {
39	defer self.i += @intCast(len);
40
41	return .{
42	.code = code,
43	.len = @intCast(len),
44	.offset = self.i,
45	};
46	}
47	}
48	}
49
50	unreachable;
51	}
52
53	pub fn peek(self: *Iterator) ?CodePoint {
54	const saved_i = self.i;
55	defer self.i = saved_i;
56	return self.next();
57	}
58	};
59
60	test "peek" {
61	const src = "Hi";
62	var iter = Iterator{ .bytes = src };
63
64	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
65	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
66	try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
67	try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
68	try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
69	}