Replace CodePoint Decoding with Hörhmann Method

This still needs a small barrage of tests to confirm that it correctly performs substitution of maximal subparts (Unicode 16.0.0 §3.9.6). I'm pretty sure this edition is 'overly maximal' actually, the name of the algorithm is somewhat misleading as to what it actually does.
author: Sam Atman 2025-05-15 12:13:09 -0400
committer: Sam Atman 2025-05-15 12:13:09 -0400
commit: f3841123295f4abe1bf61ec844117729e1fd5af4 (patch)
tree: b7dd0c608625efa24f26dbfbed2af091ccfaec14 /src/code_point.zig
parent: Add overlong test, which should fail (diff)
download: zg-f3841123295f4abe1bf61ec844117729e1fd5af4.tar.gz
zg-f3841123295f4abe1bf61ec844117729e1fd5af4.tar.xz
zg-f3841123295f4abe1bf61ec844117729e1fd5af4.zip
1 files changed, 204 insertions, 59 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
index e402554..d589413 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -1,3 +1,9 @@
+//! Unicode Code Point module
+//!
+//! Provides a decoder and iterator over a UTF-8 encoded string.
+//! Represents invalid data according to the Replacement of Maximal
+//! Subparts algorithm.
 /// `CodePoint` represents a Unicode code point by its code,
 /// length, and offset in the source bytes.
 pub const CodePoint = struct {
@@ -6,67 +12,144 @@ pub const CodePoint = struct {
    offset: u32,
 };
-/// given a small slice of a string, decode the corresponding codepoint
+/// This function is deprecated and will be removed in a later release.
+/// Use `decodeAtIndex` or `decodeAtCursor`.
 pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
-    // EOS fast path
+    var off: u32 = 0;
-    if (bytes.len == 0) {
+    var maybe_code = decodeAtCursor(bytes, &off);
-        return null;
+    if (maybe_code) |*code| {
+        code.offset = offset;
+        return code.*;
    }
+    return null;
+}
-    // ASCII fast path
+/// Decode the CodePoint, if any, at `bytes[idx]`.
-    if (bytes[0] < 128) {
+pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint {
-        return .{
+    var off = idx;
-            .code = bytes[0],
+    return decodeAtCursor(bytes, &off);
-            .len = 1,
+}
-            .offset = offset,
-        };
+/// Decode the CodePoint, if any, at `bytes[cursor.*]`.  After, the
-    }
+/// cursor will point at the next potential codepoint index.
+pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
+    // EOS
+    if (cursor.* >= bytes.len) return null;
-    var cp = CodePoint{
+    const this_off = cursor.*;
-        .code = undefined,
+    cursor.* += 1;
-        .len = switch (bytes[0]) {
-            0b1100_0000...0b1101_1111 => 2,
+    // ASCII
-            0b1110_0000...0b1110_1111 => 3,
+    var byte = bytes[this_off];
-            0b1111_0000...0b1111_0111 => 4,
+    if (byte < 0x80) return .{
-            else => {
+        .code = byte,
-                // unicode replacement code point.
+        .offset = this_off,
-                return .{
+        .len = 1,
-                    .code = 0xfffd,
-                    .len = 1,
-                    .offset = offset,
-                };
-            },
-        },
-        .offset = offset,
    };
+    // Multibyte
-    // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte
+    // Second:
-    if (cp.len > bytes.len) {
+    var class: u4 = @intCast(u8dfa[byte]);
-        // Unicode replacement code point.
+    var st: u32 = state_dfa[class];
+    if (st == RUNE_REJECT or cursor.* == bytes.len) {
+        @branchHint(.cold);
+        // First one is never a truncation
        return .{
            .code = 0xfffd,
            .len = 1,
-            .offset = offset,
+            .offset = this_off,
        };
    }
+    var rune: u32 = byte & class_mask[class];
-    const cp_bytes = bytes[0..cp.len];
+    byte = bytes[cursor.*];
-    cp.code = switch (cp.len) {
+    class = @intCast(u8dfa[byte]);
-        2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
+    st = state_dfa[st + class];
+    rune = (byte & 0x3f) | (rune << 6);
-        3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
+    cursor.* += 1;
-            (cp_bytes[1] & 0b00111111)) << 6) |
+    if (st == RUNE_ACCEPT) {
-            (cp_bytes[2] & 0b00111111),
+        return .{
+            .code = @intCast(rune),
-        4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
+            .len = 2,
-            (cp_bytes[1] & 0b00111111)) << 6) |
+            .offset = this_off,
-            (cp_bytes[2] & 0b00111111)) << 6) |
+        };
-            (cp_bytes[3] & 0b00111111),
+    }
+    if (st == RUNE_REJECT or cursor.* == bytes.len) {
-        else => @panic("CodePointIterator.next invalid code point length."),
+        @branchHint(.cold);
+        // Check for valid start at cursor:
+        if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
+            return .{
+                .code = 0xfffd,
+                .len = 2,
+                .offset = this_off,
+            };
+        } else {
+            // Truncation.
+            cursor.* -= 1;
+            return .{
+                .code = 0xfffe,
+                .len = 1,
+                .offset = this_off,
+            };
+        }
+    }
+    // Third
+    byte = bytes[cursor.*];
+    class = @intCast(u8dfa[byte]);
+    st = state_dfa[st + class];
+    rune = (byte & 0x3f) | (rune << 6);
+    cursor.* += 1;
+    if (st == RUNE_ACCEPT) {
+        return .{
+            .code = @intCast(rune),
+            .len = 3,
+            .offset = this_off,
+        };
+    }
+    if (st == RUNE_REJECT or cursor.* == bytes.len) {
+        @branchHint(.cold);
+        if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
+            return .{
+                .code = 0xfffd,
+                .len = 3,
+                .offset = this_off,
+            };
+        } else {
+            cursor.* -= 1;
+            return .{
+                .code = 0xfffd,
+                .len = 2,
+                .offset = this_off,
+            };
+        }
+    }
+    byte = bytes[cursor.*];
+    class = @intCast(u8dfa[byte]);
+    st = state_dfa[st + class];
+    rune = (byte & 0x3f) | (rune << 6);
+    cursor.* += 1;
+    if (st == RUNE_REJECT) {
+        @branchHint(.cold);
+        if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
+            return .{
+                .code = 0xfffd,
+                .len = 4,
+                .offset = this_off,
+            };
+        } else {
+            cursor.* -= 1;
+            return .{
+                .code = 0xfffd,
+                .len = 3,
+                .offset = this_off,
+            };
+        }
+    }
+    assert(st == RUNE_ACCEPT);
+    return .{
+        .code = @intCast(rune),
+        .len = 4,
+        .offset = this_off,
    };
-    return cp;
 }
 /// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -75,14 +158,7 @@ pub const Iterator = struct {
    i: u32 = 0,
    pub fn next(self: *Iterator) ?CodePoint {
-        if (self.i >= self.bytes.len) return null;
+        return decodeAtCursor(self.bytes, &self.i);
-        const res = decode(self.bytes[self.i..], self.i);
-        if (res) |cp| {
-            self.i += cp.len;
-        }
-        return res;
    }
    pub fn peek(self: *Iterator) ?CodePoint {
@@ -92,6 +168,74 @@ pub const Iterator = struct {
    }
 };
+// A fast DFA decoder for UTF-8
+//
+// The algorithm used aims to be optimal, without involving SIMD, this
+// strikes a balance between portability and efficiency.  That is done
+// by using a DFA, represented as a few lookup tables, to track state,
+// encoding valid transitions between bytes, arriving at 0 each time a
+// codepoint is decoded.  In the process it builds up the value of the
+// codepoint in question.
+//
+// The virtue of such an approach is low branching factor, achieved at
+// a modest cost of storing the tables.  An embedded system might want
+// to use a more familiar decision graph based on switches, but modern
+// hosted environments can well afford the space, and may appreciate a
+// speed increase in exchange.
+//
+// Credit for the algorithm goes to Björn Höhrmann, who wrote it up at
+// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ .  The original
+// license may be found in the ./credits folder.
+//
+/// Successful codepoint parse
+const RUNE_ACCEPT = 0;
+/// Error state
+const RUNE_REJECT = 12;
+/// Byte transitions: value to class
+const u8dfa: [256]u8 = .{
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
+    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
+    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
+    0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
+};
+/// State transition: state + class = new state
+const state_dfa: [108]u8 = .{
+    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // 0  (RUNE_ACCEPT)
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 12 (RUNE_REJECT)
+    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // 24
+    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // 32
+    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // 48
+    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // 60
+    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 72
+    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 84
+    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 96
+};
+/// State masks
+const class_mask: [12]u8 = .{
+    0xff,
+    0,
+    0b0011_1111,
+    0b0001_1111,
+    0b0000_1111,
+    0b0000_0111,
+    0b0000_0011,
+    0,
+    0,
+    0,
+    0,
+    0,
+};
 test "decode" {
    const bytes = "🌩️";
    const res = decode(bytes, 0);
@@ -120,8 +264,8 @@ test "overlongs" {
    const bytes = "\xC0\xAF";
    const res = decode(bytes, 0);
    if (res) |cp| {
-        try testing.expectEqual(@as(u21, '/'), cp.code);
+        try testing.expectEqual(0xfffd, cp.code);
-        try testing.expectEqual(2, cp.len);
+        try testing.expectEqual(1, cp.len);
    } else {
        try testing.expect(false);
    }
@@ -129,3 +273,4 @@ test "overlongs" {
 const std = @import("std");
 const testing = std.testing;
+const assert = std.debug.assert;
author	Sam Atman	2025-05-15 12:13:09 -0400
committer	Sam Atman	2025-05-15 12:13:09 -0400
commit	f3841123295f4abe1bf61ec844117729e1fd5af4 (patch)
tree	b7dd0c608625efa24f26dbfbed2af091ccfaec14 /src/code_point.zig
parent	Add overlong test, which should fail (diff)
download	zg-f3841123295f4abe1bf61ec844117729e1fd5af4.tar.gz zg-f3841123295f4abe1bf61ec844117729e1fd5af4.tar.xz zg-f3841123295f4abe1bf61ec844117729e1fd5af4.zip

diff --git a/src/code_point.zig b/src/code_point.zig index e402554..d589413 100644 --- a/src/code_point.zig +++ b/src/code_point.zig
@@ -1,3 +1,9 @@
		1	//! Unicode Code Point module
		2	//!
		3	//! Provides a decoder and iterator over a UTF-8 encoded string.
		4	//! Represents invalid data according to the Replacement of Maximal
		5	//! Subparts algorithm.
		6
1	/// `CodePoint` represents a Unicode code point by its code,	7	/// `CodePoint` represents a Unicode code point by its code,
2	/// length, and offset in the source bytes.	8	/// length, and offset in the source bytes.
3	pub const CodePoint = struct {	9	pub const CodePoint = struct {
@@ -6,67 +12,144 @@ pub const CodePoint = struct {
6	offset: u32,	12	offset: u32,
7	};	13	};
8		14
9	/// given a small slice of a string, decode the corresponding codepoint	15	/// This function is deprecated and will be removed in a later release.
		16	/// Use `decodeAtIndex` or `decodeAtCursor`.
10	pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {	17	pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
11	// EOS fast path	18	var off: u32 = 0;
12	if (bytes.len == 0) {	19	var maybe_code = decodeAtCursor(bytes, &off);
13	return null;	20	if (maybe_code) \|*code\| {
		21	code.offset = offset;
		22	return code.*;
14	}	23	}
		24	return null;
		25	}
15		26
16	// ASCII fast path	27	/// Decode the CodePoint, if any, at `bytes[idx]`.
17	if (bytes[0] < 128) {	28	pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint {
18	return .{	29	var off = idx;
19	.code = bytes[0],	30	return decodeAtCursor(bytes, &off);
20	.len = 1,	31	}
21	.offset = offset,	32
22	};	33	/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the
23	}	34	/// cursor will point at the next potential codepoint index.
		35	pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
		36	// EOS
		37	if (cursor.* >= bytes.len) return null;
24		38
25	var cp = CodePoint{	39	const this_off = cursor.*;
26	.code = undefined,	40	cursor.* += 1;
27	.len = switch (bytes[0]) {	41
28	0b1100_0000...0b1101_1111 => 2,	42	// ASCII
29	0b1110_0000...0b1110_1111 => 3,	43	var byte = bytes[this_off];
30	0b1111_0000...0b1111_0111 => 4,	44	if (byte < 0x80) return .{
31	else => {	45	.code = byte,
32	// unicode replacement code point.	46	.offset = this_off,
33	return .{	47	.len = 1,
34	.code = 0xfffd,
35	.len = 1,
36	.offset = offset,
37	};
38	},
39	},
40	.offset = offset,
41	};	48	};
		49	// Multibyte
42		50
43	// Return replacement if we don' have a complete codepoint remaining. Consumes only one byte	51	// Second:
44	if (cp.len > bytes.len) {	52	var class: u4 = @intCast(u8dfa[byte]);
45	// Unicode replacement code point.	53	var st: u32 = state_dfa[class];
		54	if (st == RUNE_REJECT or cursor.* == bytes.len) {
		55	@branchHint(.cold);
		56	// First one is never a truncation
46	return .{	57	return .{
47	.code = 0xfffd,	58	.code = 0xfffd,
48	.len = 1,	59	.len = 1,
49	.offset = offset,	60	.offset = this_off,
50	};	61	};
51	}	62	}
52		63	var rune: u32 = byte & class_mask[class];
53	const cp_bytes = bytes[0..cp.len];	64	byte = bytes[cursor.*];
54	cp.code = switch (cp.len) {	65	class = @intCast(u8dfa[byte]);
55	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),	66	st = state_dfa[st + class];
56		67	rune = (byte & 0x3f) \| (rune << 6);
57	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|	68	cursor.* += 1;
58	(cp_bytes[1] & 0b00111111)) << 6) \|	69	if (st == RUNE_ACCEPT) {
59	(cp_bytes[2] & 0b00111111),	70	return .{
60		71	.code = @intCast(rune),
61	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|	72	.len = 2,
62	(cp_bytes[1] & 0b00111111)) << 6) \|	73	.offset = this_off,
63	(cp_bytes[2] & 0b00111111)) << 6) \|	74	};
64	(cp_bytes[3] & 0b00111111),	75	}
65		76	if (st == RUNE_REJECT or cursor.* == bytes.len) {
66	else => @panic("CodePointIterator.next invalid code point length."),	77	@branchHint(.cold);
		78	// Check for valid start at cursor:
		79	if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
		80	return .{
		81	.code = 0xfffd,
		82	.len = 2,
		83	.offset = this_off,
		84	};
		85	} else {
		86	// Truncation.
		87	cursor.* -= 1;
		88	return .{
		89	.code = 0xfffe,
		90	.len = 1,
		91	.offset = this_off,
		92	};
		93	}
		94	}
		95	// Third
		96	byte = bytes[cursor.*];
		97	class = @intCast(u8dfa[byte]);
		98	st = state_dfa[st + class];
		99	rune = (byte & 0x3f) \| (rune << 6);
		100	cursor.* += 1;
		101	if (st == RUNE_ACCEPT) {
		102	return .{
		103	.code = @intCast(rune),
		104	.len = 3,
		105	.offset = this_off,
		106	};
		107	}
		108	if (st == RUNE_REJECT or cursor.* == bytes.len) {
		109	@branchHint(.cold);
		110	if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
		111	return .{
		112	.code = 0xfffd,
		113	.len = 3,
		114	.offset = this_off,
		115	};
		116	} else {
		117	cursor.* -= 1;
		118	return .{
		119	.code = 0xfffd,
		120	.len = 2,
		121	.offset = this_off,
		122	};
		123	}
		124	}
		125	byte = bytes[cursor.*];
		126	class = @intCast(u8dfa[byte]);
		127	st = state_dfa[st + class];
		128	rune = (byte & 0x3f) \| (rune << 6);
		129	cursor.* += 1;
		130	if (st == RUNE_REJECT) {
		131	@branchHint(.cold);
		132	if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
		133	return .{
		134	.code = 0xfffd,
		135	.len = 4,
		136	.offset = this_off,
		137	};
		138	} else {
		139	cursor.* -= 1;
		140	return .{
		141	.code = 0xfffd,
		142	.len = 3,
		143	.offset = this_off,
		144	};
		145	}
		146	}
		147	assert(st == RUNE_ACCEPT);
		148	return .{
		149	.code = @intCast(rune),
		150	.len = 4,
		151	.offset = this_off,
67	};	152	};
68
69	return cp;
70	}	153	}
71		154
72	/// `Iterator` iterates a string one `CodePoint` at-a-time.	155	/// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -75,14 +158,7 @@ pub const Iterator = struct {
75	i: u32 = 0,	158	i: u32 = 0,
76		159
77	pub fn next(self: *Iterator) ?CodePoint {	160	pub fn next(self: *Iterator) ?CodePoint {
78	if (self.i >= self.bytes.len) return null;	161	return decodeAtCursor(self.bytes, &self.i);
79
80	const res = decode(self.bytes[self.i..], self.i);
81	if (res) \|cp\| {
82	self.i += cp.len;
83	}
84
85	return res;
86	}	162	}
87		163
88	pub fn peek(self: *Iterator) ?CodePoint {	164	pub fn peek(self: *Iterator) ?CodePoint {
@@ -92,6 +168,74 @@ pub const Iterator = struct {
92	}	168	}
93	};	169	};
94		170
		171	// A fast DFA decoder for UTF-8
		172	//
		173	// The algorithm used aims to be optimal, without involving SIMD, this
		174	// strikes a balance between portability and efficiency. That is done
		175	// by using a DFA, represented as a few lookup tables, to track state,
		176	// encoding valid transitions between bytes, arriving at 0 each time a
		177	// codepoint is decoded. In the process it builds up the value of the
		178	// codepoint in question.
		179	//
		180	// The virtue of such an approach is low branching factor, achieved at
		181	// a modest cost of storing the tables. An embedded system might want
		182	// to use a more familiar decision graph based on switches, but modern
		183	// hosted environments can well afford the space, and may appreciate a
		184	// speed increase in exchange.
		185	//
		186	// Credit for the algorithm goes to Björn Höhrmann, who wrote it up at
		187	// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ . The original
		188	// license may be found in the ./credits folder.
		189	//
		190
		191	/// Successful codepoint parse
		192	const RUNE_ACCEPT = 0;
		193
		194	/// Error state
		195	const RUNE_REJECT = 12;
		196
		197	/// Byte transitions: value to class
		198	const u8dfa: [256]u8 = .{
		199	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
		200	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
		201	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
		202	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
		203	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
		204	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
		205	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
		206	0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
		207	0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
		208	};
		209
		210	/// State transition: state + class = new state
		211	const state_dfa: [108]u8 = .{
		212	0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // 0 (RUNE_ACCEPT)
		213	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 12 (RUNE_REJECT)
		214	12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // 24
		215	12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // 32
		216	12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // 48
		217	12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // 60
		218	12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 72
		219	12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 84
		220	12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 96
		221	};
		222
		223	/// State masks
		224	const class_mask: [12]u8 = .{
		225	0xff,
		226	0,
		227	0b0011_1111,
		228	0b0001_1111,
		229	0b0000_1111,
		230	0b0000_0111,
		231	0b0000_0011,
		232	0,
		233	0,
		234	0,
		235	0,
		236	0,
		237	};
		238
95	test "decode" {	239	test "decode" {
96	const bytes = "🌩️";	240	const bytes = "🌩️";
97	const res = decode(bytes, 0);	241	const res = decode(bytes, 0);
@@ -120,8 +264,8 @@ test "overlongs" {
120	const bytes = "\xC0\xAF";	264	const bytes = "\xC0\xAF";
121	const res = decode(bytes, 0);	265	const res = decode(bytes, 0);
122	if (res) \|cp\| {	266	if (res) \|cp\| {
123	try testing.expectEqual(@as(u21, '/'), cp.code);	267	try testing.expectEqual(0xfffd, cp.code);
124	try testing.expectEqual(2, cp.len);	268	try testing.expectEqual(1, cp.len);
125	} else {	269	} else {
126	try testing.expect(false);	270	try testing.expect(false);
127	}	271	}
@@ -129,3 +273,4 @@ test "overlongs" {
129		273
130	const std = @import("std");	274	const std = @import("std");
131	const testing = std.testing;	275	const testing = std.testing;
		276	const assert = std.debug.assert;