summaryrefslogtreecommitdiff
path: root/src/code_point.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/code_point.zig')
-rw-r--r--src/code_point.zig263
1 files changed, 204 insertions, 59 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
index e402554..d589413 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -1,3 +1,9 @@
1//! Unicode Code Point module
2//!
3//! Provides a decoder and iterator over a UTF-8 encoded string.
4//! Represents invalid data according to the Replacement of Maximal
5//! Subparts algorithm.
6
1/// `CodePoint` represents a Unicode code point by its code, 7/// `CodePoint` represents a Unicode code point by its code,
2/// length, and offset in the source bytes. 8/// length, and offset in the source bytes.
3pub const CodePoint = struct { 9pub const CodePoint = struct {
@@ -6,67 +12,144 @@ pub const CodePoint = struct {
6 offset: u32, 12 offset: u32,
7}; 13};
8 14
9/// given a small slice of a string, decode the corresponding codepoint 15/// This function is deprecated and will be removed in a later release.
16/// Use `decodeAtIndex` or `decodeAtCursor`.
10pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { 17pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
11 // EOS fast path 18 var off: u32 = 0;
12 if (bytes.len == 0) { 19 var maybe_code = decodeAtCursor(bytes, &off);
13 return null; 20 if (maybe_code) |*code| {
21 code.offset = offset;
22 return code.*;
14 } 23 }
24 return null;
25}
15 26
16 // ASCII fast path 27/// Decode the CodePoint, if any, at `bytes[idx]`.
17 if (bytes[0] < 128) { 28pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint {
18 return .{ 29 var off = idx;
19 .code = bytes[0], 30 return decodeAtCursor(bytes, &off);
20 .len = 1, 31}
21 .offset = offset, 32
22 }; 33/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the
23 } 34/// cursor will point at the next potential codepoint index.
35pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
36 // EOS
37 if (cursor.* >= bytes.len) return null;
24 38
25 var cp = CodePoint{ 39 const this_off = cursor.*;
26 .code = undefined, 40 cursor.* += 1;
27 .len = switch (bytes[0]) { 41
28 0b1100_0000...0b1101_1111 => 2, 42 // ASCII
29 0b1110_0000...0b1110_1111 => 3, 43 var byte = bytes[this_off];
30 0b1111_0000...0b1111_0111 => 4, 44 if (byte < 0x80) return .{
31 else => { 45 .code = byte,
32 // unicode replacement code point. 46 .offset = this_off,
33 return .{ 47 .len = 1,
34 .code = 0xfffd,
35 .len = 1,
36 .offset = offset,
37 };
38 },
39 },
40 .offset = offset,
41 }; 48 };
49 // Multibyte
42 50
43 // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte 51 // Second:
44 if (cp.len > bytes.len) { 52 var class: u4 = @intCast(u8dfa[byte]);
45 // Unicode replacement code point. 53 var st: u32 = state_dfa[class];
54 if (st == RUNE_REJECT or cursor.* == bytes.len) {
55 @branchHint(.cold);
56 // First one is never a truncation
46 return .{ 57 return .{
47 .code = 0xfffd, 58 .code = 0xfffd,
48 .len = 1, 59 .len = 1,
49 .offset = offset, 60 .offset = this_off,
50 }; 61 };
51 } 62 }
52 63 var rune: u32 = byte & class_mask[class];
53 const cp_bytes = bytes[0..cp.len]; 64 byte = bytes[cursor.*];
54 cp.code = switch (cp.len) { 65 class = @intCast(u8dfa[byte]);
55 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), 66 st = state_dfa[st + class];
56 67 rune = (byte & 0x3f) | (rune << 6);
57 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | 68 cursor.* += 1;
58 (cp_bytes[1] & 0b00111111)) << 6) | 69 if (st == RUNE_ACCEPT) {
59 (cp_bytes[2] & 0b00111111), 70 return .{
60 71 .code = @intCast(rune),
61 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | 72 .len = 2,
62 (cp_bytes[1] & 0b00111111)) << 6) | 73 .offset = this_off,
63 (cp_bytes[2] & 0b00111111)) << 6) | 74 };
64 (cp_bytes[3] & 0b00111111), 75 }
65 76 if (st == RUNE_REJECT or cursor.* == bytes.len) {
66 else => @panic("CodePointIterator.next invalid code point length."), 77 @branchHint(.cold);
78 // Check for valid start at cursor:
79 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
80 return .{
81 .code = 0xfffd,
82 .len = 2,
83 .offset = this_off,
84 };
85 } else {
86 // Truncation.
87 cursor.* -= 1;
88 return .{
89 .code = 0xfffe,
90 .len = 1,
91 .offset = this_off,
92 };
93 }
94 }
95 // Third
96 byte = bytes[cursor.*];
97 class = @intCast(u8dfa[byte]);
98 st = state_dfa[st + class];
99 rune = (byte & 0x3f) | (rune << 6);
100 cursor.* += 1;
101 if (st == RUNE_ACCEPT) {
102 return .{
103 .code = @intCast(rune),
104 .len = 3,
105 .offset = this_off,
106 };
107 }
108 if (st == RUNE_REJECT or cursor.* == bytes.len) {
109 @branchHint(.cold);
110 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
111 return .{
112 .code = 0xfffd,
113 .len = 3,
114 .offset = this_off,
115 };
116 } else {
117 cursor.* -= 1;
118 return .{
119 .code = 0xfffd,
120 .len = 2,
121 .offset = this_off,
122 };
123 }
124 }
125 byte = bytes[cursor.*];
126 class = @intCast(u8dfa[byte]);
127 st = state_dfa[st + class];
128 rune = (byte & 0x3f) | (rune << 6);
129 cursor.* += 1;
130 if (st == RUNE_REJECT) {
131 @branchHint(.cold);
132 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
133 return .{
134 .code = 0xfffd,
135 .len = 4,
136 .offset = this_off,
137 };
138 } else {
139 cursor.* -= 1;
140 return .{
141 .code = 0xfffd,
142 .len = 3,
143 .offset = this_off,
144 };
145 }
146 }
147 assert(st == RUNE_ACCEPT);
148 return .{
149 .code = @intCast(rune),
150 .len = 4,
151 .offset = this_off,
67 }; 152 };
68
69 return cp;
70} 153}
71 154
72/// `Iterator` iterates a string one `CodePoint` at-a-time. 155/// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -75,14 +158,7 @@ pub const Iterator = struct {
75 i: u32 = 0, 158 i: u32 = 0,
76 159
77 pub fn next(self: *Iterator) ?CodePoint { 160 pub fn next(self: *Iterator) ?CodePoint {
78 if (self.i >= self.bytes.len) return null; 161 return decodeAtCursor(self.bytes, &self.i);
79
80 const res = decode(self.bytes[self.i..], self.i);
81 if (res) |cp| {
82 self.i += cp.len;
83 }
84
85 return res;
86 } 162 }
87 163
88 pub fn peek(self: *Iterator) ?CodePoint { 164 pub fn peek(self: *Iterator) ?CodePoint {
@@ -92,6 +168,74 @@ pub const Iterator = struct {
92 } 168 }
93}; 169};
94 170
171// A fast DFA decoder for UTF-8
172//
173// The algorithm used aims to be optimal, without involving SIMD, this
174// strikes a balance between portability and efficiency. That is done
175// by using a DFA, represented as a few lookup tables, to track state,
176// encoding valid transitions between bytes, arriving at 0 each time a
177// codepoint is decoded. In the process it builds up the value of the
178// codepoint in question.
179//
180// The virtue of such an approach is low branching factor, achieved at
181// a modest cost of storing the tables. An embedded system might want
182// to use a more familiar decision graph based on switches, but modern
183// hosted environments can well afford the space, and may appreciate a
184// speed increase in exchange.
185//
186// Credit for the algorithm goes to Björn Höhrmann, who wrote it up at
187// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ . The original
188// license may be found in the ./credits folder.
189//
190
191/// Successful codepoint parse
192const RUNE_ACCEPT = 0;
193
194/// Error state
195const RUNE_REJECT = 12;
196
197/// Byte transitions: value to class
198const u8dfa: [256]u8 = .{
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
203 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
204 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
205 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
206 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
207 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
208};
209
210/// State transition: state + class = new state
211const state_dfa: [108]u8 = .{
212 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // 0 (RUNE_ACCEPT)
213 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 12 (RUNE_REJECT)
214 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // 24
215 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // 32
216 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // 48
217 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // 60
218 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 72
219 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 84
220 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 96
221};
222
223/// State masks
224const class_mask: [12]u8 = .{
225 0xff,
226 0,
227 0b0011_1111,
228 0b0001_1111,
229 0b0000_1111,
230 0b0000_0111,
231 0b0000_0011,
232 0,
233 0,
234 0,
235 0,
236 0,
237};
238
95test "decode" { 239test "decode" {
96 const bytes = "🌩️"; 240 const bytes = "🌩️";
97 const res = decode(bytes, 0); 241 const res = decode(bytes, 0);
@@ -120,8 +264,8 @@ test "overlongs" {
120 const bytes = "\xC0\xAF"; 264 const bytes = "\xC0\xAF";
121 const res = decode(bytes, 0); 265 const res = decode(bytes, 0);
122 if (res) |cp| { 266 if (res) |cp| {
123 try testing.expectEqual(@as(u21, '/'), cp.code); 267 try testing.expectEqual(0xfffd, cp.code);
124 try testing.expectEqual(2, cp.len); 268 try testing.expectEqual(1, cp.len);
125 } else { 269 } else {
126 try testing.expect(false); 270 try testing.expect(false);
127 } 271 }
@@ -129,3 +273,4 @@ test "overlongs" {
129 273
130const std = @import("std"); 274const std = @import("std");
131const testing = std.testing; 275const testing = std.testing;
276const assert = std.debug.assert;