summaryrefslogtreecommitdiff
path: root/src/code_point.zig
diff options
context:
space:
mode:
author: Sam Atman, 2025-05-15 14:06:01 -0400
committer: Sam Atman, 2025-05-15 14:06:29 -0400
commit: 655f324ebf118bd4535e4c6104a89e70c2fac676 (patch)
tree: 7de19b3e9fc6efeb8289b36758f7c4ebe47e810d /src/code_point.zig
parent: Replace CodePoint Decoding with Höhrmann Method (diff)
download: zg-655f324ebf118bd4535e4c6104a89e70c2fac676.tar.gz
zg-655f324ebf118bd4535e4c6104a89e70c2fac676.tar.xz
zg-655f324ebf118bd4535e4c6104a89e70c2fac676.zip
Maximal Subparts tests
The decoder now properly returns substitution bytes according to Substitution of Maximal Subparts, with tests to prove it.
Diffstat (limited to 'src/code_point.zig')
-rw-r--r-- src/code_point.zig 151
1 files changed, 114 insertions, 37 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
index d589413..fe7ad6e 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -37,7 +37,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
37 if (cursor.* >= bytes.len) return null; 37 if (cursor.* >= bytes.len) return null;
38 38
39 const this_off = cursor.*; 39 const this_off = cursor.*;
40 cursor.* += 1; 40 cursor.* += 1; // +1
41 41
42 // ASCII 42 // ASCII
43 var byte = bytes[this_off]; 43 var byte = bytes[this_off];
@@ -65,7 +65,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
65 class = @intCast(u8dfa[byte]); 65 class = @intCast(u8dfa[byte]);
66 st = state_dfa[st + class]; 66 st = state_dfa[st + class];
67 rune = (byte & 0x3f) | (rune << 6); 67 rune = (byte & 0x3f) | (rune << 6);
68 cursor.* += 1; 68 cursor.* += 1; // +2
69 if (st == RUNE_ACCEPT) { 69 if (st == RUNE_ACCEPT) {
70 return .{ 70 return .{
71 .code = @intCast(rune), 71 .code = @intCast(rune),
@@ -75,29 +75,20 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
75 } 75 }
76 if (st == RUNE_REJECT or cursor.* == bytes.len) { 76 if (st == RUNE_REJECT or cursor.* == bytes.len) {
77 @branchHint(.cold); 77 @branchHint(.cold);
78 // Check for valid start at cursor: 78 // Truncation and other bad bytes the same here:
79 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { 79 cursor.* -= 1; // + 1
80 return .{ 80 return .{
81 .code = 0xfffd, 81 .code = 0xfffd,
82 .len = 2, 82 .len = 1,
83 .offset = this_off, 83 .offset = this_off,
84 }; 84 };
85 } else {
86 // Truncation.
87 cursor.* -= 1;
88 return .{
89 .code = 0xfffe,
90 .len = 1,
91 .offset = this_off,
92 };
93 }
94 } 85 }
95 // Third 86 // Third
96 byte = bytes[cursor.*]; 87 byte = bytes[cursor.*];
97 class = @intCast(u8dfa[byte]); 88 class = @intCast(u8dfa[byte]);
98 st = state_dfa[st + class]; 89 st = state_dfa[st + class];
99 rune = (byte & 0x3f) | (rune << 6); 90 rune = (byte & 0x3f) | (rune << 6);
100 cursor.* += 1; 91 cursor.* += 1; // +3
101 if (st == RUNE_ACCEPT) { 92 if (st == RUNE_ACCEPT) {
102 return .{ 93 return .{
103 .code = @intCast(rune), 94 .code = @intCast(rune),
@@ -108,13 +99,14 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
108 if (st == RUNE_REJECT or cursor.* == bytes.len) { 99 if (st == RUNE_REJECT or cursor.* == bytes.len) {
109 @branchHint(.cold); 100 @branchHint(.cold);
110 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { 101 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
102 cursor.* -= 2; // +1
111 return .{ 103 return .{
112 .code = 0xfffd, 104 .code = 0xfffd,
113 .len = 3, 105 .len = 1,
114 .offset = this_off, 106 .offset = this_off,
115 }; 107 };
116 } else { 108 } else {
117 cursor.* -= 1; 109 cursor.* -= 1; // +2
118 return .{ 110 return .{
119 .code = 0xfffd, 111 .code = 0xfffd,
120 .len = 2, 112 .len = 2,
@@ -126,17 +118,18 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
126 class = @intCast(u8dfa[byte]); 118 class = @intCast(u8dfa[byte]);
127 st = state_dfa[st + class]; 119 st = state_dfa[st + class];
128 rune = (byte & 0x3f) | (rune << 6); 120 rune = (byte & 0x3f) | (rune << 6);
129 cursor.* += 1; 121 cursor.* += 1; // +4
130 if (st == RUNE_REJECT) { 122 if (st == RUNE_REJECT) {
131 @branchHint(.cold); 123 @branchHint(.cold);
132 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { 124 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
125 cursor.* -= 3; // +1
133 return .{ 126 return .{
134 .code = 0xfffd, 127 .code = 0xfffd,
135 .len = 4, 128 .len = 1,
136 .offset = this_off, 129 .offset = this_off,
137 }; 130 };
138 } else { 131 } else {
139 cursor.* -= 1; 132 cursor.* -= 1; // +3
140 return .{ 133 return .{
141 .code = 0xfffd, 134 .code = 0xfffd,
142 .len = 3, 135 .len = 3,
@@ -157,6 +150,10 @@ pub const Iterator = struct {
157 bytes: []const u8, 150 bytes: []const u8,
158 i: u32 = 0, 151 i: u32 = 0,
159 152
153 pub fn init(bytes: []const u8) Iterator {
154 return .{ .bytes = bytes, .i = 0 };
155 }
156
160 pub fn next(self: *Iterator) ?CodePoint { 157 pub fn next(self: *Iterator) ?CodePoint {
161 return decodeAtCursor(self.bytes, &self.i); 158 return decodeAtCursor(self.bytes, &self.i);
162 } 159 }
@@ -252,25 +249,105 @@ test "decode" {
252test "peek" { 249test "peek" {
253 var iter = Iterator{ .bytes = "Hi" }; 250 var iter = Iterator{ .bytes = "Hi" };
254 251
255 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); 252 try expectEqual(@as(u21, 'H'), iter.next().?.code);
256 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); 253 try expectEqual(@as(u21, 'i'), iter.peek().?.code);
257 try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); 254 try expectEqual(@as(u21, 'i'), iter.next().?.code);
258 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); 255 try expectEqual(@as(?CodePoint, null), iter.peek());
259 try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); 256 try expectEqual(@as(?CodePoint, null), iter.next());
260} 257}
261 258
262test "overlongs" { 259test "overlongs" {
263 // Should not pass! 260 // None of these should equal `/`, all should be byte-for-byte
264 const bytes = "\xC0\xAF"; 261 // handled as replacement characters.
265 const res = decode(bytes, 0); 262 {
266 if (res) |cp| { 263 const bytes = "\xc0\xaf";
267 try testing.expectEqual(0xfffd, cp.code); 264 var iter: Iterator = .init(bytes);
268 try testing.expectEqual(1, cp.len); 265 const first = iter.next().?;
269 } else { 266 try expect('/' != first.code);
270 try testing.expect(false); 267 try expectEqual(0xfffd, first.code);
268 try testing.expectEqual(1, first.len);
269 const second = iter.next().?;
270 try expectEqual(0xfffd, second.code);
271 try testing.expectEqual(1, second.len);
272 }
273 {
274 const bytes = "\xe0\x80\xaf";
275 var iter: Iterator = .init(bytes);
276 const first = iter.next().?;
277 try expect('/' != first.code);
278 try expectEqual(0xfffd, first.code);
279 try testing.expectEqual(1, first.len);
280 const second = iter.next().?;
281 try expectEqual(0xfffd, second.code);
282 try testing.expectEqual(1, second.len);
283 const third = iter.next().?;
284 try expectEqual(0xfffd, third.code);
285 try testing.expectEqual(1, third.len);
286 }
287 {
288 const bytes = "\xf0\x80\x80\xaf";
289 var iter: Iterator = .init(bytes);
290 const first = iter.next().?;
291 try expect('/' != first.code);
292 try expectEqual(0xfffd, first.code);
293 try testing.expectEqual(1, first.len);
294 const second = iter.next().?;
295 try expectEqual(0xfffd, second.code);
296 try testing.expectEqual(1, second.len);
297 const third = iter.next().?;
298 try expectEqual(0xfffd, third.code);
299 try testing.expectEqual(1, third.len);
300 const fourth = iter.next().?;
301 try expectEqual(0xfffd, fourth.code);
302 try testing.expectEqual(1, fourth.len);
303 }
304}
305
306test "surrogates" {
307 // Substitution of Maximal Subparts dictates a
308 // replacement character for each byte of a surrogate.
309 {
310 const bytes = "\xed\xad\xbf";
311 var iter: Iterator = .init(bytes);
312 const first = iter.next().?;
313 try expectEqual(0xfffd, first.code);
314 try testing.expectEqual(1, first.len);
315 const second = iter.next().?;
316 try expectEqual(0xfffd, second.code);
317 try testing.expectEqual(1, second.len);
318 const third = iter.next().?;
319 try expectEqual(0xfffd, third.code);
320 try testing.expectEqual(1, third.len);
321 }
322}
323
324test "truncation" {
325 // Truncation must return one (1) replacement
326 // character for each stem of a valid UTF-8 codepoint
327 // Sample from Table 3-11 of the Unicode Standard 16.0.0
328 {
329 const bytes = "\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41";
330 var iter: Iterator = .init(bytes);
331 const first = iter.next().?;
332 try expectEqual(0xfffd, first.code);
333 try testing.expectEqual(2, first.len);
334 const second = iter.next().?;
335 try expectEqual(0xfffd, second.code);
336 try testing.expectEqual(1, second.len);
337 const third = iter.next().?;
338 try expectEqual(0xfffd, third.code);
339 try testing.expectEqual(3, third.len);
340 const fourth = iter.next().?;
341 try expectEqual(0xfffd, fourth.code);
342 try testing.expectEqual(2, fourth.len);
343 const fifth = iter.next().?;
344 try expectEqual(0x41, fifth.code);
345 try testing.expectEqual(1, fifth.len);
271 } 346 }
272} 347}
273 348
274const std = @import("std"); 349const std = @import("std");
275const testing = std.testing; 350const testing = std.testing;
351const expect = testing.expect;
352const expectEqual = testing.expectEqual;
276const assert = std.debug.assert; 353const assert = std.debug.assert;