From 655f324ebf118bd4535e4c6104a89e70c2fac676 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 15 May 2025 14:06:01 -0400 Subject: Maximal Subparts tests The decoder now properly returns substitution bytes according to Substitution of Maximal Subparts, with tests to prove it. --- src/code_point.zig | 151 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 37 deletions(-) (limited to 'src/code_point.zig') diff --git a/src/code_point.zig b/src/code_point.zig index d589413..fe7ad6e 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -37,7 +37,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { if (cursor.* >= bytes.len) return null; const this_off = cursor.*; - cursor.* += 1; + cursor.* += 1; // +1 // ASCII var byte = bytes[this_off]; @@ -65,7 +65,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { class = @intCast(u8dfa[byte]); st = state_dfa[st + class]; rune = (byte & 0x3f) | (rune << 6); - cursor.* += 1; + cursor.* += 1; // +2 if (st == RUNE_ACCEPT) { return .{ .code = @intCast(rune), @@ -75,29 +75,20 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { } if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); - // Check for valid start at cursor: - if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { - return .{ - .code = 0xfffd, - .len = 2, - .offset = this_off, - }; - } else { - // Truncation. - cursor.* -= 1; - return .{ - .code = 0xfffe, - .len = 1, - .offset = this_off, - }; - } + // Truncation and other bad bytes the same here: + cursor.* -= 1; // + 1 + return .{ + .code = 0xfffd, + .len = 1, + .offset = this_off, + }; } // Third byte = bytes[cursor.*]; class = @intCast(u8dfa[byte]); st = state_dfa[st + class]; rune = (byte & 0x3f) | (rune << 6); - cursor.* += 1; + cursor.* += 1; // +3 if (st == RUNE_ACCEPT) { return .{ .code = @intCast(rune), @@ -108,13 +99,14 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { + cursor.* -= 2; // +1 return .{ .code = 0xfffd, - .len = 3, + .len = 1, .offset = this_off, }; } else { - cursor.* -= 1; + cursor.* -= 1; // +2 return .{ .code = 0xfffd, .len = 2, @@ -126,17 +118,18 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { class = @intCast(u8dfa[byte]); st = state_dfa[st + class]; rune = (byte & 0x3f) | (rune << 6); - cursor.* += 1; + cursor.* += 1; // +4 if (st == RUNE_REJECT) { @branchHint(.cold); if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { + cursor.* -= 3; // +1 return .{ .code = 0xfffd, - .len = 4, + .len = 1, .offset = this_off, }; } else { - cursor.* -= 1; + cursor.* -= 1; // +3 return .{ .code = 0xfffd, .len = 3, @@ -157,6 +150,10 @@ pub const Iterator = struct { bytes: []const u8, i: u32 = 0, + pub fn init(bytes: []const u8) Iterator { + return .{ .bytes = bytes, .i = 0 }; + } + pub fn next(self: *Iterator) ?CodePoint { return decodeAtCursor(self.bytes, &self.i); } @@ -252,25 +249,105 @@ test "decode" { test "peek" { var iter = Iterator{ .bytes = "Hi" }; - try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); - try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); - try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); + try expectEqual(@as(u21, 'H'), iter.next().?.code); + try expectEqual(@as(u21, 'i'), iter.peek().?.code); + try expectEqual(@as(u21, 'i'), iter.next().?.code); + try expectEqual(@as(?CodePoint, null), iter.peek()); + try expectEqual(@as(?CodePoint, null), iter.next()); } test "overlongs" { - // Should not pass! - const bytes = "\xC0\xAF"; - const res = decode(bytes, 0); - if (res) |cp| { - try testing.expectEqual(0xfffd, cp.code); - try testing.expectEqual(1, cp.len); - } else { - try testing.expect(false); + // None of these should equal `/`, all should be byte-for-byte + // handled as replacement characters. + { + const bytes = "\xc0\xaf"; + var iter: Iterator = .init(bytes); + const first = iter.next().?; + try expect('/' != first.code); + try expectEqual(0xfffd, first.code); + try testing.expectEqual(1, first.len); + const second = iter.next().?; + try expectEqual(0xfffd, second.code); + try testing.expectEqual(1, second.len); + } + { + const bytes = "\xe0\x80\xaf"; + var iter: Iterator = .init(bytes); + const first = iter.next().?; + try expect('/' != first.code); + try expectEqual(0xfffd, first.code); + try testing.expectEqual(1, first.len); + const second = iter.next().?; + try expectEqual(0xfffd, second.code); + try testing.expectEqual(1, second.len); + const third = iter.next().?; + try expectEqual(0xfffd, third.code); + try testing.expectEqual(1, third.len); + } + { + const bytes = "\xf0\x80\x80\xaf"; + var iter: Iterator = .init(bytes); + const first = iter.next().?; + try expect('/' != first.code); + try expectEqual(0xfffd, first.code); + try testing.expectEqual(1, first.len); + const second = iter.next().?; + try expectEqual(0xfffd, second.code); + try testing.expectEqual(1, second.len); + const third = iter.next().?; + try expectEqual(0xfffd, third.code); + try testing.expectEqual(1, third.len); + const fourth = iter.next().?; + try expectEqual(0xfffd, fourth.code); + try testing.expectEqual(1, fourth.len); + } +} + +test "surrogates" { + // Substitution of Maximal Subparts dictates a + // replacement character for each byte of a surrogate. + { + const bytes = "\xed\xad\xbf"; + var iter: Iterator = .init(bytes); + const first = iter.next().?; + try expectEqual(0xfffd, first.code); + try testing.expectEqual(1, first.len); + const second = iter.next().?; + try expectEqual(0xfffd, second.code); + try testing.expectEqual(1, second.len); + const third = iter.next().?; + try expectEqual(0xfffd, third.code); + try testing.expectEqual(1, third.len); + } +} + +test "truncation" { + // Truncation must return one (1) replacement + // character for each stem of a valid UTF-8 codepoint + // Sample from Table 3-11 of the Unicode Standard 16.0.0 + { + const bytes = "\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"; + var iter: Iterator = .init(bytes); + const first = iter.next().?; + try expectEqual(0xfffd, first.code); + try testing.expectEqual(2, first.len); + const second = iter.next().?; + try expectEqual(0xfffd, second.code); + try testing.expectEqual(1, second.len); + const third = iter.next().?; + try expectEqual(0xfffd, third.code); + try testing.expectEqual(3, third.len); + const fourth = iter.next().?; + try expectEqual(0xfffd, fourth.code); + try testing.expectEqual(2, fourth.len); + const fifth = iter.next().?; + try expectEqual(0x41, fifth.code); + try testing.expectEqual(1, fifth.len); } } const std = @import("std"); const testing = std.testing; +const expect = testing.expect; +const expectEqual = testing.expectEqual; const assert = std.debug.assert; -- cgit v1.2.3