diff options
Diffstat (limited to 'src/code_point.zig')
| -rw-r--r-- | src/code_point.zig | 151 |
1 files changed, 114 insertions, 37 deletions
diff --git a/src/code_point.zig b/src/code_point.zig index d589413..fe7ad6e 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -37,7 +37,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 37 | if (cursor.* >= bytes.len) return null; | 37 | if (cursor.* >= bytes.len) return null; |
| 38 | 38 | ||
| 39 | const this_off = cursor.*; | 39 | const this_off = cursor.*; |
| 40 | cursor.* += 1; | 40 | cursor.* += 1; // +1 |
| 41 | 41 | ||
| 42 | // ASCII | 42 | // ASCII |
| 43 | var byte = bytes[this_off]; | 43 | var byte = bytes[this_off]; |
| @@ -65,7 +65,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 65 | class = @intCast(u8dfa[byte]); | 65 | class = @intCast(u8dfa[byte]); |
| 66 | st = state_dfa[st + class]; | 66 | st = state_dfa[st + class]; |
| 67 | rune = (byte & 0x3f) | (rune << 6); | 67 | rune = (byte & 0x3f) | (rune << 6); |
| 68 | cursor.* += 1; | 68 | cursor.* += 1; // +2 |
| 69 | if (st == RUNE_ACCEPT) { | 69 | if (st == RUNE_ACCEPT) { |
| 70 | return .{ | 70 | return .{ |
| 71 | .code = @intCast(rune), | 71 | .code = @intCast(rune), |
| @@ -75,29 +75,20 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 75 | } | 75 | } |
| 76 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | 76 | if (st == RUNE_REJECT or cursor.* == bytes.len) { |
| 77 | @branchHint(.cold); | 77 | @branchHint(.cold); |
| 78 | // Check for valid start at cursor: | 78 | // Truncation and other bad bytes the same here: |
| 79 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | 79 | cursor.* -= 1; // + 1 |
| 80 | return .{ | 80 | return .{ |
| 81 | .code = 0xfffd, | 81 | .code = 0xfffd, |
| 82 | .len = 2, | 82 | .len = 1, |
| 83 | .offset = this_off, | 83 | .offset = this_off, |
| 84 | }; | 84 | }; |
| 85 | } else { | ||
| 86 | // Truncation. | ||
| 87 | cursor.* -= 1; | ||
| 88 | return .{ | ||
| 89 | .code = 0xfffe, | ||
| 90 | .len = 1, | ||
| 91 | .offset = this_off, | ||
| 92 | }; | ||
| 93 | } | ||
| 94 | } | 85 | } |
| 95 | // Third | 86 | // Third |
| 96 | byte = bytes[cursor.*]; | 87 | byte = bytes[cursor.*]; |
| 97 | class = @intCast(u8dfa[byte]); | 88 | class = @intCast(u8dfa[byte]); |
| 98 | st = state_dfa[st + class]; | 89 | st = state_dfa[st + class]; |
| 99 | rune = (byte & 0x3f) | (rune << 6); | 90 | rune = (byte & 0x3f) | (rune << 6); |
| 100 | cursor.* += 1; | 91 | cursor.* += 1; // +3 |
| 101 | if (st == RUNE_ACCEPT) { | 92 | if (st == RUNE_ACCEPT) { |
| 102 | return .{ | 93 | return .{ |
| 103 | .code = @intCast(rune), | 94 | .code = @intCast(rune), |
| @@ -108,13 +99,14 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 108 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | 99 | if (st == RUNE_REJECT or cursor.* == bytes.len) { |
| 109 | @branchHint(.cold); | 100 | @branchHint(.cold); |
| 110 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | 101 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { |
| 102 | cursor.* -= 2; // +1 | ||
| 111 | return .{ | 103 | return .{ |
| 112 | .code = 0xfffd, | 104 | .code = 0xfffd, |
| 113 | .len = 3, | 105 | .len = 1, |
| 114 | .offset = this_off, | 106 | .offset = this_off, |
| 115 | }; | 107 | }; |
| 116 | } else { | 108 | } else { |
| 117 | cursor.* -= 1; | 109 | cursor.* -= 1; // +2 |
| 118 | return .{ | 110 | return .{ |
| 119 | .code = 0xfffd, | 111 | .code = 0xfffd, |
| 120 | .len = 2, | 112 | .len = 2, |
| @@ -126,17 +118,18 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 126 | class = @intCast(u8dfa[byte]); | 118 | class = @intCast(u8dfa[byte]); |
| 127 | st = state_dfa[st + class]; | 119 | st = state_dfa[st + class]; |
| 128 | rune = (byte & 0x3f) | (rune << 6); | 120 | rune = (byte & 0x3f) | (rune << 6); |
| 129 | cursor.* += 1; | 121 | cursor.* += 1; // +4 |
| 130 | if (st == RUNE_REJECT) { | 122 | if (st == RUNE_REJECT) { |
| 131 | @branchHint(.cold); | 123 | @branchHint(.cold); |
| 132 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | 124 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { |
| 125 | cursor.* -= 3; // +1 | ||
| 133 | return .{ | 126 | return .{ |
| 134 | .code = 0xfffd, | 127 | .code = 0xfffd, |
| 135 | .len = 4, | 128 | .len = 1, |
| 136 | .offset = this_off, | 129 | .offset = this_off, |
| 137 | }; | 130 | }; |
| 138 | } else { | 131 | } else { |
| 139 | cursor.* -= 1; | 132 | cursor.* -= 1; // +3 |
| 140 | return .{ | 133 | return .{ |
| 141 | .code = 0xfffd, | 134 | .code = 0xfffd, |
| 142 | .len = 3, | 135 | .len = 3, |
| @@ -157,6 +150,10 @@ pub const Iterator = struct { | |||
| 157 | bytes: []const u8, | 150 | bytes: []const u8, |
| 158 | i: u32 = 0, | 151 | i: u32 = 0, |
| 159 | 152 | ||
| 153 | pub fn init(bytes: []const u8) Iterator { | ||
| 154 | return .{ .bytes = bytes, .i = 0 }; | ||
| 155 | } | ||
| 156 | |||
| 160 | pub fn next(self: *Iterator) ?CodePoint { | 157 | pub fn next(self: *Iterator) ?CodePoint { |
| 161 | return decodeAtCursor(self.bytes, &self.i); | 158 | return decodeAtCursor(self.bytes, &self.i); |
| 162 | } | 159 | } |
| @@ -252,25 +249,105 @@ test "decode" { | |||
| 252 | test "peek" { | 249 | test "peek" { |
| 253 | var iter = Iterator{ .bytes = "Hi" }; | 250 | var iter = Iterator{ .bytes = "Hi" }; |
| 254 | 251 | ||
| 255 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); | 252 | try expectEqual(@as(u21, 'H'), iter.next().?.code); |
| 256 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); | 253 | try expectEqual(@as(u21, 'i'), iter.peek().?.code); |
| 257 | try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); | 254 | try expectEqual(@as(u21, 'i'), iter.next().?.code); |
| 258 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); | 255 | try expectEqual(@as(?CodePoint, null), iter.peek()); |
| 259 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); | 256 | try expectEqual(@as(?CodePoint, null), iter.next()); |
| 260 | } | 257 | } |
| 261 | 258 | ||
| 262 | test "overlongs" { | 259 | test "overlongs" { |
| 263 | // Should not pass! | 260 | // None of these should equal `/`, all should be byte-for-byte |
| 264 | const bytes = "\xC0\xAF"; | 261 | // handled as replacement characters. |
| 265 | const res = decode(bytes, 0); | 262 | { |
| 266 | if (res) |cp| { | 263 | const bytes = "\xc0\xaf"; |
| 267 | try testing.expectEqual(0xfffd, cp.code); | 264 | var iter: Iterator = .init(bytes); |
| 268 | try testing.expectEqual(1, cp.len); | 265 | const first = iter.next().?; |
| 269 | } else { | 266 | try expect('/' != first.code); |
| 270 | try testing.expect(false); | 267 | try expectEqual(0xfffd, first.code); |
| 268 | try testing.expectEqual(1, first.len); | ||
| 269 | const second = iter.next().?; | ||
| 270 | try expectEqual(0xfffd, second.code); | ||
| 271 | try testing.expectEqual(1, second.len); | ||
| 272 | } | ||
| 273 | { | ||
| 274 | const bytes = "\xe0\x80\xaf"; | ||
| 275 | var iter: Iterator = .init(bytes); | ||
| 276 | const first = iter.next().?; | ||
| 277 | try expect('/' != first.code); | ||
| 278 | try expectEqual(0xfffd, first.code); | ||
| 279 | try testing.expectEqual(1, first.len); | ||
| 280 | const second = iter.next().?; | ||
| 281 | try expectEqual(0xfffd, second.code); | ||
| 282 | try testing.expectEqual(1, second.len); | ||
| 283 | const third = iter.next().?; | ||
| 284 | try expectEqual(0xfffd, third.code); | ||
| 285 | try testing.expectEqual(1, third.len); | ||
| 286 | } | ||
| 287 | { | ||
| 288 | const bytes = "\xf0\x80\x80\xaf"; | ||
| 289 | var iter: Iterator = .init(bytes); | ||
| 290 | const first = iter.next().?; | ||
| 291 | try expect('/' != first.code); | ||
| 292 | try expectEqual(0xfffd, first.code); | ||
| 293 | try testing.expectEqual(1, first.len); | ||
| 294 | const second = iter.next().?; | ||
| 295 | try expectEqual(0xfffd, second.code); | ||
| 296 | try testing.expectEqual(1, second.len); | ||
| 297 | const third = iter.next().?; | ||
| 298 | try expectEqual(0xfffd, third.code); | ||
| 299 | try testing.expectEqual(1, third.len); | ||
| 300 | const fourth = iter.next().?; | ||
| 301 | try expectEqual(0xfffd, fourth.code); | ||
| 302 | try testing.expectEqual(1, fourth.len); | ||
| 303 | } | ||
| 304 | } | ||
| 305 | |||
| 306 | test "surrogates" { | ||
| 307 | // Substitution of Maximal Subparts dictates a | ||
| 308 | // replacement character for each byte of a surrogate. | ||
| 309 | { | ||
| 310 | const bytes = "\xed\xad\xbf"; | ||
| 311 | var iter: Iterator = .init(bytes); | ||
| 312 | const first = iter.next().?; | ||
| 313 | try expectEqual(0xfffd, first.code); | ||
| 314 | try testing.expectEqual(1, first.len); | ||
| 315 | const second = iter.next().?; | ||
| 316 | try expectEqual(0xfffd, second.code); | ||
| 317 | try testing.expectEqual(1, second.len); | ||
| 318 | const third = iter.next().?; | ||
| 319 | try expectEqual(0xfffd, third.code); | ||
| 320 | try testing.expectEqual(1, third.len); | ||
| 321 | } | ||
| 322 | } | ||
| 323 | |||
| 324 | test "truncation" { | ||
| 325 | // Truncation must return one (1) replacement | ||
| 326 | // character for each stem of a valid UTF-8 codepoint | ||
| 327 | // Sample from Table 3-11 of the Unicode Standard 16.0.0 | ||
| 328 | { | ||
| 329 | const bytes = "\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"; | ||
| 330 | var iter: Iterator = .init(bytes); | ||
| 331 | const first = iter.next().?; | ||
| 332 | try expectEqual(0xfffd, first.code); | ||
| 333 | try testing.expectEqual(2, first.len); | ||
| 334 | const second = iter.next().?; | ||
| 335 | try expectEqual(0xfffd, second.code); | ||
| 336 | try testing.expectEqual(1, second.len); | ||
| 337 | const third = iter.next().?; | ||
| 338 | try expectEqual(0xfffd, third.code); | ||
| 339 | try testing.expectEqual(3, third.len); | ||
| 340 | const fourth = iter.next().?; | ||
| 341 | try expectEqual(0xfffd, fourth.code); | ||
| 342 | try testing.expectEqual(2, fourth.len); | ||
| 343 | const fifth = iter.next().?; | ||
| 344 | try expectEqual(0x41, fifth.code); | ||
| 345 | try testing.expectEqual(1, fifth.len); | ||
| 271 | } | 346 | } |
| 272 | } | 347 | } |
| 273 | 348 | ||
| 274 | const std = @import("std"); | 349 | const std = @import("std"); |
| 275 | const testing = std.testing; | 350 | const testing = std.testing; |
| 351 | const expect = testing.expect; | ||
| 352 | const expectEqual = testing.expectEqual; | ||
| 276 | const assert = std.debug.assert; | 353 | const assert = std.debug.assert; |