//! Unicode Code Point module //! //! Provides a decoder and iterator over a UTF-8 encoded string. //! Represents invalid data according to the Replacement of Maximal //! Subparts algorithm. pub const uoffset = if (@import("config").fat_offset) u64 else u32; /// `CodePoint` represents a Unicode code point by its code, /// length, and offset in the source bytes. pub const CodePoint = struct { code: u21, len: u3, offset: uoffset, /// Return the slice of this codepoint, given the original string. pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { return str[cp.offset..][0..cp.len]; } pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { try writer.print("CodePoint '{u}' .{{ ", .{cp.code}); try writer.print( ".code = 0x{x}, .offset = {d}, .len = {d} }}", .{ cp.code, cp.offset, cp.len }, ); } }; /// This function is deprecated and will be removed in a later release. /// Use `decodeAtIndex` or `decodeAtCursor`. pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { var off: uoffset = 0; var maybe_code = decodeAtCursor(bytes, &off); if (maybe_code) |*code| { code.offset = offset; return code.*; } return null; } /// Return the codepoint at `index`, even if `index` is in the middle /// of that codepoint. pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { var idx = index; while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} return decodeAtIndex(bytes, idx); } /// Decode the CodePoint, if any, at `bytes[idx]`. pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { var off = index; return decodeAtCursor(bytes, &off); } /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the /// cursor will point at the next potential codepoint index. pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { // EOS if (cursor.* >= bytes.len) return null; const this_off = cursor.*; cursor.* += 1; // +1 // ASCII var byte = bytes[this_off]; if (byte < 0x80) return .{ .code = byte, .offset = this_off, .len = 1, }; // Multibyte // Second: var class: u4 = @intCast(u8dfa[byte]); var st: u32 = state_dfa[class]; if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // First one is never a truncation return .{ .code = 0xfffd, .len = 1, .offset = this_off, }; } var rune: u32 = byte & class_mask[class]; byte = bytes[cursor.*]; class = @intCast(u8dfa[byte]); st = state_dfa[st + class]; rune = (byte & 0x3f) | (rune << 6); cursor.* += 1; // +2 if (st == RUNE_ACCEPT) { return .{ .code = @intCast(rune), .len = 2, .offset = this_off, }; } if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // Truncation and other bad bytes the same here: cursor.* -= 1; // + 1 return .{ .code = 0xfffd, .len = 1, .offset = this_off, }; } // Third byte = bytes[cursor.*]; class = @intCast(u8dfa[byte]); st = state_dfa[st + class]; rune = (byte & 0x3f) | (rune << 6); cursor.* += 1; // +3 if (st == RUNE_ACCEPT) { return .{ .code = @intCast(rune), .len = 3, .offset = this_off, }; } if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); // This, and the branch below, detect truncation, the // only invalid state handled differently by the Maximal // Subparts algorithm. if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { cursor.* -= 2; // +1 return .{ .code = 0xfffd, .len = 1, .offset = this_off, }; } else { cursor.* -= 1; // +2 return .{ .code = 0xfffd, .len = 2, .offset = this_off, }; } } byte = bytes[cursor.*]; class = @intCast(u8dfa[byte]); st = state_dfa[st + class]; rune = (byte & 0x3f) | (rune << 6); cursor.* += 1; // +4 if (st == RUNE_REJECT) { @branchHint(.cold); if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { cursor.* -= 3; // +1 return .{ .code = 0xfffd, .len = 1, .offset = this_off, }; } else { cursor.* -= 1; // +3 return .{ .code = 0xfffd, .len = 3, .offset = this_off, }; } } assert(st == RUNE_ACCEPT); return .{ .code = @intCast(rune), .len = 4, .offset = this_off, }; } /// `Iterator` iterates a string one `CodePoint` at-a-time. pub const Iterator = struct { bytes: []const u8, i: uoffset = 0, pub fn init(bytes: []const u8) Iterator { return .{ .bytes = bytes, .i = 0 }; } pub fn next(self: *Iterator) ?CodePoint { return decodeAtCursor(self.bytes, &self.i); } pub fn peek(iter: *Iterator) ?CodePoint { const saved_i = iter.i; defer iter.i = saved_i; return iter.next(); } /// Create a backward iterator at this point. It will repeat /// the last CodePoint seen. pub fn reverseIterator(iter: *const Iterator) ReverseIterator { if (iter.i == iter.bytes.len) { return .init(iter.bytes); } return .{ .i = iter.i, .bytes = iter.bytes }; } }; // A fast DFA decoder for UTF-8 // // The algorithm used aims to be optimal, without involving SIMD, this // strikes a balance between portability and efficiency. That is done // by using a DFA, represented as a few lookup tables, to track state, // encoding valid transitions between bytes, arriving at 0 each time a // codepoint is decoded. In the process it builds up the value of the // codepoint in question. // // The virtue of such an approach is low branching factor, achieved at // a modest cost of storing the tables. An embedded system might want // to use a more familiar decision graph based on switches, but modern // hosted environments can well afford the space, and may appreciate a // speed increase in exchange. // // Credit for the algorithm goes to Björn Höhrmann, who wrote it up at // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ . The original // license may be found in the ./credits folder. // /// Successful codepoint parse const RUNE_ACCEPT = 0; /// Error state const RUNE_REJECT = 12; /// Byte transitions: value to class const u8dfa: [256]u8 = .{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff }; /// State transition: state + class = new state const state_dfa: [108]u8 = .{ 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // 0 (RUNE_ACCEPT) 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 12 (RUNE_REJECT) 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // 24 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // 32 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // 48 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // 60 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 72 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 84 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 96 }; /// State masks const class_mask: [12]u8 = .{ 0xff, 0, 0b0011_1111, 0b0001_1111, 0b0000_1111, 0b0000_0111, 0b0000_0011, 0, 0, 0, 0, 0, }; pub const ReverseIterator = struct { bytes: []const u8, i: ?uoffset, pub fn init(str: []const u8) ReverseIterator { var r_iter: ReverseIterator = undefined; r_iter.bytes = str; r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); return r_iter; } pub fn prev(iter: *ReverseIterator) ?CodePoint { if (iter.i == null) return null; var i_prev = iter.i.?; while (i_prev > 0) : (i_prev -= 1) { if (!followbyte(iter.bytes[i_prev])) break; } if (i_prev > 0) iter.i = i_prev - 1 else iter.i = null; return decode(iter.bytes[i_prev..], i_prev); } pub fn peek(iter: *ReverseIterator) ?CodePoint { const saved_i = iter.i; defer iter.i = saved_i; return iter.prev(); } /// Create a forward iterator at this point. It will repeat the /// last CodePoint seen. pub fn forwardIterator(iter: *const ReverseIterator) Iterator { if (iter.i) |i| { var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; _ = fwd.next(); return fwd; } return .{ .i = 0, .bytes = iter.bytes }; } }; inline fn followbyte(b: u8) bool { return 0x80 <= b and b <= 0xbf; } test "decode" { const bytes = "🌩️"; const res = decode(bytes, 0); if (res) |cp| { try std.testing.expectEqual(@as(u21, 0x1F329), cp.code); try std.testing.expectEqual(4, cp.len); } else { // shouldn't have failed to return try std.testing.expect(false); } } test Iterator { var iter = Iterator{ .bytes = "Hi" }; try expectEqual(@as(u21, 'H'), iter.next().?.code); try expectEqual(@as(u21, 'i'), iter.peek().?.code); try expectEqual(@as(u21, 'i'), iter.next().?.code); try expectEqual(@as(?CodePoint, null), iter.peek()); try expectEqual(@as(?CodePoint, null), iter.next()); } const code_point = @This(); // Keep this in sync with the README test "Code point iterator" { const str = "Hi 😊"; var iter: code_point.Iterator = .init(str); var i: usize = 0; while (iter.next()) |cp| : (i += 1) { // The `code` field is the actual code point scalar as a `u21`. if (i == 0) try expect(cp.code == 'H'); if (i == 1) try expect(cp.code == 'i'); if (i == 2) try expect(cp.code == ' '); if (i == 3) { try expect(cp.code == '😊'); // The `offset` field is the byte offset in the // source string. try expect(cp.offset == 3); try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); // The `len` field is the length in bytes of the // code point in the source string. try expect(cp.len == 4); // There is also a 'cursor' decode, like so: { var cursor = cp.offset; try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); // Which advances the cursor variable to the next possible // offset, in this case, `str.len`. Don't forget to account // for this possibility! try expectEqual(cp.offset + cp.len, cursor); } // There's also this, for when you aren't sure if you have the // correct start for a code point: try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); } // Reverse iteration is also an option: var r_iter: code_point.ReverseIterator = .init(str); // Both iterators can be peeked: try expectEqual('😊', r_iter.peek().?.code); try expectEqual('😊', r_iter.prev().?.code); // Both kinds of iterators can be reversed: var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); // This will always return the last codepoint from // the prior iterator, _if_ it yielded one: try expectEqual('😊', fwd_iter.next().?.code); } } test "overlongs" { // None of these should equal `/`, all should be byte-for-byte // handled as replacement characters. { const bytes = "\xc0\xaf"; var iter: Iterator = .init(bytes); const first = iter.next().?; try expect('/' != first.code); try expectEqual(0xfffd, first.code); try testing.expectEqual(1, first.len); const second = iter.next().?; try expectEqual(0xfffd, second.code); try testing.expectEqual(1, second.len); } { const bytes = "\xe0\x80\xaf"; var iter: Iterator = .init(bytes); const first = iter.next().?; try expect('/' != first.code); try expectEqual(0xfffd, first.code); try testing.expectEqual(1, first.len); const second = iter.next().?; try expectEqual(0xfffd, second.code); try testing.expectEqual(1, second.len); const third = iter.next().?; try expectEqual(0xfffd, third.code); try testing.expectEqual(1, third.len); } { const bytes = "\xf0\x80\x80\xaf"; var iter: Iterator = .init(bytes); const first = iter.next().?; try expect('/' != first.code); try expectEqual(0xfffd, first.code); try testing.expectEqual(1, first.len); const second = iter.next().?; try expectEqual(0xfffd, second.code); try testing.expectEqual(1, second.len); const third = iter.next().?; try expectEqual(0xfffd, third.code); try testing.expectEqual(1, third.len); const fourth = iter.next().?; try expectEqual(0xfffd, fourth.code); try testing.expectEqual(1, fourth.len); } } test "surrogates" { // Substitution of Maximal Subparts dictates a // replacement character for each byte of a surrogate. { const bytes = "\xed\xad\xbf"; var iter: Iterator = .init(bytes); const first = iter.next().?; try expectEqual(0xfffd, first.code); try testing.expectEqual(1, first.len); const second = iter.next().?; try expectEqual(0xfffd, second.code); try testing.expectEqual(1, second.len); const third = iter.next().?; try expectEqual(0xfffd, third.code); try testing.expectEqual(1, third.len); } } test "truncation" { // Truncation must return one (1) replacement // character for each stem of a valid UTF-8 codepoint // Sample from Table 3-11 of the Unicode Standard 16.0.0 { const bytes = "\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"; var iter: Iterator = .init(bytes); const first = iter.next().?; try expectEqual(0xfffd, first.code); try testing.expectEqual(2, first.len); const second = iter.next().?; try expectEqual(0xfffd, second.code); try testing.expectEqual(1, second.len); const third = iter.next().?; try expectEqual(0xfffd, third.code); try testing.expectEqual(3, third.len); const fourth = iter.next().?; try expectEqual(0xfffd, fourth.code); try testing.expectEqual(2, fourth.len); const fifth = iter.next().?; try expectEqual(0x41, fifth.code); try testing.expectEqual(1, fifth.len); } } test ReverseIterator { { var r_iter: ReverseIterator = .init("ABC"); try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); } { var r_iter: ReverseIterator = .init("∅δq🦾ă"); try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code); try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code); try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); } { var r_iter: ReverseIterator = .init("123"); try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); var iter = r_iter.forwardIterator(); try testing.expectEqual(@as(u21, '1'), iter.next().?.code); try testing.expectEqual(@as(u21, '2'), iter.next().?.code); try testing.expectEqual(@as(u21, '3'), iter.next().?.code); r_iter = iter.reverseIterator(); try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); iter = r_iter.forwardIterator(); r_iter = iter.reverseIterator(); try testing.expectEqual(@as(u21, '2'), iter.next().?.code); try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); } } const std = @import("std"); const testing = std.testing; const expect = testing.expect; const expectEqual = testing.expectEqual; const assert = std.debug.assert;