diff options
Diffstat (limited to 'src/code_point.zig')
| -rw-r--r-- | src/code_point.zig | 263 |
1 files changed, 204 insertions, 59 deletions
diff --git a/src/code_point.zig b/src/code_point.zig index e402554..d589413 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -1,3 +1,9 @@ | |||
| 1 | //! Unicode Code Point module | ||
| 2 | //! | ||
| 3 | //! Provides a decoder and iterator over a UTF-8 encoded string. | ||
| 4 | //! Represents invalid data according to the Replacement of Maximal | ||
| 5 | //! Subparts algorithm. | ||
| 6 | |||
| 1 | /// `CodePoint` represents a Unicode code point by its code, | 7 | /// `CodePoint` represents a Unicode code point by its code, |
| 2 | /// length, and offset in the source bytes. | 8 | /// length, and offset in the source bytes. |
| 3 | pub const CodePoint = struct { | 9 | pub const CodePoint = struct { |
| @@ -6,67 +12,144 @@ pub const CodePoint = struct { | |||
| 6 | offset: u32, | 12 | offset: u32, |
| 7 | }; | 13 | }; |
| 8 | 14 | ||
| 9 | /// given a small slice of a string, decode the corresponding codepoint | 15 | /// This function is deprecated and will be removed in a later release. |
| 16 | /// Use `decodeAtIndex` or `decodeAtCursor`. | ||
| 10 | pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { | 17 | pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { |
| 11 | // EOS fast path | 18 | var off: u32 = 0; |
| 12 | if (bytes.len == 0) { | 19 | var maybe_code = decodeAtCursor(bytes, &off); |
| 13 | return null; | 20 | if (maybe_code) |*code| { |
| 21 | code.offset = offset; | ||
| 22 | return code.*; | ||
| 14 | } | 23 | } |
| 24 | return null; | ||
| 25 | } | ||
| 15 | 26 | ||
| 16 | // ASCII fast path | 27 | /// Decode the CodePoint, if any, at `bytes[idx]`. |
| 17 | if (bytes[0] < 128) { | 28 | pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { |
| 18 | return .{ | 29 | var off = idx; |
| 19 | .code = bytes[0], | 30 | return decodeAtCursor(bytes, &off); |
| 20 | .len = 1, | 31 | } |
| 21 | .offset = offset, | 32 | |
| 22 | }; | 33 | /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the |
| 23 | } | 34 | /// cursor will point at the next potential codepoint index. |
| 35 | pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | ||
| 36 | // EOS | ||
| 37 | if (cursor.* >= bytes.len) return null; | ||
| 24 | 38 | ||
| 25 | var cp = CodePoint{ | 39 | const this_off = cursor.*; |
| 26 | .code = undefined, | 40 | cursor.* += 1; |
| 27 | .len = switch (bytes[0]) { | 41 | |
| 28 | 0b1100_0000...0b1101_1111 => 2, | 42 | // ASCII |
| 29 | 0b1110_0000...0b1110_1111 => 3, | 43 | var byte = bytes[this_off]; |
| 30 | 0b1111_0000...0b1111_0111 => 4, | 44 | if (byte < 0x80) return .{ |
| 31 | else => { | 45 | .code = byte, |
| 32 | // unicode replacement code point. | 46 | .offset = this_off, |
| 33 | return .{ | 47 | .len = 1, |
| 34 | .code = 0xfffd, | ||
| 35 | .len = 1, | ||
| 36 | .offset = offset, | ||
| 37 | }; | ||
| 38 | }, | ||
| 39 | }, | ||
| 40 | .offset = offset, | ||
| 41 | }; | 48 | }; |
| 49 | // Multibyte | ||
| 42 | 50 | ||
| 43 | // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte | 51 | // Second: |
| 44 | if (cp.len > bytes.len) { | 52 | var class: u4 = @intCast(u8dfa[byte]); |
| 45 | // Unicode replacement code point. | 53 | var st: u32 = state_dfa[class]; |
| 54 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | ||
| 55 | @branchHint(.cold); | ||
| 56 | // First one is never a truncation | ||
| 46 | return .{ | 57 | return .{ |
| 47 | .code = 0xfffd, | 58 | .code = 0xfffd, |
| 48 | .len = 1, | 59 | .len = 1, |
| 49 | .offset = offset, | 60 | .offset = this_off, |
| 50 | }; | 61 | }; |
| 51 | } | 62 | } |
| 52 | 63 | var rune: u32 = byte & class_mask[class]; | |
| 53 | const cp_bytes = bytes[0..cp.len]; | 64 | byte = bytes[cursor.*]; |
| 54 | cp.code = switch (cp.len) { | 65 | class = @intCast(u8dfa[byte]); |
| 55 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), | 66 | st = state_dfa[st + class]; |
| 56 | 67 | rune = (byte & 0x3f) | (rune << 6); | |
| 57 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | | 68 | cursor.* += 1; |
| 58 | (cp_bytes[1] & 0b00111111)) << 6) | | 69 | if (st == RUNE_ACCEPT) { |
| 59 | (cp_bytes[2] & 0b00111111), | 70 | return .{ |
| 60 | 71 | .code = @intCast(rune), | |
| 61 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | | 72 | .len = 2, |
| 62 | (cp_bytes[1] & 0b00111111)) << 6) | | 73 | .offset = this_off, |
| 63 | (cp_bytes[2] & 0b00111111)) << 6) | | 74 | }; |
| 64 | (cp_bytes[3] & 0b00111111), | 75 | } |
| 65 | 76 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | |
| 66 | else => @panic("CodePointIterator.next invalid code point length."), | 77 | @branchHint(.cold); |
| 78 | // Check for valid start at cursor: | ||
| 79 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | ||
| 80 | return .{ | ||
| 81 | .code = 0xfffd, | ||
| 82 | .len = 2, | ||
| 83 | .offset = this_off, | ||
| 84 | }; | ||
| 85 | } else { | ||
| 86 | // Truncation. | ||
| 87 | cursor.* -= 1; | ||
| 88 | return .{ | ||
| 89 | .code = 0xfffe, | ||
| 90 | .len = 1, | ||
| 91 | .offset = this_off, | ||
| 92 | }; | ||
| 93 | } | ||
| 94 | } | ||
| 95 | // Third | ||
| 96 | byte = bytes[cursor.*]; | ||
| 97 | class = @intCast(u8dfa[byte]); | ||
| 98 | st = state_dfa[st + class]; | ||
| 99 | rune = (byte & 0x3f) | (rune << 6); | ||
| 100 | cursor.* += 1; | ||
| 101 | if (st == RUNE_ACCEPT) { | ||
| 102 | return .{ | ||
| 103 | .code = @intCast(rune), | ||
| 104 | .len = 3, | ||
| 105 | .offset = this_off, | ||
| 106 | }; | ||
| 107 | } | ||
| 108 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | ||
| 109 | @branchHint(.cold); | ||
| 110 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | ||
| 111 | return .{ | ||
| 112 | .code = 0xfffd, | ||
| 113 | .len = 3, | ||
| 114 | .offset = this_off, | ||
| 115 | }; | ||
| 116 | } else { | ||
| 117 | cursor.* -= 1; | ||
| 118 | return .{ | ||
| 119 | .code = 0xfffd, | ||
| 120 | .len = 2, | ||
| 121 | .offset = this_off, | ||
| 122 | }; | ||
| 123 | } | ||
| 124 | } | ||
| 125 | byte = bytes[cursor.*]; | ||
| 126 | class = @intCast(u8dfa[byte]); | ||
| 127 | st = state_dfa[st + class]; | ||
| 128 | rune = (byte & 0x3f) | (rune << 6); | ||
| 129 | cursor.* += 1; | ||
| 130 | if (st == RUNE_REJECT) { | ||
| 131 | @branchHint(.cold); | ||
| 132 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | ||
| 133 | return .{ | ||
| 134 | .code = 0xfffd, | ||
| 135 | .len = 4, | ||
| 136 | .offset = this_off, | ||
| 137 | }; | ||
| 138 | } else { | ||
| 139 | cursor.* -= 1; | ||
| 140 | return .{ | ||
| 141 | .code = 0xfffd, | ||
| 142 | .len = 3, | ||
| 143 | .offset = this_off, | ||
| 144 | }; | ||
| 145 | } | ||
| 146 | } | ||
| 147 | assert(st == RUNE_ACCEPT); | ||
| 148 | return .{ | ||
| 149 | .code = @intCast(rune), | ||
| 150 | .len = 4, | ||
| 151 | .offset = this_off, | ||
| 67 | }; | 152 | }; |
| 68 | |||
| 69 | return cp; | ||
| 70 | } | 153 | } |
| 71 | 154 | ||
| 72 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | 155 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| @@ -75,14 +158,7 @@ pub const Iterator = struct { | |||
| 75 | i: u32 = 0, | 158 | i: u32 = 0, |
| 76 | 159 | ||
| 77 | pub fn next(self: *Iterator) ?CodePoint { | 160 | pub fn next(self: *Iterator) ?CodePoint { |
| 78 | if (self.i >= self.bytes.len) return null; | 161 | return decodeAtCursor(self.bytes, &self.i); |
| 79 | |||
| 80 | const res = decode(self.bytes[self.i..], self.i); | ||
| 81 | if (res) |cp| { | ||
| 82 | self.i += cp.len; | ||
| 83 | } | ||
| 84 | |||
| 85 | return res; | ||
| 86 | } | 162 | } |
| 87 | 163 | ||
| 88 | pub fn peek(self: *Iterator) ?CodePoint { | 164 | pub fn peek(self: *Iterator) ?CodePoint { |
| @@ -92,6 +168,74 @@ pub const Iterator = struct { | |||
| 92 | } | 168 | } |
| 93 | }; | 169 | }; |
| 94 | 170 | ||
| 171 | // A fast DFA decoder for UTF-8 | ||
| 172 | // | ||
| 173 | // The algorithm used aims to be optimal, without involving SIMD, this | ||
| 174 | // strikes a balance between portability and efficiency. That is done | ||
| 175 | // by using a DFA, represented as a few lookup tables, to track state, | ||
| 176 | // encoding valid transitions between bytes, arriving at 0 each time a | ||
| 177 | // codepoint is decoded. In the process it builds up the value of the | ||
| 178 | // codepoint in question. | ||
| 179 | // | ||
| 180 | // The virtue of such an approach is low branching factor, achieved at | ||
| 181 | // a modest cost of storing the tables. An embedded system might want | ||
| 182 | // to use a more familiar decision graph based on switches, but modern | ||
| 183 | // hosted environments can well afford the space, and may appreciate a | ||
| 184 | // speed increase in exchange. | ||
| 185 | // | ||
| 186 | // Credit for the algorithm goes to Björn Höhrmann, who wrote it up at | ||
| 187 | // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ . The original | ||
| 188 | // license may be found in the ./credits folder. | ||
| 189 | // | ||
| 190 | |||
| 191 | /// Successful codepoint parse | ||
| 192 | const RUNE_ACCEPT = 0; | ||
| 193 | |||
| 194 | /// Error state | ||
| 195 | const RUNE_REJECT = 12; | ||
| 196 | |||
| 197 | /// Byte transitions: value to class | ||
| 198 | const u8dfa: [256]u8 = .{ | ||
| 199 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f | ||
| 200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f | ||
| 201 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f | ||
| 202 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f | ||
| 203 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f | ||
| 204 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf | ||
| 205 | 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df | ||
| 206 | 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef | ||
| 207 | 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff | ||
| 208 | }; | ||
| 209 | |||
| 210 | /// State transition: state + class = new state | ||
| 211 | const state_dfa: [108]u8 = .{ | ||
| 212 | 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // 0 (RUNE_ACCEPT) | ||
| 213 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 12 (RUNE_REJECT) | ||
| 214 | 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // 24 | ||
| 215 | 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // 32 | ||
| 216 | 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // 48 | ||
| 217 | 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // 60 | ||
| 218 | 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 72 | ||
| 219 | 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // 84 | ||
| 220 | 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 96 | ||
| 221 | }; | ||
| 222 | |||
| 223 | /// State masks | ||
| 224 | const class_mask: [12]u8 = .{ | ||
| 225 | 0xff, | ||
| 226 | 0, | ||
| 227 | 0b0011_1111, | ||
| 228 | 0b0001_1111, | ||
| 229 | 0b0000_1111, | ||
| 230 | 0b0000_0111, | ||
| 231 | 0b0000_0011, | ||
| 232 | 0, | ||
| 233 | 0, | ||
| 234 | 0, | ||
| 235 | 0, | ||
| 236 | 0, | ||
| 237 | }; | ||
| 238 | |||
| 95 | test "decode" { | 239 | test "decode" { |
| 96 | const bytes = "🌩️"; | 240 | const bytes = "🌩️"; |
| 97 | const res = decode(bytes, 0); | 241 | const res = decode(bytes, 0); |
| @@ -120,8 +264,8 @@ test "overlongs" { | |||
| 120 | const bytes = "\xC0\xAF"; | 264 | const bytes = "\xC0\xAF"; |
| 121 | const res = decode(bytes, 0); | 265 | const res = decode(bytes, 0); |
| 122 | if (res) |cp| { | 266 | if (res) |cp| { |
| 123 | try testing.expectEqual(@as(u21, '/'), cp.code); | 267 | try testing.expectEqual(0xfffd, cp.code); |
| 124 | try testing.expectEqual(2, cp.len); | 268 | try testing.expectEqual(1, cp.len); |
| 125 | } else { | 269 | } else { |
| 126 | try testing.expect(false); | 270 | try testing.expect(false); |
| 127 | } | 271 | } |
| @@ -129,3 +273,4 @@ test "overlongs" { | |||
| 129 | 273 | ||
| 130 | const std = @import("std"); | 274 | const std = @import("std"); |
| 131 | const testing = std.testing; | 275 | const testing = std.testing; |
| 276 | const assert = std.debug.assert; | ||