diff options
| -rw-r--r-- | src/code_point.zig | 136 |
1 files changed, 79 insertions, 57 deletions
diff --git a/src/code_point.zig b/src/code_point.zig index 2f2e80f..13e38bf 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -8,74 +8,83 @@ pub const CodePoint = struct { | |||
| 8 | offset: u32, | 8 | offset: u32, |
| 9 | }; | 9 | }; |
| 10 | 10 | ||
| 11 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | 11 | /// given a small slice of a string, decode the corresponding codepoint |
| 12 | pub const Iterator = struct { | 12 | pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { |
| 13 | bytes: []const u8, | 13 | // EOS fast path |
| 14 | i: u32 = 0, | 14 | if (bytes.len == 0) { |
| 15 | 15 | return null; | |
| 16 | pub fn next(self: *Iterator) ?CodePoint { | 16 | } |
| 17 | if (self.i >= self.bytes.len) return null; | ||
| 18 | |||
| 19 | if (self.bytes[self.i] < 128) { | ||
| 20 | // ASCII fast path | ||
| 21 | defer self.i += 1; | ||
| 22 | 17 | ||
| 23 | return .{ | 18 | // ASCII fast path |
| 24 | .code = self.bytes[self.i], | 19 | if (bytes[0] < 128) { |
| 25 | .len = 1, | 20 | return .{ |
| 26 | .offset = self.i, | 21 | .code = bytes[0], |
| 27 | }; | 22 | .len = 1, |
| 28 | } | 23 | .offset = offset, |
| 24 | }; | ||
| 25 | } | ||
| 29 | 26 | ||
| 30 | var cp = CodePoint{ | 27 | var cp = CodePoint{ |
| 31 | .code = undefined, | 28 | .code = undefined, |
| 32 | .len = switch (self.bytes[self.i]) { | 29 | .len = switch (bytes[0]) { |
| 33 | 0b1100_0000...0b1101_1111 => 2, | 30 | 0b1100_0000...0b1101_1111 => 2, |
| 34 | 0b1110_0000...0b1110_1111 => 3, | 31 | 0b1110_0000...0b1110_1111 => 3, |
| 35 | 0b1111_0000...0b1111_0111 => 4, | 32 | 0b1111_0000...0b1111_0111 => 4, |
| 36 | else => { | 33 | else => { |
| 37 | defer self.i += 1; | 34 | // unicode replacement code point. |
| 38 | // Unicode replacement code point. | 35 | return .{ |
| 39 | return .{ | 36 | .code = 0xfffd, |
| 40 | .code = 0xfffd, | 37 | .len = 1, |
| 41 | .len = 1, | 38 | .offset = offset, |
| 42 | .offset = self.i, | 39 | }; |
| 43 | }; | ||
| 44 | }, | ||
| 45 | }, | 40 | }, |
| 46 | .offset = self.i, | 41 | }, |
| 42 | .offset = offset, | ||
| 43 | }; | ||
| 44 | |||
| 45 | // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte | ||
| 46 | if (cp.len > bytes.len) { | ||
| 47 | // Unicode replacement code point. | ||
| 48 | return .{ | ||
| 49 | .code = 0xfffd, | ||
| 50 | .len = 1, | ||
| 51 | .offset = offset, | ||
| 47 | }; | 52 | }; |
| 53 | } | ||
| 48 | 54 | ||
| 49 | // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte | 55 | const cp_bytes = bytes[0..cp.len]; |
| 50 | if (self.i + cp.len > self.bytes.len) { | 56 | cp.code = switch (cp.len) { |
| 51 | defer self.i += 1; | 57 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), |
| 52 | // Unicode replacement code point. | ||
| 53 | return .{ | ||
| 54 | .code = 0xfffd, | ||
| 55 | .len = 1, | ||
| 56 | .offset = self.i, | ||
| 57 | }; | ||
| 58 | } | ||
| 59 | 58 | ||
| 60 | const cp_bytes = self.bytes[self.i..][0..cp.len]; | 59 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | |
| 61 | self.i += cp.len; | 60 | (cp_bytes[1] & 0b00111111)) << 6) | |
| 61 | (cp_bytes[2] & 0b00111111), | ||
| 62 | 62 | ||
| 63 | cp.code = switch (cp.len) { | 63 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | |
| 64 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), | 64 | (cp_bytes[1] & 0b00111111)) << 6) | |
| 65 | (cp_bytes[2] & 0b00111111)) << 6) | | ||
| 66 | (cp_bytes[3] & 0b00111111), | ||
| 65 | 67 | ||
| 66 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | | 68 | else => @panic("CodePointIterator.next invalid code point length."), |
| 67 | (cp_bytes[1] & 0b00111111)) << 6) | | 69 | }; |
| 68 | (cp_bytes[2] & 0b00111111), | ||
| 69 | 70 | ||
| 70 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | | 71 | return cp; |
| 71 | (cp_bytes[1] & 0b00111111)) << 6) | | 72 | } |
| 72 | (cp_bytes[2] & 0b00111111)) << 6) | | ||
| 73 | (cp_bytes[3] & 0b00111111), | ||
| 74 | 73 | ||
| 75 | else => @panic("CodePointIterator.next invalid code point length."), | 74 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| 76 | }; | 75 | pub const Iterator = struct { |
| 76 | bytes: []const u8, | ||
| 77 | i: u32 = 0, | ||
| 78 | |||
| 79 | pub fn next(self: *Iterator) ?CodePoint { | ||
| 80 | if (self.i >= self.bytes.len) return null; | ||
| 81 | |||
| 82 | const res = decode(self.bytes[self.i..], self.i); | ||
| 83 | if (res) |cp| { | ||
| 84 | self.i += cp.len; | ||
| 85 | } | ||
| 77 | 86 | ||
| 78 | return cp; | 87 | return res; |
| 79 | } | 88 | } |
| 80 | 89 | ||
| 81 | pub fn peek(self: *Iterator) ?CodePoint { | 90 | pub fn peek(self: *Iterator) ?CodePoint { |
| @@ -85,6 +94,19 @@ pub const Iterator = struct { | |||
| 85 | } | 94 | } |
| 86 | }; | 95 | }; |
| 87 | 96 | ||
| 97 | test "decode" { | ||
| 98 | const bytes = "🌩️"; | ||
| 99 | const res = decode(bytes, 0); | ||
| 100 | |||
| 101 | if (res) |cp| { | ||
| 102 | try std.testing.expectEqual(@as(u21, 0x1F329), cp.code); | ||
| 103 | try std.testing.expectEqual(4, cp.len); | ||
| 104 | } else { | ||
| 105 | // shouldn't have failed to return | ||
| 106 | try std.testing.expect(false); | ||
| 107 | } | ||
| 108 | } | ||
| 109 | |||
| 88 | test "peek" { | 110 | test "peek" { |
| 89 | var iter = Iterator{ .bytes = "Hi" }; | 111 | var iter = Iterator{ .bytes = "Hi" }; |
| 90 | 112 | ||