diff options
Diffstat (limited to 'src/CodePoint.zig')
| -rw-r--r-- | src/CodePoint.zig | 84 |
1 files changed, 0 insertions, 84 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig deleted file mode 100644 index 62dd793..0000000 --- a/src/CodePoint.zig +++ /dev/null | |||
| @@ -1,84 +0,0 @@ | |||
| 1 | //! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes. | ||
| 2 | |||
| 3 | const std = @import("std"); | ||
| 4 | |||
/// Decoded code point value; U+FFFD when the source bytes were malformed.
code: u21,
/// Number of source bytes this code point occupies (1 to 4).
len: u3,
/// Byte index in the source at which this code point starts.
offset: usize,

const CodePoint = @This();

/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
pub const CodePointIterator = struct {
    /// Bytes being decoded as UTF-8; the iterator only reads them.
    bytes: []const u8,
    /// Byte index of the next code point to decode.
    i: usize = 0,

    /// Unicode replacement code point, emitted for malformed input.
    const replacement_code: u21 = 0xfffd;

    /// Decodes the code point at the current position and advances past it.
    ///
    /// Returns `null` once every byte has been consumed. Malformed input
    /// (an invalid lead byte, a sequence cut short by the end of `bytes`,
    /// or a non-continuation byte inside a sequence) yields U+FFFD with
    /// `len == 1` and advances by a single byte, so decoding resumes at the
    /// very next byte and no input is ever skipped or read out of bounds.
    /// Overlong encodings and surrogate values are not rejected.
    pub fn next(self: *CodePointIterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const start = self.i;
        const lead = self.bytes[start];

        if (lead < 0x80) {
            // ASCII fast path
            self.i = start + 1;
            return .{ .code = lead, .len = 1, .offset = start };
        }

        const len: u3 = switch (lead) {
            0b1100_0000...0b1101_1111 => 2,
            0b1110_0000...0b1110_1111 => 3,
            0b1111_0000...0b1111_0111 => 4,
            // Stray continuation byte or a lead byte outside UTF-8.
            else => return self.replacementAt(start),
        };

        // The sequence would run past the end of the input: never slice
        // beyond `bytes`, report the truncated lead byte instead.
        if (self.bytes.len - start < len) return self.replacementAt(start);

        // Payload bits carried by the lead byte for each sequence length.
        const lead_mask: u8 = switch (len) {
            2 => 0b0001_1111,
            3 => 0b0000_1111,
            else => 0b0000_0111,
        };

        var code: u21 = lead & lead_mask;
        for (self.bytes[start + 1 .. start + len]) |byte| {
            // Every trailing byte must look like 0b10xx_xxxx.
            if ((byte & 0b1100_0000) != 0b1000_0000) return self.replacementAt(start);
            code = (code << 6) | (byte & 0b0011_1111);
        }

        self.i = start + len;
        return .{ .code = code, .len = len, .offset = start };
    }

    /// Returns what `next` would return without advancing the iterator.
    pub fn peek(self: *CodePointIterator) ?CodePoint {
        const saved_i = self.i;
        defer self.i = saved_i;
        return self.next();
    }

    /// Consumes the single byte at `start` and reports it as U+FFFD.
    fn replacementAt(self: *CodePointIterator, start: usize) CodePoint {
        self.i = start + 1;
        return .{ .code = replacement_code, .len = 1, .offset = start };
    }
};
| 75 | |||
// Checks that `peek` reports the upcoming code point without consuming it,
// and that both `peek` and `next` return null once the input is exhausted.
test "CodePointIterator peek" {
    const expectEqual = std.testing.expectEqual;
    const nothing: ?CodePoint = null;

    var it = CodePointIterator{ .bytes = "Hi" };

    // Consume 'H'.
    const first = it.next();
    try expectEqual(@as(u21, 'H'), first.?.code);

    // Peeking shows 'i' but leaves it in place for the following `next`.
    const peeked = it.peek();
    try expectEqual(@as(u21, 'i'), peeked.?.code);
    const second = it.next();
    try expectEqual(@as(u21, 'i'), second.?.code);

    // End of input: nothing left to peek at or consume.
    try expectEqual(nothing, it.peek());
    try expectEqual(nothing, it.next());
}