summaryrefslogtreecommitdiff
path: root/src/CodePoint.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/CodePoint.zig')
-rw-r--r--src/CodePoint.zig131
1 files changed, 131 insertions, 0 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig
new file mode 100644
index 0000000..e72823b
--- /dev/null
+++ b/src/CodePoint.zig
@@ -0,0 +1,131 @@
1//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
2
3const std = @import("std");
4
/// The decoded Unicode scalar value.
code: u21,
/// Number of source bytes consumed for this code point (1 to 4).
len: u3,
/// Byte index into the source at which this code point starts.
offset: usize,

const CodePoint = @This();

/// `CodePointIterator` iterates a string one `CodePoint` at a time.
///
/// The input is expected to be UTF-8. Malformed input never panics and never
/// reads out of bounds: a byte that cannot start a sequence, or a multi-byte
/// sequence cut short by the end of `bytes`, is reported as U+FFFD
/// (REPLACEMENT CHARACTER) with `len == 1`, and iteration resumes at the next
/// byte. Continuation bytes are not otherwise validated, and overlong forms or
/// surrogate values are not rejected.
pub const CodePointIterator = struct {
    bytes: []const u8,
    i: usize = 0,

    /// Code point yielded in place of a malformed byte.
    const replacement: u21 = 0xFFFD;

    /// Decodes the code point starting at the current position and advances
    /// past it. Returns null once every byte has been consumed.
    pub fn next(self: *CodePointIterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const start = self.i;
        const lead = self.bytes[start];

        if (lead < 128) {
            // ASCII fast path
            self.i += 1;
            return CodePoint{ .code = lead, .len = 1, .offset = start };
        }

        // The lead byte's high bits announce the sequence length.
        const len: u3 = switch (lead) {
            0b1100_0000...0b1101_1111 => 2,
            0b1110_0000...0b1110_1111 => 3,
            0b1111_0000...0b1111_0111 => 4,
            // Stray continuation byte or 0xF8..0xFF: not a valid start byte.
            else => return self.invalid(),
        };

        // Guard the slice below: the input may end in the middle of a sequence.
        if (self.bytes.len - start < len) return self.invalid();

        const cp_bytes = self.bytes[start..][0..len];
        self.i += len;

        const code: u21 = switch (len) {
            2 => (@as(u21, cp_bytes[0] & 0b0001_1111) << 6) |
                (cp_bytes[1] & 0b0011_1111),

            3 => (((@as(u21, cp_bytes[0] & 0b0000_1111) << 6) |
                (cp_bytes[1] & 0b0011_1111)) << 6) |
                (cp_bytes[2] & 0b0011_1111),

            4 => (((((@as(u21, cp_bytes[0] & 0b0000_0111) << 6) |
                (cp_bytes[1] & 0b0011_1111)) << 6) |
                (cp_bytes[2] & 0b0011_1111)) << 6) |
                (cp_bytes[3] & 0b0011_1111),

            // `len` was assigned from the three-way switch above.
            else => unreachable,
        };

        return CodePoint{ .code = code, .len = len, .offset = start };
    }

    /// Returns what `next` would return without advancing the iterator.
    pub fn peek(self: *CodePointIterator) ?CodePoint {
        const saved_i = self.i;
        defer self.i = saved_i;
        return self.next();
    }

    /// Consumes exactly one byte and reports it as U+FFFD, so malformed input
    /// always makes progress.
    fn invalid(self: *CodePointIterator) CodePoint {
        const at = self.i;
        self.i += 1;
        return CodePoint{ .code = replacement, .len = 1, .offset = at };
    }
};
72
test "CodePointIterator peek" {
    const expectEqual = std.testing.expectEqual;
    const no_code_point: ?CodePoint = null;

    var it = CodePointIterator{ .bytes = "Hi" };

    // Consuming the first code point moves the iterator forward.
    const first = it.next().?;
    try expectEqual(@as(u21, 'H'), first.code);

    // Peeking shows the upcoming code point without consuming it...
    const peeked = it.peek().?;
    try expectEqual(@as(u21, 'i'), peeked.code);

    // ...so the following `next` yields that same code point.
    const second = it.next().?;
    try expectEqual(@as(u21, 'i'), second.code);

    // Once the input is exhausted both calls report null.
    try expectEqual(no_code_point, it.peek());
    try expectEqual(no_code_point, it.next());
}
82
/// `readCodePoint` returns the next code point code as a `u21` in the given reader, or null at end-of-input.
///
/// `reader` must provide `readByte` (failing with `error.EndOfStream` at
/// end-of-input) and `read`. Returns `error.InvalidUtf8` when the first byte
/// cannot start a UTF-8 sequence or when the stream ends before the sequence
/// is complete; any other reader error is propagated unchanged. Continuation
/// bytes are not validated beyond being present.
pub fn readCodePoint(reader: anytype) !?u21 {
    var buf: [4]u8 = undefined;

    buf[0] = reader.readByte() catch |err| switch (err) {
        error.EndOfStream => return null,
        else => return err,
    };

    // ASCII fast path
    if (buf[0] < 128) return @as(u21, buf[0]);

    // The lead byte's high bits announce the sequence length.
    const len: u3 = switch (buf[0]) {
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        else => return error.InvalidUtf8,
    };

    // A single `read` may deliver fewer bytes than requested without the
    // stream having ended, so keep reading until the sequence is complete.
    // Only a read of zero bytes means the input really stopped short.
    var filled: usize = 1;
    while (filled < len) {
        const got = try reader.read(buf[filled..len]);
        if (got == 0) return error.InvalidUtf8;
        filled += got;
    }

    return switch (len) {
        2 => (@as(u21, buf[0] & 0b0001_1111) << 6) |
            (buf[1] & 0b0011_1111),

        3 => (((@as(u21, buf[0] & 0b0000_1111) << 6) |
            (buf[1] & 0b0011_1111)) << 6) |
            (buf[2] & 0b0011_1111),

        4 => (((((@as(u21, buf[0] & 0b0000_0111) << 6) |
            (buf[1] & 0b0011_1111)) << 6) |
            (buf[2] & 0b0011_1111)) << 6) |
            (buf[3] & 0b0011_1111),

        // `len` was assigned from the three-way switch above.
        else => unreachable,
    };
}
120
test "readCodePoint" {
    // Copy the literal into a mutable array so a fixed buffer stream can wrap it.
    var input = "abé😹".*;
    var stream = std.io.fixedBufferStream(&input);
    const reader = stream.reader();

    // One entry per encoded length: two 1-byte, one 2-byte, one 4-byte.
    const expected = [_]u21{ 'a', 'b', 'é', '😹' };
    for (expected) |want| {
        const got = (try readCodePoint(reader)).?;
        try std.testing.expectEqual(want, got);
    }

    // The stream is now exhausted, which is reported as null rather than an error.
    const at_end: ?u21 = try readCodePoint(reader);
    try std.testing.expectEqual(@as(?u21, null), at_end);
}