summaryrefslogtreecommitdiff
path: root/src/code_point.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/code_point.zig')
-rw-r--r--src/code_point.zig85
1 files changed, 85 insertions, 0 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
new file mode 100644
index 0000000..ac37562
--- /dev/null
+++ b/src/code_point.zig
@@ -0,0 +1,85 @@
1const std = @import("std");
2
/// `CodePoint` represents a Unicode code point by its code,
/// length, and offset in the source bytes.
pub const CodePoint = struct {
    /// Decoded value. `0xfffd` (the replacement character) is used when
    /// the bytes at `offset` could not be decoded.
    code: u21,
    /// Number of source bytes consumed for this code point (1 to 4).
    len: u3,
    /// Byte index in the source at which this code point starts.
    offset: u32,
};

/// `Iterator` iterates a string one `CodePoint` at-a-time.
///
/// The input is treated as UTF-8. Bytes that cannot start a sequence, and
/// lead bytes whose sequence runs past the end of the input, are reported
/// as U+FFFD with `len == 1`, so iteration always makes progress and never
/// reads out of bounds. Continuation bytes are not validated: their low six
/// bits are used as-is.
pub const Iterator = struct {
    /// Source bytes being iterated; not owned by the iterator.
    bytes: []const u8,
    /// Byte index of the next code point to decode.
    i: u32 = 0,

    /// Decodes the code point at the current position and advances past it.
    /// Returns null once every byte has been consumed.
    pub fn next(self: *Iterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const start = self.i;
        const lead = self.bytes[start];

        // ASCII fast path
        if (lead < 128) {
            self.i += 1;
            return .{ .code = lead, .len = 1, .offset = start };
        }

        // The lead byte announces how many bytes the sequence occupies.
        const len: u3 = switch (lead) {
            0b1100_0000...0b1101_1111 => 2,
            0b1110_0000...0b1110_1111 => 3,
            0b1111_0000...0b1111_0111 => 4,
            // Stray continuation byte or out-of-range lead byte.
            else => return self.replacement(),
        };

        // A sequence cut short by the end of the input cannot be sliced
        // safely; report a replacement and consume only the lead byte.
        if (self.bytes.len - start < len) return self.replacement();

        const cp_bytes = self.bytes[start..][0..len];
        self.i += len;

        const code: u21 = switch (len) {
            2 => (@as(u21, cp_bytes[0] & 0b0001_1111) << 6) |
                (cp_bytes[1] & 0b0011_1111),

            3 => (((@as(u21, cp_bytes[0] & 0b0000_1111) << 6) |
                (cp_bytes[1] & 0b0011_1111)) << 6) |
                (cp_bytes[2] & 0b0011_1111),

            4 => (((((@as(u21, cp_bytes[0] & 0b0000_0111) << 6) |
                (cp_bytes[1] & 0b0011_1111)) << 6) |
                (cp_bytes[2] & 0b0011_1111)) << 6) |
                (cp_bytes[3] & 0b0011_1111),

            else => @panic("CodePointIterator.next invalid code point length."),
        };

        return .{ .code = code, .len = len, .offset = start };
    }

    /// Returns the code point `next` would return, without advancing.
    pub fn peek(self: *Iterator) ?CodePoint {
        const saved_i = self.i;
        defer self.i = saved_i;
        return self.next();
    }

    /// Consumes a single byte and reports it as U+FFFD.
    fn replacement(self: *Iterator) CodePoint {
        const at = self.i;
        self.i += 1;
        return .{ .code = 0xfffd, .len = 1, .offset = at };
    }
};
76
test "peek" {
    const expectEqual = std.testing.expectEqual;
    var it: Iterator = .{ .bytes = "Hi" };

    // Consume 'H'; the cursor now sits on 'i'.
    try expectEqual(@as(u21, 'H'), it.next().?.code);

    // Peeking reports 'i' without consuming it, so the following `next`
    // must yield the same code point.
    try expectEqual(@as(u21, 'i'), it.peek().?.code);
    try expectEqual(@as(u21, 'i'), it.next().?.code);

    // At the end of the input both `peek` and `next` return null.
    const none: ?CodePoint = null;
    try expectEqual(none, it.peek());
    try expectEqual(none, it.next());
}