summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/code_point.zig136
1 files changed, 79 insertions, 57 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
index 2f2e80f..13e38bf 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -8,74 +8,83 @@ pub const CodePoint = struct {
8 offset: u32, 8 offset: u32,
9}; 9};
10 10
11/// `Iterator` iterates a string one `CodePoint` at-a-time. 11/// given a small slice of a string, decode the corresponding codepoint
12pub const Iterator = struct { 12pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
13 bytes: []const u8, 13 // EOS fast path
14 i: u32 = 0, 14 if (bytes.len == 0) {
15 15 return null;
16 pub fn next(self: *Iterator) ?CodePoint { 16 }
17 if (self.i >= self.bytes.len) return null;
18
19 if (self.bytes[self.i] < 128) {
20 // ASCII fast path
21 defer self.i += 1;
22 17
23 return .{ 18 // ASCII fast path
24 .code = self.bytes[self.i], 19 if (bytes[0] < 128) {
25 .len = 1, 20 return .{
26 .offset = self.i, 21 .code = bytes[0],
27 }; 22 .len = 1,
28 } 23 .offset = offset,
24 };
25 }
29 26
30 var cp = CodePoint{ 27 var cp = CodePoint{
31 .code = undefined, 28 .code = undefined,
32 .len = switch (self.bytes[self.i]) { 29 .len = switch (bytes[0]) {
33 0b1100_0000...0b1101_1111 => 2, 30 0b1100_0000...0b1101_1111 => 2,
34 0b1110_0000...0b1110_1111 => 3, 31 0b1110_0000...0b1110_1111 => 3,
35 0b1111_0000...0b1111_0111 => 4, 32 0b1111_0000...0b1111_0111 => 4,
36 else => { 33 else => {
37 defer self.i += 1; 34 // unicode replacement code point.
38 // Unicode replacement code point. 35 return .{
39 return .{ 36 .code = 0xfffd,
40 .code = 0xfffd, 37 .len = 1,
41 .len = 1, 38 .offset = offset,
42 .offset = self.i, 39 };
43 };
44 },
45 }, 40 },
46 .offset = self.i, 41 },
42 .offset = offset,
43 };
44
45 // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte
46 if (cp.len > bytes.len) {
47 // Unicode replacement code point.
48 return .{
49 .code = 0xfffd,
50 .len = 1,
51 .offset = offset,
47 }; 52 };
53 }
48 54
49 // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte 55 const cp_bytes = bytes[0..cp.len];
50 if (self.i + cp.len > self.bytes.len) { 56 cp.code = switch (cp.len) {
51 defer self.i += 1; 57 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
52 // Unicode replacement code point.
53 return .{
54 .code = 0xfffd,
55 .len = 1,
56 .offset = self.i,
57 };
58 }
59 58
60 const cp_bytes = self.bytes[self.i..][0..cp.len]; 59 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
61 self.i += cp.len; 60 (cp_bytes[1] & 0b00111111)) << 6) |
61 (cp_bytes[2] & 0b00111111),
62 62
63 cp.code = switch (cp.len) { 63 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
64 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), 64 (cp_bytes[1] & 0b00111111)) << 6) |
65 (cp_bytes[2] & 0b00111111)) << 6) |
66 (cp_bytes[3] & 0b00111111),
65 67
66 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | 68 else => @panic("CodePointIterator.next invalid code point length."),
67 (cp_bytes[1] & 0b00111111)) << 6) | 69 };
68 (cp_bytes[2] & 0b00111111),
69 70
70 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | 71 return cp;
71 (cp_bytes[1] & 0b00111111)) << 6) | 72}
72 (cp_bytes[2] & 0b00111111)) << 6) |
73 (cp_bytes[3] & 0b00111111),
74 73
75 else => @panic("CodePointIterator.next invalid code point length."), 74/// `Iterator` iterates a string one `CodePoint` at-a-time.
76 }; 75pub const Iterator = struct {
76 bytes: []const u8,
77 i: u32 = 0,
78
79 pub fn next(self: *Iterator) ?CodePoint {
80 if (self.i >= self.bytes.len) return null;
81
82 const res = decode(self.bytes[self.i..], self.i);
83 if (res) |cp| {
84 self.i += cp.len;
85 }
77 86
78 return cp; 87 return res;
79 } 88 }
80 89
81 pub fn peek(self: *Iterator) ?CodePoint { 90 pub fn peek(self: *Iterator) ?CodePoint {
@@ -85,6 +94,19 @@ pub const Iterator = struct {
85 } 94 }
86}; 95};
87 96
97test "decode" {
98 const bytes = "🌩️";
99 const res = decode(bytes, 0);
100
101 if (res) |cp| {
102 try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
103 try std.testing.expectEqual(4, cp.len);
104 } else {
105 // shouldn't have failed to return
106 try std.testing.expect(false);
107 }
108}
109
88test "peek" { 110test "peek" {
89 var iter = Iterator{ .bytes = "Hi" }; 111 var iter = Iterator{ .bytes = "Hi" };
90 112