1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
/// `CodePoint` represents a Unicode code point by its code,
/// length, and offset in the source bytes.
pub const CodePoint = struct {
    /// Decoded value, or U+FFFD when `decode` met malformed input.
    code: u21,
    /// Number of source bytes consumed for this code point (1 to 4).
    len: u3,
    /// The `offset` argument that was handed to `decode`, unchanged.
    offset: u32,
};

/// Decodes the UTF-8 sequence found at the start of `bytes`.
///
/// `offset` is never used for indexing; it is copied verbatim into the
/// result so the caller can record where `bytes` begins in a larger buffer.
///
/// Returns `null` only when `bytes` is empty. Malformed input yields the
/// replacement code point U+FFFD with `len == 1`, so the caller resumes
/// decoding at the very next byte. Input counts as malformed when:
///   * the first byte is not a valid lead byte,
///   * `bytes` ends before the announced sequence length, or
///   * a trailing byte is not a continuation byte (`0b10xx_xxxx`).
///
/// Overlong encodings, surrogates and values above U+10FFFF are not
/// rejected by this function.
pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
    // End-of-input fast path.
    if (bytes.len == 0) return null;

    const lead = bytes[0];

    // ASCII fast path.
    if (lead < 0x80) {
        return .{ .code = lead, .len = 1, .offset = offset };
    }

    // Every malformed case answers with U+FFFD and consumes a single byte.
    const replacement: CodePoint = .{ .code = 0xfffd, .len = 1, .offset = offset };

    // The lead byte announces the total sequence length.
    const len: u3 = switch (lead) {
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        else => return replacement,
    };

    // Not enough bytes left for a complete code point.
    if (len > bytes.len) return replacement;

    // Start with the payload bits of the lead byte.
    var code: u21 = switch (len) {
        2 => lead & 0b0001_1111,
        3 => lead & 0b0000_1111,
        4 => lead & 0b0000_0111,
        else => @panic("CodePointIterator.next invalid code point length."),
    };

    // Fold in six payload bits per trailing byte. A byte that lacks the
    // `10` prefix belongs to the next character, so it must not be
    // swallowed into this one.
    for (bytes[1..len]) |byte| {
        if ((byte & 0b1100_0000) != 0b1000_0000) return replacement;
        code = (code << 6) | (byte & 0b0011_1111);
    }

    return .{ .code = code, .len = len, .offset = offset };
}
/// `Iterator` walks `bytes` and hands back one `CodePoint` per call.
pub const Iterator = struct {
    /// The bytes being iterated.
    bytes: []const u8,
    /// Byte index into `bytes` where the next decode starts.
    i: u32 = 0,

    /// Decodes the code point at the cursor and moves the cursor past it.
    /// Returns `null` once the cursor has reached the end of `bytes`.
    pub fn next(self: *Iterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const cp = decode(self.bytes[self.i..], self.i) orelse return null;
        self.i += cp.len;
        return cp;
    }

    /// Returns what `next` would return, leaving the cursor where it was.
    pub fn peek(self: *Iterator) ?CodePoint {
        const start = self.i;
        const upcoming = self.next();
        // Undo the advance performed by `next`.
        self.i = start;
        return upcoming;
    }
};
test "decode" {
    const expectEqual = std.testing.expectEqual;

    // Only the first code point of the literal is examined. A null result
    // means decoding unexpectedly produced nothing, which fails the test.
    const cp = decode("🌩️", 0) orelse return error.TestUnexpectedResult;

    try expectEqual(@as(u21, 0x1F329), cp.code);
    try expectEqual(4, cp.len);
}
test "peek" {
    const expectEqual = std.testing.expectEqual;
    const none: ?CodePoint = null;

    var it: Iterator = .{ .bytes = "Hi" };

    try expectEqual(@as(u21, 'H'), it.next().?.code);
    // `peek` reports the upcoming code point without consuming it.
    try expectEqual(@as(u21, 'i'), it.peek().?.code);
    try expectEqual(@as(u21, 'i'), it.next().?.code);
    // With the input exhausted, both calls yield null.
    try expectEqual(none, it.peek());
    try expectEqual(none, it.next());
}
test "overlongs" {
    // "\xC0\xAF" is an overlong two-byte encoding of '/'. A strict decoder
    // would reject it, so ideally this test should not pass; for now it
    // pins the current lenient result.
    const cp = decode("\xC0\xAF", 0) orelse return error.TestUnexpectedResult;

    try testing.expectEqual(@as(u21, '/'), cp.code);
    try testing.expectEqual(2, cp.len);
}
const std = @import("std");
const testing = std.testing;
|