summaryrefslogtreecommitdiff
path: root/src/CodePoint.zig
blob: e72823b13fd6e40f310bd1bbf27350753201fc85 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.

const std = @import("std");

/// The code point's numeric value.
code: u21,
/// Number of source bytes the code point occupies (1 for ASCII, otherwise 2, 3, or 4
/// as selected by the lead byte).
len: u3,
/// Index into the source bytes at which the code point starts.
offset: usize,

// This file is itself the `CodePoint` struct; the alias lets declarations below name it.
const CodePoint = @This();

/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
///
/// `bytes` is expected to hold UTF-8. Malformed input never panics or reads out of
/// bounds: a byte that cannot start a sequence, or a multi-byte sequence cut short by
/// the end of `bytes`, is reported as a single U+FFFD `CodePoint` with `len == 1`, and
/// iteration resumes at the following byte. Continuation bytes are not validated; only
/// their low six bits are used.
pub const CodePointIterator = struct {
    bytes: []const u8,
    i: usize = 0,

    /// U+FFFD REPLACEMENT CHARACTER, yielded in place of malformed input.
    const replacement_code: u21 = 0xFFFD;

    /// Returns the `CodePoint` starting at the current position and advances past it,
    /// or returns null once all of `bytes` has been consumed.
    pub fn next(self: *CodePointIterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const start = self.i;
        const lead = self.bytes[start];

        if (lead < 128) {
            // ASCII fast path
            self.i += 1;
            return CodePoint{ .code = lead, .len = 1, .offset = start };
        }

        const len: u3 = switch (lead) {
            0b1100_0000...0b1101_1111 => 2,
            0b1110_0000...0b1110_1111 => 3,
            0b1111_0000...0b1111_0111 => 4,
            // Stray continuation byte (0x80-0xBF) or a byte that is never a UTF-8 lead.
            else => return self.replaceByte(),
        };

        // The lead byte promises more bytes than remain; slicing would run past the end.
        if (self.bytes.len - start < len) return self.replaceByte();

        // Payload bits carried by the lead byte shrink as the sequence gets longer.
        const lead_mask: u8 = switch (len) {
            2 => 0b0001_1111,
            3 => 0b0000_1111,
            else => 0b0000_0111,
        };

        // Each continuation byte contributes its low six bits.
        var code: u21 = lead & lead_mask;
        for (self.bytes[start + 1 .. start + len]) |byte| {
            code = (code << 6) | (byte & 0b0011_1111);
        }

        self.i = start + len;
        return CodePoint{ .code = code, .len = len, .offset = start };
    }

    /// Returns what `next` would return, leaving the iterator position unchanged.
    pub fn peek(self: *CodePointIterator) ?CodePoint {
        const saved_i = self.i;
        defer self.i = saved_i;
        return self.next();
    }

    /// Consumes exactly one byte and reports it as a replacement `CodePoint`.
    fn replaceByte(self: *CodePointIterator) CodePoint {
        const at = self.i;
        self.i += 1;
        return CodePoint{ .code = replacement_code, .len = 1, .offset = at };
    }
};

test "CodePointIterator peek" {
    const expectEqual = std.testing.expectEqual;
    var it = CodePointIterator{ .bytes = "Hi" };

    // Consuming the first code point advances the iterator.
    const first = it.next().?;
    try expectEqual(@as(u21, 'H'), first.code);

    // Peeking reports the upcoming code point without consuming it...
    const peeked = it.peek().?;
    try expectEqual(@as(u21, 'i'), peeked.code);

    // ...so the following `next` yields that same code point.
    const second = it.next().?;
    try expectEqual(@as(u21, 'i'), second.code);

    // Once the input is exhausted, both calls report null.
    try expectEqual(@as(?CodePoint, null), it.peek());
    try expectEqual(@as(?CodePoint, null), it.next());
}

/// `readCodePoint` returns the next code point code as a `u21` in the given reader, or null at end-of-input.
///
/// `reader` must provide `readByte` and `readAll`. Returns `error.InvalidUtf8` when the
/// first byte cannot start a UTF-8 sequence or when the input ends in the middle of a
/// multi-byte sequence; any other reader error is propagated unchanged. Continuation
/// bytes are not validated; only their low six bits are used.
pub fn readCodePoint(reader: anytype) !?u21 {
    const lead = reader.readByte() catch |err| switch (err) {
        error.EndOfStream => return null,
        else => return err,
    };

    if (lead < 128) return @as(u21, lead);

    const len: u3 = switch (lead) {
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        else => return error.InvalidUtf8,
    };

    var tail_buf: [3]u8 = undefined;
    const tail = tail_buf[0 .. len - 1];

    // A single `read` may return fewer bytes than requested even when more are coming
    // (pipes, sockets, buffered sources). `readAll` keeps reading until the slice is
    // full or the stream ends, so a short count here really means truncated input.
    const filled = try reader.readAll(tail);
    if (filled < tail.len) return error.InvalidUtf8;

    // Payload bits carried by the lead byte shrink as the sequence gets longer.
    const lead_mask: u8 = switch (len) {
        2 => 0b0001_1111,
        3 => 0b0000_1111,
        else => 0b0000_0111,
    };

    // Each continuation byte contributes its low six bits.
    var code: u21 = lead & lead_mask;
    for (tail) |byte| {
        code = (code << 6) | (byte & 0b0011_1111);
    }

    return code;
}

test "readCodePoint" {
    var bytes = "abé😹".*;
    var stream = std.io.fixedBufferStream(&bytes);
    const reader = stream.reader();

    // One-, two-, and four-byte encodings, in the order they appear in the input.
    const expected = [_]u21{ 'a', 'b', 'é', '😹' };
    for (expected) |want| {
        const got = (try readCodePoint(reader)).?;
        try std.testing.expectEqual(want, got);
    }

    // Exhausted input reports null rather than an error.
    try std.testing.expectEqual(@as(?u21, null), try readCodePoint(reader));
}