1 files changed, 131 insertions, 0 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig
new file mode 100644
index 0000000..e72823b
--- /dev/null
+++ b/src/CodePoint.zig
@@ -0,0 +1,131 @@
+//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
+const std = @import("std");
+code: u21, // The code point's numeric value.
+len: u3, // Number of bytes the code point occupies in the source bytes (`CodePointIterator.next` sets 1 through 4).
+offset: usize, // Byte index in the source bytes at which the code point starts.
+const CodePoint = @This(); // Names the struct this file defines, i.e. the three fields above.
+/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
+///
+/// `bytes` is expected to hold well-formed UTF-8: `next` panics on an invalid
+/// start byte or a truncated trailing sequence, and it neither validates
+/// continuation bytes nor rejects overlong encodings.
+pub const CodePointIterator = struct {
+    /// The UTF-8 encoded bytes being iterated.
+    bytes: []const u8,
+    /// Byte offset into `bytes` of the next code point to decode.
+    i: usize = 0,
+    /// Decodes the code point at the current offset, advances past it, and
+    /// returns it. Returns null once all of `bytes` has been consumed.
+    ///
+    /// Panics if the byte at the current offset cannot start a UTF-8 sequence
+    /// or if the sequence it announces extends past the end of `bytes`.
+    pub fn next(self: *CodePointIterator) ?CodePoint {
+        if (self.i >= self.bytes.len) return null;
+        const start = self.i;
+        const lead = self.bytes[start];
+        if (lead < 128) {
+            // ASCII fast path
+            self.i += 1;
+            return CodePoint{ .code = lead, .len = 1, .offset = start };
+        }
+        // The high bits of the lead byte announce the sequence length.
+        const len: u3 = switch (lead) {
+            0b1100_0000...0b1101_1111 => 2,
+            0b1110_0000...0b1110_1111 => 3,
+            0b1111_0000...0b1111_0111 => 4,
+            else => @panic("CodePointIterator.next: Invalid code point start byte."),
+        };
+        // Check the bounds explicitly so a truncated trailing sequence fails
+        // the same way in every build mode; slicing past the end is only
+        // caught when runtime safety is enabled.
+        if (self.bytes.len - start < len) {
+            @panic("CodePointIterator.next: Truncated code point sequence.");
+        }
+        const cp_bytes = self.bytes[start..][0..len];
+        self.i += len;
+        // Payload bits of the lead byte: the longer the sequence, the fewer it carries.
+        var code: u21 = switch (len) {
+            2 => lead & 0b0001_1111,
+            3 => lead & 0b0000_1111,
+            4 => lead & 0b0000_0111,
+            else => unreachable, // `len` was assigned 2, 3, or 4 above.
+        };
+        // Each continuation byte contributes its low six bits.
+        for (cp_bytes[1..]) |byte| {
+            code = (code << 6) | (byte & 0b0011_1111);
+        }
+        return CodePoint{ .code = code, .len = len, .offset = start };
+    }
+    /// Returns what `next` would return, then restores the offset so the
+    /// iterator does not advance.
+    pub fn peek(self: *CodePointIterator) ?CodePoint {
+        const saved_i = self.i;
+        defer self.i = saved_i;
+        return self.next();
+    }
+};
+test "CodePointIterator peek" {
+    const expectEqual = std.testing.expectEqual;
+    const no_code_point: ?CodePoint = null;
+    var it = CodePointIterator{ .bytes = "Hi" };
+    // Consume the first code point.
+    try expectEqual(@as(u21, 'H'), it.next().?.code);
+    // Peeking must leave 'i' in place for the `next` that follows.
+    try expectEqual(@as(u21, 'i'), it.peek().?.code);
+    try expectEqual(@as(u21, 'i'), it.next().?.code);
+    // Once the input is exhausted both calls report null.
+    try expectEqual(no_code_point, it.peek());
+    try expectEqual(no_code_point, it.next());
+}
+/// `readCodePoint` returns the next code point code as a `u21` in the given reader, or null at end-of-input.
+///
+/// `reader` must provide `readByte` and `readAll`.
+/// Returns `error.InvalidUtf8` when the first byte cannot start a UTF-8
+/// sequence or when the input ends before the sequence is complete; any other
+/// reader error is passed through unchanged. Continuation bytes are not
+/// validated: only their low six bits are used.
+pub fn readCodePoint(reader: anytype) !?u21 {
+    var buf: [4]u8 = undefined;
+    buf[0] = reader.readByte() catch |err| switch (err) {
+        error.EndOfStream => return null,
+        else => return err,
+    };
+    if (buf[0] < 128) return @as(u21, buf[0]);
+    // The high bits of the lead byte announce the sequence length.
+    const len: u3 = switch (buf[0]) {
+        0b1100_0000...0b1101_1111 => 2,
+        0b1110_0000...0b1110_1111 => 3,
+        0b1111_0000...0b1111_0111 => 4,
+        else => return error.InvalidUtf8,
+    };
+    // `readAll` keeps reading until the slice is full or the input ends. A
+    // single `read` may return fewer bytes while more are still on the way,
+    // which would misreport a valid sequence as invalid.
+    const tail = buf[1..len];
+    const got = try reader.readAll(tail);
+    if (got < tail.len) return error.InvalidUtf8;
+    // Payload bits of the lead byte: the longer the sequence, the fewer it carries.
+    var code: u21 = switch (len) {
+        2 => buf[0] & 0b0001_1111,
+        3 => buf[0] & 0b0000_1111,
+        4 => buf[0] & 0b0000_0111,
+        else => unreachable, // `len` was assigned 2, 3, or 4 above.
+    };
+    // Each continuation byte contributes its low six bits.
+    for (tail) |byte| {
+        code = (code << 6) | (byte & 0b0011_1111);
+    }
+    return code;
+}
+test "readCodePoint" {
+    const expectEqual = std.testing.expectEqual;
+    var buf = "abé😹".*;
+    var stream = std.io.fixedBufferStream(&buf);
+    const reader = stream.reader();
+    // One-, two-, and four-byte sequences, in input order.
+    const expected = [_]u21{ 'a', 'b', 'é', '😹' };
+    for (expected) |code| {
+        try expectEqual(code, (try readCodePoint(reader)).?);
+    }
+    // Nothing is left to read.
+    try expectEqual(@as(?u21, null), try readCodePoint(reader));
+}

diff --git a/src/CodePoint.zig b/src/CodePoint.zig new file mode 100644 index 0000000..e72823b --- /dev/null +++ b/src/CodePoint.zig
@@ -0,0 +1,131 @@
	1	//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
	2
	3	const std = @import("std");
	4
	5	code: u21, // The code point's numeric value.
	6	len: u3, // Number of bytes the code point occupies in the source bytes (`CodePointIterator.next` sets 1 through 4).
	7	offset: usize, // Byte index in the source bytes at which the code point starts.
	8
	9	const CodePoint = @This(); // Names the struct this file defines, i.e. the three fields above.
	10
	11	/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
	12	pub const CodePointIterator = struct {
	13	bytes: []const u8,
	14	i: usize = 0,
	15
	16	pub fn next(self: *CodePointIterator) ?CodePoint {
	17	if (self.i >= self.bytes.len) return null;
	18
	19	if (self.bytes[self.i] < 128) {
	20	// ASCII fast path
	21	const cp = CodePoint{
	22	.code = self.bytes[self.i],
	23	.len = 1,
	24	.offset = self.i,
	25	};
	26
	27	self.i += 1;
	28
	29	return cp;
	30	}
	31
	32	var cp = CodePoint{
	33	.code = undefined,
	34	.len = blk: {
	35	break :blk switch (self.bytes[self.i]) {
	36	0b1100_0000...0b1101_1111 => 2,
	37	0b1110_0000...0b1110_1111 => 3,
	38	0b1111_0000...0b1111_0111 => 4,
	39	else => @panic("CodePointIterator.next: Ivalid code point start byte."),
	40	};
	41	},
	42	.offset = self.i,
	43	};
	44
	45	const cp_bytes = self.bytes[self.i..][0..cp.len];
	46	self.i += cp.len;
	47
	48	cp.code = switch (cp.len) {
	49	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),
	50
	51	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|
	52	(cp_bytes[1] & 0b00111111)) << 6) \|
	53	(cp_bytes[2] & 0b00111111),
	54
	55	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|
	56	(cp_bytes[1] & 0b00111111)) << 6) \|
	57	(cp_bytes[2] & 0b00111111)) << 6) \|
	58	(cp_bytes[3] & 0b00111111),
	59
	60	else => @panic("CodePointIterator.next invalid code point length."),
	61	};
	62
	63	return cp;
	64	}
	65
	66	pub fn peek(self: *CodePointIterator) ?CodePoint {
	67	const saved_i = self.i;
	68	defer self.i = saved_i;
	69	return self.next();
	70	}
	71	};
	72
	73	test "CodePointIterator peek" {
	74	var iter = CodePointIterator{ .bytes = "Hi" };
	75
	76	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
	77	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
	78	try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
	79	try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
	80	try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
	81	}
	82
	83	/// `readCodePoint` returns the next code point code as a `u21` in the given reader, or null at end-of-input.
	84	pub fn readCodePoint(reader: anytype) !?u21 {
	85	var buf: [4]u8 = undefined;
	86
	87	buf[0] = reader.readByte() catch \|err\| switch (err) {
	88	error.EndOfStream => return null,
	89	else => return err,
	90	};
	91
	92	if (buf[0] < 128) return @as(u21, buf[0]);
	93
	94	const len: u3 = switch (buf[0]) {
	95	0b1100_0000...0b1101_1111 => 2,
	96	0b1110_0000...0b1110_1111 => 3,
	97	0b1111_0000...0b1111_0111 => 4,
	98	else => return error.InvalidUtf8,
	99	};
	100
	101	const read = try reader.read(buf[1..len]);
	102
	103	if (read < len - 1) return error.InvalidUtf8;
	104
	105	return switch (len) {
	106	2 => (@as(u21, (buf[0] & 0b00011111)) << 6) \| (buf[1] & 0b00111111),
	107
	108	3 => (((@as(u21, (buf[0] & 0b00001111)) << 6) \|
	109	(buf[1] & 0b00111111)) << 6) \|
	110	(buf[2] & 0b00111111),
	111
	112	4 => (((((@as(u21, (buf[0] & 0b00000111)) << 6) \|
	113	(buf[1] & 0b00111111)) << 6) \|
	114	(buf[2] & 0b00111111)) << 6) \|
	115	(buf[3] & 0b00111111),
	116
	117	else => @panic("readCodePoint invalid code point length."),
	118	};
	119	}
	120
	121	test "readCodePoint" {
	122	var buf = "abé😹".*;
	123	var fis = std.io.fixedBufferStream(&buf);
	124	const reader = fis.reader();
	125
	126	try std.testing.expectEqual(@as(u21, 'a'), (try readCodePoint(reader)).?);
	127	try std.testing.expectEqual(@as(u21, 'b'), (try readCodePoint(reader)).?);
	128	try std.testing.expectEqual(@as(u21, 'é'), (try readCodePoint(reader)).?);
	129	try std.testing.expectEqual(@as(u21, '😹'), (try readCodePoint(reader)).?);
	130	try std.testing.expectEqual(@as(?u21, null), try readCodePoint(reader));
	131	}