1 files changed, 79 insertions, 57 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
index 2f2e80f..13e38bf 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -8,74 +8,83 @@ pub const CodePoint = struct {
    offset: u32,
 };
-/// `Iterator` iterates a string one `CodePoint` at-a-time.
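+/// Decode the code point at the start of `bytes`, tagging it with `offset`; returns null if `bytes` is empty, U+FFFD (len 1) on an unrecognized lead byte or truncated sequence.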
-pub const Iterator = struct {
+pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
-    bytes: []const u8,
+    // EOS fast path
-    i: u32 = 0,
+    if (bytes.len == 0) {
+        return null;
-    pub fn next(self: *Iterator) ?CodePoint {
+    }
-        if (self.i >= self.bytes.len) return null;
-        if (self.bytes[self.i] < 128) {
-            // ASCII fast path
-            defer self.i += 1;
-            return .{
+    // ASCII fast path
-                .code = self.bytes[self.i],
+    if (bytes[0] < 128) {
-                .len = 1,
+        return .{
-                .offset = self.i,
+            .code = bytes[0],
-            };
+            .len = 1,
-        }
+            .offset = offset,
+        };
+    }
-        var cp = CodePoint{
+    var cp = CodePoint{
-            .code = undefined,
+        .code = undefined,
-            .len = switch (self.bytes[self.i]) {
+        .len = switch (bytes[0]) {
-                0b1100_0000...0b1101_1111 => 2,
+            0b1100_0000...0b1101_1111 => 2,
-                0b1110_0000...0b1110_1111 => 3,
+            0b1110_0000...0b1110_1111 => 3,
-                0b1111_0000...0b1111_0111 => 4,
+            0b1111_0000...0b1111_0111 => 4,
-                else => {
+            else => {
-                    defer self.i += 1;
+                // Unicode replacement code point.
-                    // Unicode replacement code point.
+                return .{
-                    return .{
+                    .code = 0xfffd,
-                        .code = 0xfffd,
+                    .len = 1,
-                        .len = 1,
+                    .offset = offset,
-                        .offset = self.i,
+                };
-                    };
-                },
            },
-            .offset = self.i,
+        },
+        .offset = offset,
+    };
+    // Return replacement if we don't have a complete codepoint remaining. Consumes only one byte.
+    if (cp.len > bytes.len) {
+        // Unicode replacement code point.
+        return .{
+            .code = 0xfffd,
+            .len = 1,
+            .offset = offset,
        };
+    }
-        // Return replacement if we don' have a complete codepoint remaining. Consumes only one byte
+    const cp_bytes = bytes[0..cp.len];
-        if (self.i + cp.len > self.bytes.len) {
+    cp.code = switch (cp.len) {
-            defer self.i += 1;
+        2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
-            // Unicode replacement code point.
-            return .{
-                .code = 0xfffd,
-                .len = 1,
-                .offset = self.i,
-            };
-        }
-        const cp_bytes = self.bytes[self.i..][0..cp.len];
+        3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
-        self.i += cp.len;
+            (cp_bytes[1] & 0b00111111)) << 6) |
+            (cp_bytes[2] & 0b00111111),
-        cp.code = switch (cp.len) {
+        4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
-            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
+            (cp_bytes[1] & 0b00111111)) << 6) |
+            (cp_bytes[2] & 0b00111111)) << 6) |
+            (cp_bytes[3] & 0b00111111),
-            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
+        else => @panic("CodePointIterator.next invalid code point length."),
-                (cp_bytes[1] & 0b00111111)) << 6) |
+    };
-                (cp_bytes[2] & 0b00111111),
-            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
+    return cp;
-                (cp_bytes[1] & 0b00111111)) << 6) |
+}
-                (cp_bytes[2] & 0b00111111)) << 6) |
-                (cp_bytes[3] & 0b00111111),
-            else => @panic("CodePointIterator.next invalid code point length."),
+/// `Iterator` iterates a string one `CodePoint` at-a-time.
-        };
+pub const Iterator = struct {
+    bytes: []const u8,
+    i: u32 = 0,
+    pub fn next(self: *Iterator) ?CodePoint {
+        if (self.i >= self.bytes.len) return null;
+        const res = decode(self.bytes[self.i..], self.i);
+        if (res) |cp| {
+            self.i += cp.len;
+        }
-        return cp;
+        return res;
    }
    pub fn peek(self: *Iterator) ?CodePoint {
@@ -85,6 +94,19 @@ pub const Iterator = struct {
    }
 };
+test "decode" {
+    const bytes = "🌩️";
+    const res = decode(bytes, 0);
+    if (res) |cp| {
+        try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
+        try std.testing.expectEqual(4, cp.len);
+    } else {
+        // shouldn't have failed to return
+        try std.testing.expect(false);
+    }
+}
 test "peek" {
    var iter = Iterator{ .bytes = "Hi" };

diff --git a/src/code_point.zig b/src/code_point.zig index 2f2e80f..13e38bf 100644 --- a/src/code_point.zig +++ b/src/code_point.zig
@@ -8,74 +8,83 @@ pub const CodePoint = struct {
8	offset: u32,	8	offset: u32,
9	};	9	};
10		10
11	/// `Iterator` iterates a string one `CodePoint` at-a-time.	11	/// given a small slice of a string, decode the corresponding codepoint
12	pub const Iterator = struct {	12	pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
13	bytes: []const u8,	13	// EOS fast path
14	i: u32 = 0,	14	if (bytes.len == 0) {
15		15	return null;
16	pub fn next(self: *Iterator) ?CodePoint {	16	}
17	if (self.i >= self.bytes.len) return null;
18
19	if (self.bytes[self.i] < 128) {
20	// ASCII fast path
21	defer self.i += 1;
22		17
23	return .{	18	// ASCII fast path
24	.code = self.bytes[self.i],	19	if (bytes[0] < 128) {
25	.len = 1,	20	return .{
26	.offset = self.i,	21	.code = bytes[0],
27	};	22	.len = 1,
28	}	23	.offset = offset,
		24	};
		25	}
29		26
30	var cp = CodePoint{	27	var cp = CodePoint{
31	.code = undefined,	28	.code = undefined,
32	.len = switch (self.bytes[self.i]) {	29	.len = switch (bytes[0]) {
33	0b1100_0000...0b1101_1111 => 2,	30	0b1100_0000...0b1101_1111 => 2,
34	0b1110_0000...0b1110_1111 => 3,	31	0b1110_0000...0b1110_1111 => 3,
35	0b1111_0000...0b1111_0111 => 4,	32	0b1111_0000...0b1111_0111 => 4,
36	else => {	33	else => {
37	defer self.i += 1;	34	// unicode replacement code point.
38	// Unicode replacement code point.	35	return .{
39	return .{	36	.code = 0xfffd,
40	.code = 0xfffd,	37	.len = 1,
41	.len = 1,	38	.offset = offset,
42	.offset = self.i,	39	};
43	};
44	},
45	},	40	},
46	.offset = self.i,	41	},
		42	.offset = offset,
		43	};
		44
		45	// Return replacement if we don't have a complete codepoint remaining. Consumes only one byte.
		46	if (cp.len > bytes.len) {
		47	// Unicode replacement code point.
		48	return .{
		49	.code = 0xfffd,
		50	.len = 1,
		51	.offset = offset,
47	};	52	};
		53	}
48		54
49	// Return replacement if we don' have a complete codepoint remaining. Consumes only one byte	55	const cp_bytes = bytes[0..cp.len];
50	if (self.i + cp.len > self.bytes.len) {	56	cp.code = switch (cp.len) {
51	defer self.i += 1;	57	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),
52	// Unicode replacement code point.
53	return .{
54	.code = 0xfffd,
55	.len = 1,
56	.offset = self.i,
57	};
58	}
59		58
60	const cp_bytes = self.bytes[self.i..][0..cp.len];	59	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|
61	self.i += cp.len;	60	(cp_bytes[1] & 0b00111111)) << 6) \|
		61	(cp_bytes[2] & 0b00111111),
62		62
63	cp.code = switch (cp.len) {	63	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|
64	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),	64	(cp_bytes[1] & 0b00111111)) << 6) \|
		65	(cp_bytes[2] & 0b00111111)) << 6) \|
		66	(cp_bytes[3] & 0b00111111),
65		67
66	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|	68	else => @panic("CodePointIterator.next invalid code point length."),
67	(cp_bytes[1] & 0b00111111)) << 6) \|	69	};
68	(cp_bytes[2] & 0b00111111),
69		70
70	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|	71	return cp;
71	(cp_bytes[1] & 0b00111111)) << 6) \|	72	}
72	(cp_bytes[2] & 0b00111111)) << 6) \|
73	(cp_bytes[3] & 0b00111111),
74		73
75	else => @panic("CodePointIterator.next invalid code point length."),	74	/// `Iterator` iterates a string one `CodePoint` at-a-time.
76	};	75	pub const Iterator = struct {
		76	bytes: []const u8,
		77	i: u32 = 0,
		78
		79	pub fn next(self: *Iterator) ?CodePoint {
		80	if (self.i >= self.bytes.len) return null;
		81
		82	const res = decode(self.bytes[self.i..], self.i);
		83	if (res) \|cp\| {
		84	self.i += cp.len;
		85	}
77		86
78	return cp;	87	return res;
79	}	88	}
80		89
81	pub fn peek(self: *Iterator) ?CodePoint {	90	pub fn peek(self: *Iterator) ?CodePoint {
@@ -85,6 +94,19 @@ pub const Iterator = struct {
85	}	94	}
86	};	95	};
87		96
		97	test "decode" {
		98	const bytes = "🌩️";
		99	const res = decode(bytes, 0);
		100
		101	if (res) \|cp\| {
		102	try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
		103	try std.testing.expectEqual(4, cp.len);
		104	} else {
		105	// shouldn't have failed to return
		106	try std.testing.expect(false);
		107	}
		108	}
		109
88	test "peek" {	110	test "peek" {
89	var iter = Iterator{ .bytes = "Hi" };	111	var iter = Iterator{ .bytes = "Hi" };
90		112