3 files changed, 47 insertions, 44 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index f013aba..6981753 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,7 +1,6 @@
 const std = @import("std");
 const unicode = std.unicode;
-const CodePoint = @import("code_point").CodePoint;
 const CodePointIterator = @import("code_point").Iterator;
 const gbp = @import("gbp");
@@ -17,6 +16,13 @@ pub const Grapheme = struct {
    }
 };
+// We need the code as a u21.
+const CodePoint = struct {
+    code: u21,
+    len: u3,
+    offset: u32,
+};
 /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
 pub const Iterator = struct {
    buf: [2]?CodePoint = .{ null, null },
@@ -33,7 +39,13 @@ pub const Iterator = struct {
    fn advance(self: *Self) void {
        self.buf[0] = self.buf[1];
-        self.buf[1] = self.cp_iter.next();
+        const maybe_cp = self.cp_iter.next();
+        self.buf[1] = if (maybe_cp) |cp| .{
+            .code = cp.code(self.cp_iter.bytes),
+            .len = cp.len,
+            .offset = cp.offset,
+        } else null;
    }
    pub fn next(self: *Self) ?Grapheme {
diff --git a/src/code_point.zig b/src/code_point.zig
index ac37562..098e635 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -3,9 +3,29 @@ const std = @import("std");
 /// `CodePoint` represents a Unicode code point by its code,
 /// length, and offset in the source bytes.
 pub const CodePoint = struct {
-    code: u21,
    len: u3,
    offset: u32,
+    pub fn code(self: CodePoint, src: []const u8) u21 {
+        const cp_bytes = src[self.offset..][0..self.len];
+        return switch (self.len) {
+            1 => cp_bytes[0],
+            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
+            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
+                (cp_bytes[1] & 0b00111111)) << 6) |
+                (cp_bytes[2] & 0b00111111),
+            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
+                (cp_bytes[1] & 0b00111111)) << 6) |
+                (cp_bytes[2] & 0b00111111)) << 6) |
+                (cp_bytes[3] & 0b00111111),
+            else => @panic("code_point.CodePoint.code: Invalid code point length."),
+        };
+    }
 };
 /// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -19,51 +39,20 @@ pub const Iterator = struct {
        if (self.bytes[self.i] < 128) {
            // ASCII fast path
            defer self.i += 1;
+            return .{ .len = 1, .offset = self.i };
-            return .{
-                .code = self.bytes[self.i],
-                .len = 1,
-                .offset = self.i,
-            };
        }
-        var cp = CodePoint{
+        const cp = CodePoint{
-            .code = undefined,
            .len = switch (self.bytes[self.i]) {
                0b1100_0000...0b1101_1111 => 2,
                0b1110_0000...0b1110_1111 => 3,
                0b1111_0000...0b1111_0111 => 4,
-                else => {
+                else => @panic("code_point.Iterator.next: Invalid start byte."),
-                    defer self.i += 1;
-                    // Unicode replacement code point.
-                    return .{
-                        .code = 0xfffd,
-                        .len = 1,
-                        .offset = self.i,
-                    };
-                },
            },
            .offset = self.i,
        };
-        const cp_bytes = self.bytes[self.i..][0..cp.len];
        self.i += cp.len;
-        cp.code = switch (cp.len) {
-            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
-            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
-                (cp_bytes[1] & 0b00111111)) << 6) |
-                (cp_bytes[2] & 0b00111111),
-            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
-                (cp_bytes[1] & 0b00111111)) << 6) |
-                (cp_bytes[2] & 0b00111111)) << 6) |
-                (cp_bytes[3] & 0b00111111),
-            else => @panic("CodePointIterator.next invalid code point length."),
-        };
        return cp;
    }
@@ -75,11 +64,12 @@ pub const Iterator = struct {
 };
 test "peek" {
-    var iter = Iterator{ .bytes = "Hi" };
+    const src = "Hi";
+    var iter = Iterator{ .bytes = src };
-    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
+    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src));
-    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
+    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src));
-    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
+    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src));
    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
 }
diff --git a/src/display_width.zig b/src/display_width.zig
index e52da38..7f39566 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -52,17 +52,18 @@ pub fn strWidth(str: []const u8) usize {
    var giter = GraphemeIterator.init(str);
    while (giter.next()) |gc| {
-        var cp_iter = CodePointIterator{ .bytes = str[gc.offset..][0..gc.len] };
+        const gc_bytes = gc.bytes(str);
+        var cp_iter = CodePointIterator{ .bytes = gc_bytes };
        var gc_total: isize = 0;
        while (cp_iter.next()) |cp| {
-            var w = codePointWidth(cp.code);
+            var w = codePointWidth(cp.code(gc_bytes));
            if (w != 0) {
                // Handle text emoji sequence.
                if (cp_iter.next()) |ncp| {
                    // emoji text sequence.
-                    if (ncp.code == 0xFE0E) w = 1;
+                    if (ncp.code(gc_bytes) == 0xFE0E) w = 1;
                }
                // Only adding width of first non-zero-width code point.

diff --git a/src/Grapheme.zig b/src/Grapheme.zig index f013aba..6981753 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig
@@ -1,7 +1,6 @@
1	const std = @import("std");	1	const std = @import("std");
2	const unicode = std.unicode;	2	const unicode = std.unicode;
3		3
4	const CodePoint = @import("code_point").CodePoint;
5	const CodePointIterator = @import("code_point").Iterator;	4	const CodePointIterator = @import("code_point").Iterator;
6	const gbp = @import("gbp");	5	const gbp = @import("gbp");
7		6
@@ -17,6 +16,13 @@ pub const Grapheme = struct {
17	}	16	}
18	};	17	};
19		18
		19	// We need the code as a u21.
		20	const CodePoint = struct {
		21	code: u21,
		22	len: u3,
		23	offset: u32,
		24	};
		25
20	/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.	26	/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
21	pub const Iterator = struct {	27	pub const Iterator = struct {
22	buf: [2]?CodePoint = .{ null, null },	28	buf: [2]?CodePoint = .{ null, null },
@@ -33,7 +39,13 @@ pub const Iterator = struct {
33		39
34	fn advance(self: *Self) void {	40	fn advance(self: *Self) void {
35	self.buf[0] = self.buf[1];	41	self.buf[0] = self.buf[1];
36	self.buf[1] = self.cp_iter.next();	42
		43	const maybe_cp = self.cp_iter.next();
		44	self.buf[1] = if (maybe_cp) \|cp\| .{
		45	.code = cp.code(self.cp_iter.bytes),
		46	.len = cp.len,
		47	.offset = cp.offset,
		48	} else null;
37	}	49	}
38		50
39	pub fn next(self: *Self) ?Grapheme {	51	pub fn next(self: *Self) ?Grapheme {


diff --git a/src/code_point.zig b/src/code_point.zig index ac37562..098e635 100644 --- a/src/code_point.zig +++ b/src/code_point.zig
@@ -3,9 +3,29 @@ const std = @import("std");
3	/// `CodePoint` represents a Unicode code point by its code,	3	/// `CodePoint` represents a Unicode code point by its code,
4	/// length, and offset in the source bytes.	4	/// length, and offset in the source bytes.
5	pub const CodePoint = struct {	5	pub const CodePoint = struct {
6	code: u21,
7	len: u3,	6	len: u3,
8	offset: u32,	7	offset: u32,
		8
		9	pub fn code(self: CodePoint, src: []const u8) u21 {
		10	const cp_bytes = src[self.offset..][0..self.len];
		11
		12	return switch (self.len) {
		13	1 => cp_bytes[0],
		14
		15	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),
		16
		17	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|
		18	(cp_bytes[1] & 0b00111111)) << 6) \|
		19	(cp_bytes[2] & 0b00111111),
		20
		21	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|
		22	(cp_bytes[1] & 0b00111111)) << 6) \|
		23	(cp_bytes[2] & 0b00111111)) << 6) \|
		24	(cp_bytes[3] & 0b00111111),
		25
		26	else => @panic("code_point.CodePoint.code: Invalid code point length."),
		27	};
		28	}
9	};	29	};
10		30
11	/// `Iterator` iterates a string one `CodePoint` at-a-time.	31	/// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -19,51 +39,20 @@ pub const Iterator = struct {
19	if (self.bytes[self.i] < 128) {	39	if (self.bytes[self.i] < 128) {
20	// ASCII fast path	40	// ASCII fast path
21	defer self.i += 1;	41	defer self.i += 1;
22		42	return .{ .len = 1, .offset = self.i };
23	return .{
24	.code = self.bytes[self.i],
25	.len = 1,
26	.offset = self.i,
27	};
28	}	43	}
29		44
30	var cp = CodePoint{	45	const cp = CodePoint{
31	.code = undefined,
32	.len = switch (self.bytes[self.i]) {	46	.len = switch (self.bytes[self.i]) {
33	0b1100_0000...0b1101_1111 => 2,	47	0b1100_0000...0b1101_1111 => 2,
34	0b1110_0000...0b1110_1111 => 3,	48	0b1110_0000...0b1110_1111 => 3,
35	0b1111_0000...0b1111_0111 => 4,	49	0b1111_0000...0b1111_0111 => 4,
36	else => {	50	else => @panic("code_point.Iterator.next: Invalid start byte."),
37	defer self.i += 1;
38	// Unicode replacement code point.
39	return .{
40	.code = 0xfffd,
41	.len = 1,
42	.offset = self.i,
43	};
44	},
45	},	51	},
46	.offset = self.i,	52	.offset = self.i,
47	};	53	};
48		54
49	const cp_bytes = self.bytes[self.i..][0..cp.len];
50	self.i += cp.len;	55	self.i += cp.len;
51
52	cp.code = switch (cp.len) {
53	2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) \| (cp_bytes[1] & 0b00111111),
54
55	3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) \|
56	(cp_bytes[1] & 0b00111111)) << 6) \|
57	(cp_bytes[2] & 0b00111111),
58
59	4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) \|
60	(cp_bytes[1] & 0b00111111)) << 6) \|
61	(cp_bytes[2] & 0b00111111)) << 6) \|
62	(cp_bytes[3] & 0b00111111),
63
64	else => @panic("CodePointIterator.next invalid code point length."),
65	};
66
67	return cp;	56	return cp;
68	}	57	}
69		58
@@ -75,11 +64,12 @@ pub const Iterator = struct {
75	};	64	};
76		65
77	test "peek" {	66	test "peek" {
78	var iter = Iterator{ .bytes = "Hi" };	67	const src = "Hi";
		68	var iter = Iterator{ .bytes = src };
79		69
80	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);	70	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src));
81	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);	71	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src));
82	try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);	72	try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src));
83	try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());	73	try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
84	try std.testing.expectEqual(@as(?CodePoint, null), iter.next());	74	try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
85	}	75	}


diff --git a/src/display_width.zig b/src/display_width.zig index e52da38..7f39566 100644 --- a/src/display_width.zig +++ b/src/display_width.zig
@@ -52,17 +52,18 @@ pub fn strWidth(str: []const u8) usize {
52	var giter = GraphemeIterator.init(str);	52	var giter = GraphemeIterator.init(str);
53		53
54	while (giter.next()) \|gc\| {	54	while (giter.next()) \|gc\| {
55	var cp_iter = CodePointIterator{ .bytes = str[gc.offset..][0..gc.len] };	55	const gc_bytes = gc.bytes(str);
		56	var cp_iter = CodePointIterator{ .bytes = gc_bytes };
56	var gc_total: isize = 0;	57	var gc_total: isize = 0;
57		58
58	while (cp_iter.next()) \|cp\| {	59	while (cp_iter.next()) \|cp\| {
59	var w = codePointWidth(cp.code);	60	var w = codePointWidth(cp.code(gc_bytes));
60		61
61	if (w != 0) {	62	if (w != 0) {
62	// Handle text emoji sequence.	63	// Handle text emoji sequence.
63	if (cp_iter.next()) \|ncp\| {	64	if (cp_iter.next()) \|ncp\| {
64	// emoji text sequence.	65	// emoji text sequence.
65	if (ncp.code == 0xFE0E) w = 1;	66	if (ncp.code(gc_bytes) == 0xFE0E) w = 1;
66	}	67	}
67		68
68	// Only adding width of first non-zero-width code point.	69	// Only adding width of first non-zero-width code point.