Code point and grapheme are now namespaces.

author: Jose Colon Rodriguez 2024-02-18 08:48:03 -0400
committer: Jose Colon Rodriguez 2024-02-18 08:48:03 -0400
commit: 1404c85f513a88bbd399ab9f3453da71e7478727 (patch)
tree: 0080678ceac38f223910d60bf650ebaddf27b0f9 /src
parent: Fixed isAsciiOnly and CodePointIterator ASCII bugs (diff)
download: zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.gz
zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.xz
zg-1404c85f513a88bbd399ab9f3453da71e7478727.zip
3 files changed, 54 insertions, 56 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 910aec5..f013aba 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,30 +1,25 @@
-//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 const std = @import("std");
 const unicode = std.unicode;
-const CodePoint = @import("CodePoint");
+const CodePoint = @import("code_point").CodePoint;
-const CodePointIterator = CodePoint.CodePointIterator;
+const CodePointIterator = @import("code_point").Iterator;
 const gbp = @import("gbp");
-pub const Grapheme = @This();
+/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
+pub const Grapheme = struct {
-len: usize,
+    len: u8,
-offset: usize,
+    offset: u32,
-/// `eql` comparse `str` with the bytes of this grapheme cluster in `src` for equality.
+    /// `bytes` returns the slice of bytes that correspond to
-pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool {
+    /// this grapheme cluster in `src`.
-    return std.mem.eql(u8, src[self.offset .. self.offset + self.len], other);
+    pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
-}
+        return src[self.offset..][0..self.len];
+    }
-/// `slice` returns the bytes that correspond to this grapheme cluster in `src`.
+};
-pub fn slice(self: Grapheme, src: []const u8) []const u8 {
-    return src[self.offset .. self.offset + self.len];
-}
-/// `GraphemeIterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
+/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
-pub const GraphemeIterator = struct {
+pub const Iterator = struct {
-    buf: [2]?CodePoint = [_]?CodePoint{ null, null },
+    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointIterator,
    const Self = @This();
@@ -32,8 +27,7 @@ pub const GraphemeIterator = struct {
    /// Assumes `src` is valid UTF-8.
    pub fn init(str: []const u8) Self {
        var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
-        self.buf[1] = self.cp_iter.next();
+        self.advance();
        return self;
    }
@@ -55,7 +49,7 @@ pub const GraphemeIterator = struct {
        }
        const gc_start = self.buf[0].?.offset;
-        var gc_len: usize = self.buf[0].?.len;
+        var gc_len: u8 = self.buf[0].?.len;
        var state = State{};
        if (graphemeBreak(
@@ -266,13 +260,13 @@ test "Segmentation GraphemeIterator" {
        defer all_bytes.deinit();
        var graphemes = std.mem.split(u8, line, " ÷ ");
-        var bytes_index: usize = 0;
+        var bytes_index: u32 = 0;
        while (graphemes.next()) |field| {
            var code_points = std.mem.split(u8, field, " ");
            var cp_buf: [4]u8 = undefined;
-            var cp_index: usize = 0;
+            var cp_index: u32 = 0;
-            var gc_len: usize = 0;
+            var gc_len: u8 = 0;
            while (code_points.next()) |code_point| {
                if (std.mem.eql(u8, code_point, "×")) continue;
@@ -288,12 +282,15 @@ test "Segmentation GraphemeIterator" {
        }
        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
-        var iter = GraphemeIterator.init(all_bytes.items);
+        var iter = Iterator.init(all_bytes.items);
        // Chaeck.
-        for (want.items) |w| {
+        for (want.items) |want_gc| {
-            const g = (iter.next()).?;
+            const got_gc = (iter.next()).?;
-            try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len]));
+            try std.testing.expectEqualStrings(
+                want_gc.bytes(all_bytes.items),
+                got_gc.bytes(all_bytes.items),
+            );
        }
    }
 }
@@ -303,10 +300,10 @@ test "Segmentation comptime GraphemeIterator" {
    comptime {
        const src = "Héllo";
-        var ct_iter = GraphemeIterator.init(src);
+        var ct_iter = Iterator.init(src);
        var i = 0;
        while (ct_iter.next()) |grapheme| : (i += 1) {
-            try std.testing.expect(grapheme.eql(src, want[i]));
+            try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
        }
    }
 }
@@ -318,17 +315,17 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
    const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
    const no_joiner = seq_1 ++ seq_2;
-    var ct_iter = GraphemeIterator.init(with_zwj);
+    var ct_iter = Iterator.init(with_zwj);
    var i: usize = 0;
    while (ct_iter.next()) |_| : (i += 1) {}
    try std.testing.expectEqual(@as(usize, 1), i);
-    ct_iter = GraphemeIterator.init(with_zwsp);
+    ct_iter = Iterator.init(with_zwsp);
    i = 0;
    while (ct_iter.next()) |_| : (i += 1) {}
    try std.testing.expectEqual(@as(usize, 3), i);
-    ct_iter = GraphemeIterator.init(no_joiner);
+    ct_iter = Iterator.init(no_joiner);
    i = 0;
    while (ct_iter.next()) |_| : (i += 1) {}
    try std.testing.expectEqual(@as(usize, 2), i);
diff --git a/src/CodePoint.zig b/src/code_point.zig
index 62dd793..ac37562 100644
--- a/src/CodePoint.zig
+++ b/src/code_point.zig
@@ -1,28 +1,29 @@
-//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
 const std = @import("std");
-code: u21,
+/// `CodePoint` represents a Unicode code point by its code,
-len: u3,
+/// length, and offset in the source bytes.
-offset: usize,
+pub const CodePoint = struct {
+    code: u21,
-const CodePoint = @This();
+    len: u3,
+    offset: u32,
+};
-/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
+/// `Iterator` iterates a string one `CodePoint` at-a-time.
-pub const CodePointIterator = struct {
+pub const Iterator = struct {
    bytes: []const u8,
-    i: usize = 0,
+    i: u32 = 0,
-    pub fn next(self: *CodePointIterator) ?CodePoint {
+    pub fn next(self: *Iterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;
        if (self.bytes[self.i] < 128) {
            // ASCII fast path
-            self.i += 1;
+            defer self.i += 1;
            return .{
-                .code = self.bytes[self.i - 1],
+                .code = self.bytes[self.i],
                .len = 1,
-                .offset = self.i - 1,
+                .offset = self.i,
            };
        }
@@ -33,12 +34,12 @@ pub const CodePointIterator = struct {
                0b1110_0000...0b1110_1111 => 3,
                0b1111_0000...0b1111_0111 => 4,
                else => {
-                    self.i += 1;
+                    defer self.i += 1;
                    // Unicode replacement code point.
                    return .{
                        .code = 0xfffd,
                        .len = 1,
-                        .offset = self.i - 1,
+                        .offset = self.i,
                    };
                },
            },
@@ -66,15 +67,15 @@ pub const CodePointIterator = struct {
        return cp;
    }
-    pub fn peek(self: *CodePointIterator) ?CodePoint {
+    pub fn peek(self: *Iterator) ?CodePoint {
        const saved_i = self.i;
        defer self.i = saved_i;
        return self.next();
    }
 };
-test "CodePointIterator peek" {
+test "peek" {
-    var iter = CodePointIterator{ .bytes = "Hi" };
+    var iter = Iterator{ .bytes = "Hi" };
    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
diff --git a/src/display_width.zig b/src/display_width.zig
index ba76052..e52da38 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -2,8 +2,8 @@ const std = @import("std");
 const simd = std.simd;
 const testing = std.testing;
-const CodePointIterator = @import("CodePoint").CodePointIterator;
+const CodePointIterator = @import("code_point").Iterator;
-const GraphemeIterator = @import("Grapheme").GraphemeIterator;
+const GraphemeIterator = @import("grapheme").Iterator;
 const dwp = @import("dwp");
 /// codePointWidth returns the number of cells `cp` requires when rendered
author	Jose Colon Rodriguez	2024-02-18 08:48:03 -0400
committer	Jose Colon Rodriguez	2024-02-18 08:48:03 -0400
commit	1404c85f513a88bbd399ab9f3453da71e7478727 (patch)
tree	0080678ceac38f223910d60bf650ebaddf27b0f9 /src
parent	Fixed isAsciiOnly and CodePointIterator ASCII bugs (diff)
download	zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.gz zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.xz zg-1404c85f513a88bbd399ab9f3453da71e7478727.zip

diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 910aec5..f013aba 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,30 +1,25 @@
1	//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
2
3	const std = @import("std");	1	const std = @import("std");
4	const unicode = std.unicode;	2	const unicode = std.unicode;
5		3
6	const CodePoint = @import("CodePoint");	4	const CodePoint = @import("code_point").CodePoint;
7	const CodePointIterator = CodePoint.CodePointIterator;	5	const CodePointIterator = @import("code_point").Iterator;
8	const gbp = @import("gbp");	6	const gbp = @import("gbp");
9		7
10	pub const Grapheme = @This();	8	/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
11		9	pub const Grapheme = struct {
12	len: usize,	10	len: u8,
13	offset: usize,	11	offset: u32,
14		12
15	/// `eql` comparse `str` with the bytes of this grapheme cluster in `src` for equality.	13	/// `bytes` returns the slice of bytes that correspond to
16	pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool {	14	/// this grapheme cluster in `src`.
17	return std.mem.eql(u8, src[self.offset .. self.offset + self.len], other);	15	pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
18	}	16	return src[self.offset..][0..self.len];
19		17	}
20	/// `slice` returns the bytes that correspond to this grapheme cluster in `src`.	18	};
21	pub fn slice(self: Grapheme, src: []const u8) []const u8 {
22	return src[self.offset .. self.offset + self.len];
23	}
24		19
25	/// `GraphemeIterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.	20	/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
26	pub const GraphemeIterator = struct {	21	pub const Iterator = struct {
27	buf: [2]?CodePoint = [_]?CodePoint{ null, null },	22	buf: [2]?CodePoint = .{ null, null },
28	cp_iter: CodePointIterator,	23	cp_iter: CodePointIterator,
29		24
30	const Self = @This();	25	const Self = @This();
@@ -32,8 +27,7 @@ pub const GraphemeIterator = struct {
32	/// Assumes `src` is valid UTF-8.	27	/// Assumes `src` is valid UTF-8.
33	pub fn init(str: []const u8) Self {	28	pub fn init(str: []const u8) Self {
34	var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };	29	var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
35	self.buf[1] = self.cp_iter.next();	30	self.advance();
36
37	return self;	31	return self;
38	}	32	}
39		33
@@ -55,7 +49,7 @@ pub const GraphemeIterator = struct {
55	}	49	}
56		50
57	const gc_start = self.buf[0].?.offset;	51	const gc_start = self.buf[0].?.offset;
58	var gc_len: usize = self.buf[0].?.len;	52	var gc_len: u8 = self.buf[0].?.len;
59	var state = State{};	53	var state = State{};
60		54
61	if (graphemeBreak(	55	if (graphemeBreak(
@@ -266,13 +260,13 @@ test "Segmentation GraphemeIterator" {
266	defer all_bytes.deinit();	260	defer all_bytes.deinit();
267		261
268	var graphemes = std.mem.split(u8, line, " ÷ ");	262	var graphemes = std.mem.split(u8, line, " ÷ ");
269	var bytes_index: usize = 0;	263	var bytes_index: u32 = 0;
270		264
271	while (graphemes.next()) |field| {	265	while (graphemes.next()) |field| {
272	var code_points = std.mem.split(u8, field, " ");	266	var code_points = std.mem.split(u8, field, " ");
273	var cp_buf: [4]u8 = undefined;	267	var cp_buf: [4]u8 = undefined;
274	var cp_index: usize = 0;	268	var cp_index: u32 = 0;
275	var gc_len: usize = 0;	269	var gc_len: u8 = 0;
276		270
277	while (code_points.next()) |code_point| {	271	while (code_points.next()) |code_point| {
278	if (std.mem.eql(u8, code_point, "×")) continue;	272	if (std.mem.eql(u8, code_point, "×")) continue;
@@ -288,12 +282,15 @@ test "Segmentation GraphemeIterator" {
288	}	282	}
289		283
290	// std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });	284	// std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
291	var iter = GraphemeIterator.init(all_bytes.items);	285	var iter = Iterator.init(all_bytes.items);
292		286
293	// Chaeck.	287	// Chaeck.
294	for (want.items) |w| {	288	for (want.items) |want_gc| {
295	const g = (iter.next()).?;	289	const got_gc = (iter.next()).?;
296	try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len]));	290	try std.testing.expectEqualStrings(
		291	want_gc.bytes(all_bytes.items),
		292	got_gc.bytes(all_bytes.items),
		293	);
297	}	294	}
298	}	295	}
299	}	296	}
@@ -303,10 +300,10 @@ test "Segmentation comptime GraphemeIterator" {
303		300
304	comptime {	301	comptime {
305	const src = "Héllo";	302	const src = "Héllo";
306	var ct_iter = GraphemeIterator.init(src);	303	var ct_iter = Iterator.init(src);
307	var i = 0;	304	var i = 0;
308	while (ct_iter.next()) |grapheme| : (i += 1) {	305	while (ct_iter.next()) |grapheme| : (i += 1) {
309	try std.testing.expect(grapheme.eql(src, want[i]));	306	try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
310	}	307	}
311	}	308	}
312	}	309	}
@@ -318,17 +315,17 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
318	const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;	315	const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
319	const no_joiner = seq_1 ++ seq_2;	316	const no_joiner = seq_1 ++ seq_2;
320		317
321	var ct_iter = GraphemeIterator.init(with_zwj);	318	var ct_iter = Iterator.init(with_zwj);
322	var i: usize = 0;	319	var i: usize = 0;
323	while (ct_iter.next()) |_| : (i += 1) {}	320	while (ct_iter.next()) |_| : (i += 1) {}
324	try std.testing.expectEqual(@as(usize, 1), i);	321	try std.testing.expectEqual(@as(usize, 1), i);
325		322
326	ct_iter = GraphemeIterator.init(with_zwsp);	323	ct_iter = Iterator.init(with_zwsp);
327	i = 0;	324	i = 0;
328	while (ct_iter.next()) |_| : (i += 1) {}	325	while (ct_iter.next()) |_| : (i += 1) {}
329	try std.testing.expectEqual(@as(usize, 3), i);	326	try std.testing.expectEqual(@as(usize, 3), i);
330		327
331	ct_iter = GraphemeIterator.init(no_joiner);	328	ct_iter = Iterator.init(no_joiner);
332	i = 0;	329	i = 0;
333	while (ct_iter.next()) |_| : (i += 1) {}	330	while (ct_iter.next()) |_| : (i += 1) {}
334	try std.testing.expectEqual(@as(usize, 2), i);	331	try std.testing.expectEqual(@as(usize, 2), i);


diff --git a/src/CodePoint.zig b/src/code_point.zig
index 62dd793..ac37562 100644
--- a/src/CodePoint.zig
+++ b/src/code_point.zig
@@ -1,28 +1,29 @@
1	//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
2
3	const std = @import("std");	1	const std = @import("std");
4		2
5	code: u21,	3	/// `CodePoint` represents a Unicode code point by its code,
6	len: u3,	4	/// length, and offset in the source bytes.
7	offset: usize,	5	pub const CodePoint = struct {
8		6	code: u21,
9	const CodePoint = @This();	7	len: u3,
		8	offset: u32,
		9	};
10		10
11	/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.	11	/// `Iterator` iterates a string one `CodePoint` at-a-time.
12	pub const CodePointIterator = struct {	12	pub const Iterator = struct {
13	bytes: []const u8,	13	bytes: []const u8,
14	i: usize = 0,	14	i: u32 = 0,
15		15
16	pub fn next(self: *CodePointIterator) ?CodePoint {	16	pub fn next(self: *Iterator) ?CodePoint {
17	if (self.i >= self.bytes.len) return null;	17	if (self.i >= self.bytes.len) return null;
18		18
19	if (self.bytes[self.i] < 128) {	19	if (self.bytes[self.i] < 128) {
20	// ASCII fast path	20	// ASCII fast path
21	self.i += 1;	21	defer self.i += 1;
		22
22	return .{	23	return .{
23	.code = self.bytes[self.i - 1],	24	.code = self.bytes[self.i],
24	.len = 1,	25	.len = 1,
25	.offset = self.i - 1,	26	.offset = self.i,
26	};	27	};
27	}	28	}
28		29
@@ -33,12 +34,12 @@ pub const CodePointIterator = struct {
33	0b1110_0000...0b1110_1111 => 3,	34	0b1110_0000...0b1110_1111 => 3,
34	0b1111_0000...0b1111_0111 => 4,	35	0b1111_0000...0b1111_0111 => 4,
35	else => {	36	else => {
36	self.i += 1;	37	defer self.i += 1;
37	// Unicode replacement code point.	38	// Unicode replacement code point.
38	return .{	39	return .{
39	.code = 0xfffd,	40	.code = 0xfffd,
40	.len = 1,	41	.len = 1,
41	.offset = self.i - 1,	42	.offset = self.i,
42	};	43	};
43	},	44	},
44	},	45	},
@@ -66,15 +67,15 @@ pub const CodePointIterator = struct {
66	return cp;	67	return cp;
67	}	68	}
68		69
69	pub fn peek(self: *CodePointIterator) ?CodePoint {	70	pub fn peek(self: *Iterator) ?CodePoint {
70	const saved_i = self.i;	71	const saved_i = self.i;
71	defer self.i = saved_i;	72	defer self.i = saved_i;
72	return self.next();	73	return self.next();
73	}	74	}
74	};	75	};
75		76
76	test "CodePointIterator peek" {	77	test "peek" {
77	var iter = CodePointIterator{ .bytes = "Hi" };	78	var iter = Iterator{ .bytes = "Hi" };
78		79
79	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);	80	try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
80	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);	81	try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);


diff --git a/src/display_width.zig b/src/display_width.zig
index ba76052..e52da38 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -2,8 +2,8 @@ const std = @import("std");
2	const simd = std.simd;	2	const simd = std.simd;
3	const testing = std.testing;	3	const testing = std.testing;
4		4
5	const CodePointIterator = @import("CodePoint").CodePointIterator;	5	const CodePointIterator = @import("code_point").Iterator;
6	const GraphemeIterator = @import("Grapheme").GraphemeIterator;	6	const GraphemeIterator = @import("grapheme").Iterator;
7	const dwp = @import("dwp");	7	const dwp = @import("dwp");
8		8
9	/// codePointWidth returns the number of cells `cp` requires when rendered	9	/// codePointWidth returns the number of cells `cp` requires when rendered