summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Jose Colon, 2024-11-03 19:29:09 +0000
committer: Jose Colon, 2024-11-03 19:29:09 +0000
commit: d667d180c82c83d5c3b41853d80b12536084404e (patch)
tree: 0424b9f9e54972837652042dc858dfe5ba12b5de /src
parentMerge pull request 'GraphemeData and WidthData: make init read errors unreach... (diff)
parentAdd peek() to Grapheme.Iterator (diff)
downloadzg-d667d180c82c83d5c3b41853d80b12536084404e.tar.gz
zg-d667d180c82c83d5c3b41853d80b12536084404e.tar.xz
zg-d667d180c82c83d5c3b41853d80b12536084404e.zip
Merge pull request 'grapheme-peek' (#18) from atman/zg:grapheme-peek into master
Reviewed-on: https://codeberg.org/dude_the_builder/zg/pulls/18
Diffstat (limited to 'src')
-rw-r--r-- src/grapheme.zig 69
-rw-r--r-- src/unicode_tests.zig 42
2 files changed, 103 insertions, 8 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig
index 911c856..7538f5b 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -77,6 +77,75 @@ pub const Iterator = struct {
77 77
78 return Grapheme{ .len = gc_len, .offset = gc_start }; 78 return Grapheme{ .len = gc_len, .offset = gc_start };
79 } 79 }
80
/// Reports the grapheme cluster at the iterator's current position without
/// consuming it.
///
/// The scan is performed by advancing the iterator for real and then putting
/// `cp_iter`, `buf[0]` and `buf[1]` back to the values they had on entry, so
/// calling `peek` any number of times leaves the iterator where it was.
///
/// Returns `null` when no code points remain.
pub fn peek(self: *Self) ?Grapheme {
    // Snapshot the state that scanning mutates and restore it on every exit
    // path. The `defer` runs after the return value has been computed, so
    // the result is built from the advanced state and only then undone.
    const saved_cp_iter = self.cp_iter;
    const saved_buf0 = self.buf[0];
    const saved_buf1 = self.buf[1];
    defer {
        self.cp_iter = saved_cp_iter;
        self.buf[0] = saved_buf0;
        self.buf[1] = saved_buf1;
    }

    self.advance();

    // Nothing left to read.
    const first = self.buf[0] orelse return null;

    // `first` is the final code point, so it is a cluster on its own.
    const second = self.buf[1] orelse
        return Grapheme{ .len = first.len, .offset = first.offset };

    // Both code points are ASCII: emit `first` alone without consulting the
    // break rules. '\r' is excluded from this shortcut and goes through
    // `graphemeBreak` below (presumably so CR LF can stay together — confirm
    // against `graphemeBreak`).
    if (first.code != '\r' and first.code < 128 and second.code < 128) {
        return Grapheme{ .len = first.len, .offset = first.offset };
    }

    const gc_start = first.offset;
    // NOTE(review): `gc_len` is a `u8` grown with `+=` in the loop below; a
    // cluster longer than 255 bytes would overflow it. Confirm the width of
    // `Grapheme.len` before widening.
    var gc_len: u8 = first.len;
    var state = State{};

    // A break directly after `first` means the cluster is just `first`.
    if (graphemeBreak(first.code, second.code, self.data, &state)) {
        return Grapheme{ .len = gc_len, .offset = gc_start };
    }

    // Otherwise keep absorbing code points until a break is reported or the
    // input ends. A missing lookahead code point is passed as 0.
    while (true) {
        self.advance();
        const cp = self.buf[0] orelse break;

        gc_len += cp.len;

        if (graphemeBreak(
            cp.code,
            if (self.buf[1]) |ncp| ncp.code else 0,
            self.data,
            &state,
        )) break;
    }

    return Grapheme{ .len = gc_len, .offset = gc_start };
}
80}; 149};
81 150
82// Predicates 151// Predicates
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 448ce41..245c03f 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -7,11 +7,37 @@ const mem = std.mem;
7const testing = std.testing; 7const testing = std.testing;
8const unicode = std.unicode; 8const unicode = std.unicode;
9 9
10const grapheme = @import("grapheme");
10const Grapheme = @import("grapheme").Grapheme; 11const Grapheme = @import("grapheme").Grapheme;
11const GraphemeData = @import("grapheme").GraphemeData; 12const GraphemeData = @import("grapheme").GraphemeData;
12const GraphemeIterator = @import("grapheme").Iterator; 13const GraphemeIterator = @import("grapheme").Iterator;
13const Normalize = @import("Normalize"); 14const Normalize = @import("Normalize");
14 15
// Reference every declaration of the imported `grapheme` module at compile
// time so the semantic analyzer sees them when this test file is built.
16comptime {
17 testing.refAllDecls(grapheme);
18}
// Checks that `peek` reports the same grapheme `next` will return, that
// repeated peeks do not move the iterator, and that both report `null` once
// the input is exhausted.
test "Iterator.peek" {
    const input = "aΔ👨🏻‍🌾→";
    const gd = try GraphemeData.init(testing.allocator);
    defer gd.deinit();

    var it = GraphemeIterator.init(input, &gd);

    // First cluster: peek and next must agree, and it must be "a".
    const peeked_first = it.peek().?;
    const taken_first = it.next().?;
    try testing.expectEqual(peeked_first, taken_first);
    try testing.expectEqualStrings("a", peeked_first.bytes(input));

    // Second cluster: peeking twice yields the same value, and next matches.
    const peeked_once = it.peek().?;
    const peeked_twice = it.peek().?;
    try testing.expectEqual(peeked_once, peeked_twice);
    const taken_second = it.next().?;
    try testing.expectEqual(peeked_twice, taken_second);

    // The two remaining clusters: peek followed by next must match each time.
    for (0..2) |_| {
        try testing.expectEqual(it.peek(), it.next());
    }

    // Exhausted: peek keeps returning null, and next agrees with it.
    for (0..2) |_| {
        try testing.expectEqual(null, it.peek());
    }
    try testing.expectEqual(it.peek(), it.next());
}
40
15test "Unicode normalization tests" { 41test "Unicode normalization tests" {
16 var arena = heap.ArenaAllocator.init(testing.allocator); 42 var arena = heap.ArenaAllocator.init(testing.allocator);
17 defer arena.deinit(); 43 defer arena.deinit();
@@ -35,7 +61,7 @@ test "Unicode normalization tests" {
35 // Skip comments or empty lines. 61 // Skip comments or empty lines.
36 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; 62 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
37 // Iterate over fields. 63 // Iterate over fields.
38 var fields = mem.split(u8, line, ";"); 64 var fields = mem.splitScalar(u8, line, ';');
39 var field_index: usize = 0; 65 var field_index: usize = 0;
40 var input: []u8 = undefined; 66 var input: []u8 = undefined;
41 defer allocator.free(input); 67 defer allocator.free(input);
@@ -45,7 +71,7 @@ test "Unicode normalization tests" {
45 var i_buf = std.ArrayList(u8).init(allocator); 71 var i_buf = std.ArrayList(u8).init(allocator);
46 defer i_buf.deinit(); 72 defer i_buf.deinit();
47 73
48 var i_fields = mem.split(u8, field, " "); 74 var i_fields = mem.splitScalar(u8, field, ' ');
49 while (i_fields.next()) |s| { 75 while (i_fields.next()) |s| {
50 const icp = try fmt.parseInt(u21, s, 16); 76 const icp = try fmt.parseInt(u21, s, 16);
51 const len = try unicode.utf8Encode(icp, &cp_buf); 77 const len = try unicode.utf8Encode(icp, &cp_buf);
@@ -59,7 +85,7 @@ test "Unicode normalization tests" {
59 var w_buf = std.ArrayList(u8).init(allocator); 85 var w_buf = std.ArrayList(u8).init(allocator);
60 defer w_buf.deinit(); 86 defer w_buf.deinit();
61 87
62 var w_fields = mem.split(u8, field, " "); 88 var w_fields = mem.splitScalar(u8, field, ' ');
63 while (w_fields.next()) |s| { 89 while (w_fields.next()) |s| {
64 const wcp = try fmt.parseInt(u21, s, 16); 90 const wcp = try fmt.parseInt(u21, s, 16);
65 const len = try unicode.utf8Encode(wcp, &cp_buf); 91 const len = try unicode.utf8Encode(wcp, &cp_buf);
@@ -76,7 +102,7 @@ test "Unicode normalization tests" {
76 var w_buf = std.ArrayList(u8).init(allocator); 102 var w_buf = std.ArrayList(u8).init(allocator);
77 defer w_buf.deinit(); 103 defer w_buf.deinit();
78 104
79 var w_fields = mem.split(u8, field, " "); 105 var w_fields = mem.splitScalar(u8, field, ' ');
80 while (w_fields.next()) |s| { 106 while (w_fields.next()) |s| {
81 const wcp = try fmt.parseInt(u21, s, 16); 107 const wcp = try fmt.parseInt(u21, s, 16);
82 const len = try unicode.utf8Encode(wcp, &cp_buf); 108 const len = try unicode.utf8Encode(wcp, &cp_buf);
@@ -93,7 +119,7 @@ test "Unicode normalization tests" {
93 var w_buf = std.ArrayList(u8).init(allocator); 119 var w_buf = std.ArrayList(u8).init(allocator);
94 defer w_buf.deinit(); 120 defer w_buf.deinit();
95 121
96 var w_fields = mem.split(u8, field, " "); 122 var w_fields = mem.splitScalar(u8, field, ' ');
97 while (w_fields.next()) |s| { 123 while (w_fields.next()) |s| {
98 const wcp = try fmt.parseInt(u21, s, 16); 124 const wcp = try fmt.parseInt(u21, s, 16);
99 const len = try unicode.utf8Encode(wcp, &cp_buf); 125 const len = try unicode.utf8Encode(wcp, &cp_buf);
@@ -110,7 +136,7 @@ test "Unicode normalization tests" {
110 var w_buf = std.ArrayList(u8).init(allocator); 136 var w_buf = std.ArrayList(u8).init(allocator);
111 defer w_buf.deinit(); 137 defer w_buf.deinit();
112 138
113 var w_fields = mem.split(u8, field, " "); 139 var w_fields = mem.splitScalar(u8, field, ' ');
114 while (w_fields.next()) |s| { 140 while (w_fields.next()) |s| {
115 const wcp = try fmt.parseInt(u21, s, 16); 141 const wcp = try fmt.parseInt(u21, s, 16);
116 const len = try unicode.utf8Encode(wcp, &cp_buf); 142 const len = try unicode.utf8Encode(wcp, &cp_buf);
@@ -158,11 +184,11 @@ test "Segmentation GraphemeIterator" {
158 var all_bytes = std.ArrayList(u8).init(allocator); 184 var all_bytes = std.ArrayList(u8).init(allocator);
159 defer all_bytes.deinit(); 185 defer all_bytes.deinit();
160 186
161 var graphemes = std.mem.split(u8, line, " ÷ "); 187 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
162 var bytes_index: u32 = 0; 188 var bytes_index: u32 = 0;
163 189
164 while (graphemes.next()) |field| { 190 while (graphemes.next()) |field| {
165 var code_points = std.mem.split(u8, field, " "); 191 var code_points = std.mem.splitScalar(u8, field, ' ');
166 var cp_buf: [4]u8 = undefined; 192 var cp_buf: [4]u8 = undefined;
167 var cp_index: u32 = 0; 193 var cp_index: u32 = 0;
168 var gc_len: u8 = 0; 194 var gc_len: u8 = 0;