summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-13 13:50:22 -0400
committerGravatar Jose Colon Rodriguez2024-02-13 13:50:22 -0400
commitbbe4bc5a08a042b47a3474e8e9caf20e216634a8 (patch)
tree3a2d19765f560481d74b6c2abb3696db573a70e3 /src
parentMerge table (diff)
downloadzg-bbe4bc5a08a042b47a3474e8e9caf20e216634a8.tar.gz
zg-bbe4bc5a08a042b47a3474e8e9caf20e216634a8.tar.xz
zg-bbe4bc5a08a042b47a3474e8e9caf20e216634a8.zip
Passing ziglyph tests
Diffstat (limited to 'src')
-rw-r--r--src/Grapheme.zig248
-rw-r--r--src/gbp_gen.zig14
-rw-r--r--src/main.zig4
3 files changed, 250 insertions, 16 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index a8a7638..1e9606f 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -3,10 +3,12 @@
3const std = @import("std"); 3const std = @import("std");
4const unicode = std.unicode; 4const unicode = std.unicode;
5 5
6const CodePoint = @import("ziglyph").CodePoint; 6const ziglyph = @import("ziglyph");
7const CodePoint = ziglyph.CodePoint;
7const CodePointIterator = CodePoint.CodePointIterator; 8const CodePointIterator = CodePoint.CodePointIterator;
8const emoji = @import("ziglyph").emoji; 9const readCodePoint = CodePoint.readCodePoint;
9 10const emoji = ziglyph.emoji;
11// const gbp = ziglyph.grapheme_break;
10const gbp = @import("gbp"); 12const gbp = @import("gbp");
11 13
12pub const Grapheme = @This(); 14pub const Grapheme = @This();
@@ -78,6 +80,171 @@ pub const GraphemeIterator = struct {
78 } 80 }
79}; 81};
80 82
/// `StreamingGraphemeIterator` iterates a `std.io.Reader` one grapheme cluster at-a-time.
/// Note that, given the streaming context, each grapheme cluster is returned as a slice of bytes.
///
/// `T` is the concrete reader type; it is handed to `readCodePoint` to decode
/// one code point at a time.
pub fn StreamingGraphemeIterator(comptime T: type) type {
    return struct {
        allocator: std.mem.Allocator,
        /// `buf[0]` holds the code point most recently returned by `advance`;
        /// `buf[1]` is the one-code-point lookahead (`null` once the input has ended).
        buf: [2]?u21 = [_]?u21{ null, null },
        reader: T,

        const Self = @This();

        /// Creates the iterator and primes the lookahead slot with the first
        /// code point of `reader`. Fails with whatever `readCodePoint` reports.
        pub fn init(allocator: std.mem.Allocator, reader: anytype) !Self {
            var self = Self{ .allocator = allocator, .reader = reader };
            self.buf[1] = try readCodePoint(self.reader);

            return self;
        }

        /// Returns the UTF-8 bytes of the next grapheme cluster, or `null` at end of input.
        /// Caller must free returned bytes with `allocator` passed to `init`.
        ///
        /// Read errors from the underlying reader and allocation failures are
        /// propagated; the partially built cluster is freed in that case.
        pub fn next(self: *Self) !?[]u8 {
            const code = (try self.advance()) orelse return null;

            var all_bytes = std.ArrayList(u8).init(self.allocator);
            errdefer all_bytes.deinit();

            try encodeAndAppend(code, &all_bytes);

            // If at end, the cluster is just `code`.
            const lookahead = self.peek() orelse return try all_bytes.toOwnedSlice();

            // Instant breakers
            // CR
            if (code == '\x0d') {
                // CRLF stays together.
                if (lookahead == '\x0a') try self.consume(&all_bytes);

                return try all_bytes.toOwnedSlice();
            }
            // LF and Control always end the cluster.
            if (code == '\x0a' or gbp.isControl(code)) return try all_bytes.toOwnedSlice();

            // Common chars: only trailing ignorables can attach.
            if (code < 0xa9) {
                try self.consumeIgnorables(&all_bytes);
                return try all_bytes.toOwnedSlice();
            }

            if (emoji.isExtendedPictographic(code)) {
                // True only while the code point consumed immediately before
                // the lookahead was a ZWJ.
                var after_zwj = false;

                // Extend / ignorables loop, additionally joining a pictographic
                // that directly follows a ZWJ.
                while (self.peek()) |next_cp| {
                    if (next_cp < 0x300) break;

                    if (after_zwj and emoji.isExtendedPictographic(next_cp)) {
                        try self.consume(&all_bytes);
                        after_zwj = false;
                    } else if (isIgnorable(next_cp)) {
                        try self.consume(&all_bytes);
                        // Reset on any non-ZWJ ignorable so that
                        // `ZWJ Extend ExtPict` is not joined: the ZWJ must
                        // immediately precede the pictographic.
                        after_zwj = next_cp == '\u{200d}';
                    } else {
                        break;
                    }
                }

                return try all_bytes.toOwnedSlice();
            }

            if (0x1100 <= code and code <= 0xd7c6) {
                // Hangul syllable sequences: at most one following code point
                // is joined here, chosen by the class of `code`.
                if (lookahead >= 0x1100) {
                    const joins = if (gbp.isL(code))
                        gbp.isL(lookahead) or
                            gbp.isV(lookahead) or
                            gbp.isLv(lookahead) or
                            gbp.isLvt(lookahead)
                    else if (gbp.isLv(code) or gbp.isV(code))
                        gbp.isV(lookahead) or gbp.isT(lookahead)
                    else if (gbp.isLvt(code) or gbp.isT(code))
                        gbp.isT(lookahead)
                    else
                        false;

                    if (joins) try self.consume(&all_bytes);
                }
            } else if (0x600 <= code and code <= 0x11f02) {
                // Prepend attaches to whatever follows unless that is a breaker.
                if (gbp.isPrepend(code)) {
                    if (isBreaker(lookahead)) return try all_bytes.toOwnedSlice();
                    try self.consume(&all_bytes);
                }
            } else if (0x1f1e6 <= code and code <= 0x1f1ff) {
                // Regional indicators pair up.
                if (gbp.isRegionalIndicator(code) and
                    lookahead >= 0x1f1e6 and
                    gbp.isRegionalIndicator(lookahead))
                {
                    try self.consume(&all_bytes);
                }
            }

            // Extend / ignorables loop
            try self.consumeIgnorables(&all_bytes);

            return try all_bytes.toOwnedSlice();
        }

        /// Shifts the lookahead into `buf[0]`, reads a fresh lookahead from the
        /// reader, and returns the code point that was shifted out.
        fn advance(self: *Self) !?u21 {
            self.buf[0] = self.buf[1];
            self.buf[1] = try readCodePoint(self.reader);

            return self.buf[0];
        }

        /// Returns the lookahead code point without consuming it.
        fn peek(self: Self) ?u21 {
            return self.buf[1];
        }

        /// Appends the lookahead code point to `list` and advances past it.
        /// Does nothing when there is no lookahead.
        fn consume(self: *Self, list: *std.ArrayList(u8)) !void {
            const cp = self.peek() orelse return;
            try encodeAndAppend(cp, list);
            // Reading the next code point is real I/O and may fail; propagate
            // instead of asserting it cannot.
            _ = try self.advance();
        }

        /// Consumes every consecutive lookahead code point that is >= U+0300
        /// and satisfies `isIgnorable`.
        fn consumeIgnorables(self: *Self, list: *std.ArrayList(u8)) !void {
            while (self.peek()) |next_cp| {
                if (next_cp < 0x300 or !isIgnorable(next_cp)) break;
                try self.consume(list);
            }
        }

        /// UTF-8 encodes `cp` and appends the bytes to `list`.
        fn encodeAndAppend(cp: u21, list: *std.ArrayList(u8)) !void {
            var tmp: [4]u8 = undefined;
            const len = try unicode.utf8Encode(cp, &tmp);
            try list.appendSlice(tmp[0..len]);
        }
    };
}
247
81// Predicates 248// Predicates
82fn isBreaker(cp: u21) bool { 249fn isBreaker(cp: u21) bool {
83 return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); 250 return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp);
@@ -100,6 +267,22 @@ test "Segmentation comptime GraphemeIterator" {
100 } 267 }
101} 268}
102 269
// Feeds a short UTF-8 string through `StreamingGraphemeIterator` via a fixed
// buffer reader and checks each returned cluster, then checks for end of input.
test "Simple StreamingGraphemeIterator" {
    const allocator = std.testing.allocator;

    var buf = "abe\u{301}😹".*;
    var fis = std.io.fixedBufferStream(&buf);
    const reader = fis.reader();
    var iter = try StreamingGraphemeIterator(@TypeOf(reader)).init(allocator, reader);
    const want = [_][]const u8{ "a", "b", "e\u{301}", "😹" };

    for (want) |str| {
        // Fail the test (rather than panic) if the iterator runs out early.
        const gc = (try iter.next()) orelse return error.TestUnexpectedResult;
        defer allocator.free(gc);
        // Expected value goes first so failure output labels the two correctly.
        try std.testing.expectEqualStrings(str, gc);
    }

    try std.testing.expectEqual(@as(?[]u8, null), try iter.next());
}
285
103test "Segmentation ZWJ and ZWSP emoji sequences" { 286test "Segmentation ZWJ and ZWSP emoji sequences" {
104 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 287 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
105 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 288 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -218,3 +401,62 @@ pub fn graphemeBreak(
218 401
219 return true; 402 return true;
220} 403}
404
// Runs `GraphemeIterator` against every case in `GraphemeBreakTest.txt`
// (opened relative to the current working directory). Each non-comment line
// lists hex code points separated by `÷` (break) and `×` (no break).
test "Segmentation GraphemeIterator" {
    const allocator = std.testing.allocator;
    const file = try std.fs.cwd().openFile("GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip comments or empty lines.
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // Clean up: drop the leading break marker and the trailing comment.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }

        // Expected clusters, as offset/length pairs into `all_bytes`.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of the whole test line.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        // Iterate over fields; each field is one expected grapheme cluster.
        var graphemes = std.mem.split(u8, line, " ÷ ");

        while (graphemes.next()) |field| {
            var code_points = std.mem.split(u8, field, " ");
            var cp_buf: [4]u8 = undefined;
            // The cluster starts where the bytes encoded so far end.
            const offset = all_bytes.items.len;

            while (code_points.next()) |code_point| {
                if (std.mem.eql(u8, code_point, "×")) continue;
                const cp = try std.fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
            }

            try want.append(Grapheme{ .len = all_bytes.items.len - offset, .offset = offset });
        }

        var iter = GraphemeIterator.init(all_bytes.items);

        // Check.
        for (want.items) |w| {
            const g = iter.next() orelse {
                std.debug.print("line {d}: iterator ended before all expected clusters were produced\n", .{line_no});
                return error.TestUnexpectedResult;
            };
            try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len]));
        }

        // The iterator must not yield anything beyond the expected clusters.
        try std.testing.expect(iter.next() == null);
    }
}
diff --git a/src/gbp_gen.zig b/src/gbp_gen.zig
index afc54fc..7e27f35 100644
--- a/src/gbp_gen.zig
+++ b/src/gbp_gen.zig
@@ -62,8 +62,6 @@ pub fn main() !void {
62 var blocks_map = BlockMap.init(allocator); 62 var blocks_map = BlockMap.init(allocator);
63 defer blocks_map.deinit(); 63 defer blocks_map.deinit();
64 64
65 const no_prop = std.math.maxInt(u16);
66
67 var stage1 = std.ArrayList(u16).init(allocator); 65 var stage1 = std.ArrayList(u16).init(allocator);
68 defer stage1.deinit(); 66 defer stage1.deinit();
69 67
@@ -101,12 +99,7 @@ pub fn main() !void {
101 try stage2.appendSlice(block[0..block_len]); 99 try stage2.appendSlice(block[0..block_len]);
102 } 100 }
103 101
104 if (prop == .none) { 102 try stage1.append(gop.value_ptr.*);
105 try stage1.append(no_prop);
106 } else {
107 try stage1.append(gop.value_ptr.*);
108 }
109
110 block_len = 0; 103 block_len = 0;
111 } 104 }
112 105
@@ -120,8 +113,6 @@ pub fn main() !void {
120 const writer = out_buf.writer(); 113 const writer = out_buf.writer();
121 114
122 const prop_code = 115 const prop_code =
123 \\const std = @import("std");
124 \\
125 \\const Prop = enum { 116 \\const Prop = enum {
126 \\ none, 117 \\ none,
127 \\ 118 \\
@@ -161,11 +152,8 @@ pub fn main() !void {
161 try writer.writeAll("};\n"); 152 try writer.writeAll("};\n");
162 153
163 const code = 154 const code =
164 \\const no_prop = std.math.maxInt(u16);
165 \\
166 \\inline fn getProp(cp: u21) Prop { 155 \\inline fn getProp(cp: u21) Prop {
167 \\ const stage_1_index = cp >> 8; 156 \\ const stage_1_index = cp >> 8;
168 \\ if (stage_1[stage_1_index] == no_prop) return .none;
169 \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); 157 \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff);
170 \\ const stage_3_index = stage_2[stage_2_index]; 158 \\ const stage_3_index = stage_2[stage_2_index];
171 \\ return stage_3[stage_3_index]; 159 \\ return stage_3[stage_3_index];
diff --git a/src/main.zig b/src/main.zig
index ca167e8..8335530 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -23,3 +23,7 @@ pub fn main() !void {
23 23
24 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 24 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });
25} 25}
26
// Anonymous test block: importing Grapheme.zig here references that file from
// this one, so its `test` declarations are picked up when this file is tested.
27test {
28 _ = @import("Grapheme.zig");
29}