summary | refs | log | tree | commit | diff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
author: Sam Atman, 2025-12-23 09:34:19 -0500
committer: Sam Atman, 2025-12-23 09:34:19 -0500
commit: 79b133e5d88fe6cfce337dd401fc09999db08852 (patch)
tree: 8b3f9062edde82724c73147abf42143a885640fc /src/unicode_tests.zig
parent: Merge branch 'develop-next' (diff)
parent: Use takeDelimiterInclusive to support Zig 0.15.2 (diff)
download: zg-79b133e5d88fe6cfce337dd401fc09999db08852.tar.gz
zg-79b133e5d88fe6cfce337dd401fc09999db08852.tar.xz
zg-79b133e5d88fe6cfce337dd401fc09999db08852.zip
Merge branch 'fifteen-two'
Close #90 Close #87 Close #83 Thanks everyone.
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r-- src/unicode_tests.zig 94
1 files changed, 45 insertions, 49 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ae177a9..e2a5a96 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -3,35 +3,30 @@ const dbg_print = false;
3test "Unicode normalization tests" { 3test "Unicode normalization tests" {
4 var arena = heap.ArenaAllocator.init(testing.allocator); 4 var arena = heap.ArenaAllocator.init(testing.allocator);
5 defer arena.deinit(); 5 defer arena.deinit();
6 var allocator = arena.allocator(); 6 const allocator = arena.allocator();
7 7
8 const n = try Normalize.init(allocator); 8 const n = try Normalize.init(allocator);
9 defer n.deinit(allocator); 9 defer n.deinit(allocator);
10 10
11 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 11 var reader = std.io.Reader.fixed(@embedFile("NormalizationTest.txt"));
12 defer file.close();
13 var buf_reader = io.bufferedReader(file.reader());
14 var input_stream = buf_reader.reader();
15
16 var buf: [4096]u8 = undefined;
17 var cp_buf: [4]u8 = undefined; 12 var cp_buf: [4]u8 = undefined;
18 13
19 var line_iter: IterRead = .{ .read = &input_stream }; 14 var line_iter: IterRead = .{ .read = &reader };
20 15
21 while (try line_iter.next(&buf)) |line| { 16 while (line_iter.next()) |line| {
22 // Iterate over fields. 17 // Iterate over fields.
23 var fields = mem.splitScalar(u8, line, ';'); 18 var fields = mem.splitScalar(u8, line, ';');
24 var field_index: usize = 0; 19 var field_index: usize = 0;
25 var input: []u8 = undefined; 20 var input: []u8 = undefined;
26 defer allocator.free(input); 21 if (dbg_print) std.debug.print("Line: {s}\n", .{line});
27
28 while (fields.next()) |field| : (field_index += 1) { 22 while (fields.next()) |field| : (field_index += 1) {
29 if (field_index == 0) { 23 if (field_index == 0) {
30 var i_buf = std.ArrayList(u8).init(allocator); 24 var i_buf = std.array_list.Managed(u8).init(allocator);
31 defer i_buf.deinit(); 25 defer i_buf.deinit();
32 26
33 var i_fields = mem.splitScalar(u8, field, ' '); 27 var i_fields = mem.splitScalar(u8, field, ' ');
34 while (i_fields.next()) |s| { 28 while (i_fields.next()) |s| {
29 if (dbg_print) std.debug.print("Debug: {s}\n", .{s});
35 const icp = try fmt.parseInt(u21, s, 16); 30 const icp = try fmt.parseInt(u21, s, 16);
36 const len = try unicode.utf8Encode(icp, &cp_buf); 31 const len = try unicode.utf8Encode(icp, &cp_buf);
37 try i_buf.appendSlice(cp_buf[0..len]); 32 try i_buf.appendSlice(cp_buf[0..len]);
@@ -41,7 +36,7 @@ test "Unicode normalization tests" {
41 } else if (field_index == 1) { 36 } else if (field_index == 1) {
42 if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); 37 if (dbg_print) debug.print("\n*** {s} ***\n", .{line});
43 // NFC, time to test. 38 // NFC, time to test.
44 var w_buf = std.ArrayList(u8).init(allocator); 39 var w_buf = std.array_list.Managed(u8).init(allocator);
45 defer w_buf.deinit(); 40 defer w_buf.deinit();
46 41
47 var w_fields = mem.splitScalar(u8, field, ' '); 42 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -58,7 +53,7 @@ test "Unicode normalization tests" {
58 try testing.expectEqualStrings(want, got.slice); 53 try testing.expectEqualStrings(want, got.slice);
59 } else if (field_index == 2) { 54 } else if (field_index == 2) {
60 // NFD, time to test. 55 // NFD, time to test.
61 var w_buf = std.ArrayList(u8).init(allocator); 56 var w_buf = std.array_list.Managed(u8).init(allocator);
62 defer w_buf.deinit(); 57 defer w_buf.deinit();
63 58
64 var w_fields = mem.splitScalar(u8, field, ' '); 59 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -75,7 +70,7 @@ test "Unicode normalization tests" {
75 try testing.expectEqualStrings(want, got.slice); 70 try testing.expectEqualStrings(want, got.slice);
76 } else if (field_index == 3) { 71 } else if (field_index == 3) {
77 // NFKC, time to test. 72 // NFKC, time to test.
78 var w_buf = std.ArrayList(u8).init(allocator); 73 var w_buf = std.array_list.Managed(u8).init(allocator);
79 defer w_buf.deinit(); 74 defer w_buf.deinit();
80 75
81 var w_fields = mem.splitScalar(u8, field, ' '); 76 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -92,7 +87,7 @@ test "Unicode normalization tests" {
92 try testing.expectEqualStrings(want, got.slice); 87 try testing.expectEqualStrings(want, got.slice);
93 } else if (field_index == 4) { 88 } else if (field_index == 4) {
94 // NFKD, time to test. 89 // NFKD, time to test.
95 var w_buf = std.ArrayList(u8).init(allocator); 90 var w_buf = std.array_list.Managed(u8).init(allocator);
96 defer w_buf.deinit(); 91 defer w_buf.deinit();
97 92
98 var w_fields = mem.splitScalar(u8, field, ' '); 93 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -111,33 +106,34 @@ test "Unicode normalization tests" {
111 continue; 106 continue;
112 } 107 }
113 } 108 }
109 } else |err| switch (err) {
110 error.EndOfStream => {},
111 else => {
112 return err;
113 },
114 } 114 }
115} 115}
116 116
117test "Segmentation GraphemeIterator" { 117test "Segmentation GraphemeIterator" {
118 const allocator = std.testing.allocator; 118 const allocator = std.testing.allocator;
119 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
120 defer file.close();
121 var buf_reader = std.io.bufferedReader(file.reader());
122 var input_stream = buf_reader.reader();
123 119
120 var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt"));
124 const graph = try Graphemes.init(allocator); 121 const graph = try Graphemes.init(allocator);
125 defer graph.deinit(allocator); 122 defer graph.deinit(allocator);
126 123
127 var buf: [4096]u8 = undefined; 124 var line_iter: IterRead = .{ .read = &reader };
128 var line_iter: IterRead = .{ .read = &input_stream };
129 125
130 while (try line_iter.next(&buf)) |raw| { 126 while (line_iter.next()) |raw| {
131 // Clean up. 127 // Clean up.
132 var line = std.mem.trimLeft(u8, raw, "÷ "); 128 var line = std.mem.trimLeft(u8, raw, "÷ ");
133 if (std.mem.indexOf(u8, line, " ÷\t")) |final| { 129 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
134 line = line[0..final]; 130 line = line[0..final];
135 } 131 }
136 // Iterate over fields. 132 // Iterate over fields.
137 var want = std.ArrayList(Grapheme).init(allocator); 133 var want = std.array_list.Managed(Grapheme).init(allocator);
138 defer want.deinit(); 134 defer want.deinit();
139 135
140 var all_bytes = std.ArrayList(u8).init(allocator); 136 var all_bytes = std.array_list.Managed(u8).init(allocator);
141 defer all_bytes.deinit(); 137 defer all_bytes.deinit();
142 138
143 var graphemes = std.mem.splitSequence(u8, line, " ÷ "); 139 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
@@ -250,33 +246,33 @@ test "Segmentation GraphemeIterator" {
250 } 246 }
251 } 247 }
252 } 248 }
249 } else |err| switch (err) {
250 error.EndOfStream => {},
251 else => {
252 return err;
253 },
253 } 254 }
254} 255}
255 256
256test "Segmentation Word Iterator" { 257test "Segmentation Word Iterator" {
257 const allocator = std.testing.allocator; 258 const allocator = std.testing.allocator;
258 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); 259 var reader = std.io.Reader.fixed(@embedFile("WordBreakTest.txt"));
259 defer file.close();
260 var buf_reader = std.io.bufferedReader(file.reader());
261 var input_stream = buf_reader.reader();
262
263 const wb = try Words.init(allocator); 260 const wb = try Words.init(allocator);
264 defer wb.deinit(allocator); 261 defer wb.deinit(allocator);
265 262
266 var buf: [4096]u8 = undefined; 263 var line_iter: IterRead = .{ .read = &reader };
267 var line_iter: IterRead = .{ .read = &input_stream };
268 264
269 while (try line_iter.next(&buf)) |raw| { 265 while (line_iter.next()) |raw| {
270 // Clean up. 266 // Clean up.
271 var line = std.mem.trimLeft(u8, raw, "÷ "); 267 var line = std.mem.trimLeft(u8, raw, "÷ ");
272 if (std.mem.indexOf(u8, line, " ÷\t")) |final| { 268 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
273 line = line[0..final]; 269 line = line[0..final];
274 } 270 }
275 // Iterate over fields. 271 // Iterate over fields.
276 var want = std.ArrayList(Word).init(allocator); 272 var want = std.array_list.Managed(Word).init(allocator);
277 defer want.deinit(); 273 defer want.deinit();
278 274
279 var all_bytes = std.ArrayList(u8).init(allocator); 275 var all_bytes = std.array_list.Managed(u8).init(allocator);
280 defer all_bytes.deinit(); 276 defer all_bytes.deinit();
281 277
282 var words = std.mem.splitSequence(u8, line, " ÷ "); 278 var words = std.mem.splitSequence(u8, line, " ÷ ");
@@ -439,26 +435,27 @@ test "Segmentation Word Iterator" {
439 if (idx == 0) break; 435 if (idx == 0) break;
440 } 436 }
441 } 437 }
438 } else |err| switch (err) {
439 error.EndOfStream => {},
440 else => {
441 return err;
442 },
442 } 443 }
443} 444}
444 445
445const IterRead = struct { 446const IterRead = struct {
446 read: *Reader, 447 read: *io.Reader,
447 line: usize = 0, 448 line: usize = 0,
448 449
449 pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { 450 pub fn next(iter: *IterRead) anyerror![]const u8 {
450 defer iter.line += 1; 451 iter.line += 1;
451 const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); 452 const took = try iter.read.takeDelimiterInclusive('\n');
452 if (maybe_line) |this_line| { 453 const this_line = std.mem.trimRight(u8, took, "\n");
453 try iter.read.skipUntilDelimiterOrEof('\n'); 454 if (this_line.len == 0 or this_line[0] == '@' or this_line[0] == '#') {
454 if (this_line.len == 0 or this_line[0] == '@') { 455 // comment, next line
455 // comment, next line 456 return iter.next();
456 return iter.next(buf);
457 } else {
458 return this_line;
459 }
460 } else { 457 } else {
461 return null; 458 return this_line;
462 } 459 }
463 } 460 }
464}; 461};
@@ -467,7 +464,6 @@ const std = @import("std");
467const fmt = std.fmt; 464const fmt = std.fmt;
468const fs = std.fs; 465const fs = std.fs;
469const io = std.io; 466const io = std.io;
470const Reader = io.BufferedReader(4096, fs.File.Reader).Reader;
471const heap = std.heap; 467const heap = std.heap;
472const mem = std.mem; 468const mem = std.mem;
473const debug = std.debug; 469const debug = std.debug;