summaryrefslogtreecommitdiff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r--src/unicode_tests.zig88
1 files changed, 47 insertions, 41 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ae177a9..ff49b2a 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -3,35 +3,34 @@ const dbg_print = false;
3test "Unicode normalization tests" { 3test "Unicode normalization tests" {
4 var arena = heap.ArenaAllocator.init(testing.allocator); 4 var arena = heap.ArenaAllocator.init(testing.allocator);
5 defer arena.deinit(); 5 defer arena.deinit();
6 var allocator = arena.allocator(); 6 const allocator = arena.allocator();
7 7
8 const n = try Normalize.init(allocator); 8 const n = try Normalize.init(allocator);
9 defer n.deinit(allocator); 9 defer n.deinit(allocator);
10 10
11 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 11 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
12 defer file.close(); 12 defer file.close();
13 var buf_reader = io.bufferedReader(file.reader());
14 var input_stream = buf_reader.reader();
15
16 var buf: [4096]u8 = undefined; 13 var buf: [4096]u8 = undefined;
14 var reader = file.reader(&buf);
15
17 var cp_buf: [4]u8 = undefined; 16 var cp_buf: [4]u8 = undefined;
18 17
19 var line_iter: IterRead = .{ .read = &input_stream }; 18 var line_iter: IterRead = .{ .read = &reader.interface };
20 19
21 while (try line_iter.next(&buf)) |line| { 20 while (line_iter.next()) |line| {
22 // Iterate over fields. 21 // Iterate over fields.
23 var fields = mem.splitScalar(u8, line, ';'); 22 var fields = mem.splitScalar(u8, line, ';');
24 var field_index: usize = 0; 23 var field_index: usize = 0;
25 var input: []u8 = undefined; 24 var input: []u8 = undefined;
26 defer allocator.free(input); 25 if (dbg_print) std.debug.print("Line: {s}\n", .{line});
27
28 while (fields.next()) |field| : (field_index += 1) { 26 while (fields.next()) |field| : (field_index += 1) {
29 if (field_index == 0) { 27 if (field_index == 0) {
30 var i_buf = std.ArrayList(u8).init(allocator); 28 var i_buf = std.array_list.Managed(u8).init(allocator);
31 defer i_buf.deinit(); 29 defer i_buf.deinit();
32 30
33 var i_fields = mem.splitScalar(u8, field, ' '); 31 var i_fields = mem.splitScalar(u8, field, ' ');
34 while (i_fields.next()) |s| { 32 while (i_fields.next()) |s| {
33 if (dbg_print) std.debug.print("Debug: {s}\n", .{s});
35 const icp = try fmt.parseInt(u21, s, 16); 34 const icp = try fmt.parseInt(u21, s, 16);
36 const len = try unicode.utf8Encode(icp, &cp_buf); 35 const len = try unicode.utf8Encode(icp, &cp_buf);
37 try i_buf.appendSlice(cp_buf[0..len]); 36 try i_buf.appendSlice(cp_buf[0..len]);
@@ -41,7 +40,7 @@ test "Unicode normalization tests" {
41 } else if (field_index == 1) { 40 } else if (field_index == 1) {
42 if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); 41 if (dbg_print) debug.print("\n*** {s} ***\n", .{line});
43 // NFC, time to test. 42 // NFC, time to test.
44 var w_buf = std.ArrayList(u8).init(allocator); 43 var w_buf = std.array_list.Managed(u8).init(allocator);
45 defer w_buf.deinit(); 44 defer w_buf.deinit();
46 45
47 var w_fields = mem.splitScalar(u8, field, ' '); 46 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -58,7 +57,7 @@ test "Unicode normalization tests" {
58 try testing.expectEqualStrings(want, got.slice); 57 try testing.expectEqualStrings(want, got.slice);
59 } else if (field_index == 2) { 58 } else if (field_index == 2) {
60 // NFD, time to test. 59 // NFD, time to test.
61 var w_buf = std.ArrayList(u8).init(allocator); 60 var w_buf = std.array_list.Managed(u8).init(allocator);
62 defer w_buf.deinit(); 61 defer w_buf.deinit();
63 62
64 var w_fields = mem.splitScalar(u8, field, ' '); 63 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -75,7 +74,7 @@ test "Unicode normalization tests" {
75 try testing.expectEqualStrings(want, got.slice); 74 try testing.expectEqualStrings(want, got.slice);
76 } else if (field_index == 3) { 75 } else if (field_index == 3) {
77 // NFKC, time to test. 76 // NFKC, time to test.
78 var w_buf = std.ArrayList(u8).init(allocator); 77 var w_buf = std.array_list.Managed(u8).init(allocator);
79 defer w_buf.deinit(); 78 defer w_buf.deinit();
80 79
81 var w_fields = mem.splitScalar(u8, field, ' '); 80 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -92,7 +91,7 @@ test "Unicode normalization tests" {
92 try testing.expectEqualStrings(want, got.slice); 91 try testing.expectEqualStrings(want, got.slice);
93 } else if (field_index == 4) { 92 } else if (field_index == 4) {
94 // NFKD, time to test. 93 // NFKD, time to test.
95 var w_buf = std.ArrayList(u8).init(allocator); 94 var w_buf = std.array_list.Managed(u8).init(allocator);
96 defer w_buf.deinit(); 95 defer w_buf.deinit();
97 96
98 var w_fields = mem.splitScalar(u8, field, ' '); 97 var w_fields = mem.splitScalar(u8, field, ' ');
@@ -111,6 +110,11 @@ test "Unicode normalization tests" {
111 continue; 110 continue;
112 } 111 }
113 } 112 }
113 } else |err| switch (err) {
114 error.EndOfStream => {},
115 else => {
116 return err;
117 },
114 } 118 }
115} 119}
116 120
@@ -118,26 +122,25 @@ test "Segmentation GraphemeIterator" {
118 const allocator = std.testing.allocator; 122 const allocator = std.testing.allocator;
119 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); 123 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
120 defer file.close(); 124 defer file.close();
121 var buf_reader = std.io.bufferedReader(file.reader()); 125 var buf: [4096]u8 = undefined;
122 var input_stream = buf_reader.reader(); 126 var reader = file.reader(&buf);
123 127
124 const graph = try Graphemes.init(allocator); 128 const graph = try Graphemes.init(allocator);
125 defer graph.deinit(allocator); 129 defer graph.deinit(allocator);
126 130
127 var buf: [4096]u8 = undefined; 131 var line_iter: IterRead = .{ .read = &reader.interface };
128 var line_iter: IterRead = .{ .read = &input_stream };
129 132
130 while (try line_iter.next(&buf)) |raw| { 133 while (line_iter.next()) |raw| {
131 // Clean up. 134 // Clean up.
132 var line = std.mem.trimLeft(u8, raw, "÷ "); 135 var line = std.mem.trimLeft(u8, raw, "÷ ");
133 if (std.mem.indexOf(u8, line, " ÷\t")) |final| { 136 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
134 line = line[0..final]; 137 line = line[0..final];
135 } 138 }
136 // Iterate over fields. 139 // Iterate over fields.
137 var want = std.ArrayList(Grapheme).init(allocator); 140 var want = std.array_list.Managed(Grapheme).init(allocator);
138 defer want.deinit(); 141 defer want.deinit();
139 142
140 var all_bytes = std.ArrayList(u8).init(allocator); 143 var all_bytes = std.array_list.Managed(u8).init(allocator);
141 defer all_bytes.deinit(); 144 defer all_bytes.deinit();
142 145
143 var graphemes = std.mem.splitSequence(u8, line, " ÷ "); 146 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
@@ -250,6 +253,11 @@ test "Segmentation GraphemeIterator" {
250 } 253 }
251 } 254 }
252 } 255 }
256 } else |err| switch (err) {
257 error.EndOfStream => {},
258 else => {
259 return err;
260 },
253 } 261 }
254} 262}
255 263
@@ -257,26 +265,25 @@ test "Segmentation Word Iterator" {
257 const allocator = std.testing.allocator; 265 const allocator = std.testing.allocator;
258 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); 266 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
259 defer file.close(); 267 defer file.close();
260 var buf_reader = std.io.bufferedReader(file.reader()); 268 var buf: [4096]u8 = undefined;
261 var input_stream = buf_reader.reader(); 269 var reader = file.reader(&buf);
262 270
263 const wb = try Words.init(allocator); 271 const wb = try Words.init(allocator);
264 defer wb.deinit(allocator); 272 defer wb.deinit(allocator);
265 273
266 var buf: [4096]u8 = undefined; 274 var line_iter: IterRead = .{ .read = &reader.interface };
267 var line_iter: IterRead = .{ .read = &input_stream };
268 275
269 while (try line_iter.next(&buf)) |raw| { 276 while (line_iter.next()) |raw| {
270 // Clean up. 277 // Clean up.
271 var line = std.mem.trimLeft(u8, raw, "÷ "); 278 var line = std.mem.trimLeft(u8, raw, "÷ ");
272 if (std.mem.indexOf(u8, line, " ÷\t")) |final| { 279 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
273 line = line[0..final]; 280 line = line[0..final];
274 } 281 }
275 // Iterate over fields. 282 // Iterate over fields.
276 var want = std.ArrayList(Word).init(allocator); 283 var want = std.array_list.Managed(Word).init(allocator);
277 defer want.deinit(); 284 defer want.deinit();
278 285
279 var all_bytes = std.ArrayList(u8).init(allocator); 286 var all_bytes = std.array_list.Managed(u8).init(allocator);
280 defer all_bytes.deinit(); 287 defer all_bytes.deinit();
281 288
282 var words = std.mem.splitSequence(u8, line, " ÷ "); 289 var words = std.mem.splitSequence(u8, line, " ÷ ");
@@ -439,26 +446,26 @@ test "Segmentation Word Iterator" {
439 if (idx == 0) break; 446 if (idx == 0) break;
440 } 447 }
441 } 448 }
449 } else |err| switch (err) {
450 error.EndOfStream => {},
451 else => {
452 return err;
453 },
442 } 454 }
443} 455}
444 456
445const IterRead = struct { 457const IterRead = struct {
446 read: *Reader, 458 read: *io.Reader,
447 line: usize = 0, 459 line: usize = 0,
448 460
449 pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { 461 pub fn next(iter: *IterRead) anyerror![]const u8 {
450 defer iter.line += 1; 462 iter.line += 1;
451 const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); 463 const this_line = try iter.read.takeDelimiterExclusive('\n');
452 if (maybe_line) |this_line| { 464 if (this_line.len == 0 or this_line[0] == '@' or this_line[0] == '#') {
453 try iter.read.skipUntilDelimiterOrEof('\n'); 465 // comment, next line
454 if (this_line.len == 0 or this_line[0] == '@') { 466 return iter.next();
455 // comment, next line
456 return iter.next(buf);
457 } else {
458 return this_line;
459 }
460 } else { 467 } else {
461 return null; 468 return this_line;
462 } 469 }
463 } 470 }
464}; 471};
@@ -467,7 +474,6 @@ const std = @import("std");
467const fmt = std.fmt; 474const fmt = std.fmt;
468const fs = std.fs; 475const fs = std.fs;
469const io = std.io; 476const io = std.io;
470const Reader = io.BufferedReader(4096, fs.File.Reader).Reader;
471const heap = std.heap; 477const heap = std.heap;
472const mem = std.mem; 478const mem = std.mem;
473const debug = std.debug; 479const debug = std.debug;