diff options
Diffstat (limited to 'src/unicode_tests.zig')
| -rw-r--r-- | src/unicode_tests.zig | 94 |
1 file changed, 45 insertions, 49 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ae177a9..e2a5a96 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
| @@ -3,35 +3,30 @@ const dbg_print = false; | |||
| 3 | test "Unicode normalization tests" { | 3 | test "Unicode normalization tests" { |
| 4 | var arena = heap.ArenaAllocator.init(testing.allocator); | 4 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| 5 | defer arena.deinit(); | 5 | defer arena.deinit(); |
| 6 | var allocator = arena.allocator(); | 6 | const allocator = arena.allocator(); |
| 7 | 7 | ||
| 8 | const n = try Normalize.init(allocator); | 8 | const n = try Normalize.init(allocator); |
| 9 | defer n.deinit(allocator); | 9 | defer n.deinit(allocator); |
| 10 | 10 | ||
| 11 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 11 | var reader = std.io.Reader.fixed(@embedFile("NormalizationTest.txt")); |
| 12 | defer file.close(); | ||
| 13 | var buf_reader = io.bufferedReader(file.reader()); | ||
| 14 | var input_stream = buf_reader.reader(); | ||
| 15 | |||
| 16 | var buf: [4096]u8 = undefined; | ||
| 17 | var cp_buf: [4]u8 = undefined; | 12 | var cp_buf: [4]u8 = undefined; |
| 18 | 13 | ||
| 19 | var line_iter: IterRead = .{ .read = &input_stream }; | 14 | var line_iter: IterRead = .{ .read = &reader }; |
| 20 | 15 | ||
| 21 | while (try line_iter.next(&buf)) |line| { | 16 | while (line_iter.next()) |line| { |
| 22 | // Iterate over fields. | 17 | // Iterate over fields. |
| 23 | var fields = mem.splitScalar(u8, line, ';'); | 18 | var fields = mem.splitScalar(u8, line, ';'); |
| 24 | var field_index: usize = 0; | 19 | var field_index: usize = 0; |
| 25 | var input: []u8 = undefined; | 20 | var input: []u8 = undefined; |
| 26 | defer allocator.free(input); | 21 | if (dbg_print) std.debug.print("Line: {s}\n", .{line}); |
| 27 | |||
| 28 | while (fields.next()) |field| : (field_index += 1) { | 22 | while (fields.next()) |field| : (field_index += 1) { |
| 29 | if (field_index == 0) { | 23 | if (field_index == 0) { |
| 30 | var i_buf = std.ArrayList(u8).init(allocator); | 24 | var i_buf = std.array_list.Managed(u8).init(allocator); |
| 31 | defer i_buf.deinit(); | 25 | defer i_buf.deinit(); |
| 32 | 26 | ||
| 33 | var i_fields = mem.splitScalar(u8, field, ' '); | 27 | var i_fields = mem.splitScalar(u8, field, ' '); |
| 34 | while (i_fields.next()) |s| { | 28 | while (i_fields.next()) |s| { |
| 29 | if (dbg_print) std.debug.print("Debug: {s}\n", .{s}); | ||
| 35 | const icp = try fmt.parseInt(u21, s, 16); | 30 | const icp = try fmt.parseInt(u21, s, 16); |
| 36 | const len = try unicode.utf8Encode(icp, &cp_buf); | 31 | const len = try unicode.utf8Encode(icp, &cp_buf); |
| 37 | try i_buf.appendSlice(cp_buf[0..len]); | 32 | try i_buf.appendSlice(cp_buf[0..len]); |
| @@ -41,7 +36,7 @@ test "Unicode normalization tests" { | |||
| 41 | } else if (field_index == 1) { | 36 | } else if (field_index == 1) { |
| 42 | if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); | 37 | if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); |
| 43 | // NFC, time to test. | 38 | // NFC, time to test. |
| 44 | var w_buf = std.ArrayList(u8).init(allocator); | 39 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 45 | defer w_buf.deinit(); | 40 | defer w_buf.deinit(); |
| 46 | 41 | ||
| 47 | var w_fields = mem.splitScalar(u8, field, ' '); | 42 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -58,7 +53,7 @@ test "Unicode normalization tests" { | |||
| 58 | try testing.expectEqualStrings(want, got.slice); | 53 | try testing.expectEqualStrings(want, got.slice); |
| 59 | } else if (field_index == 2) { | 54 | } else if (field_index == 2) { |
| 60 | // NFD, time to test. | 55 | // NFD, time to test. |
| 61 | var w_buf = std.ArrayList(u8).init(allocator); | 56 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 62 | defer w_buf.deinit(); | 57 | defer w_buf.deinit(); |
| 63 | 58 | ||
| 64 | var w_fields = mem.splitScalar(u8, field, ' '); | 59 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -75,7 +70,7 @@ test "Unicode normalization tests" { | |||
| 75 | try testing.expectEqualStrings(want, got.slice); | 70 | try testing.expectEqualStrings(want, got.slice); |
| 76 | } else if (field_index == 3) { | 71 | } else if (field_index == 3) { |
| 77 | // NFKC, time to test. | 72 | // NFKC, time to test. |
| 78 | var w_buf = std.ArrayList(u8).init(allocator); | 73 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 79 | defer w_buf.deinit(); | 74 | defer w_buf.deinit(); |
| 80 | 75 | ||
| 81 | var w_fields = mem.splitScalar(u8, field, ' '); | 76 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -92,7 +87,7 @@ test "Unicode normalization tests" { | |||
| 92 | try testing.expectEqualStrings(want, got.slice); | 87 | try testing.expectEqualStrings(want, got.slice); |
| 93 | } else if (field_index == 4) { | 88 | } else if (field_index == 4) { |
| 94 | // NFKD, time to test. | 89 | // NFKD, time to test. |
| 95 | var w_buf = std.ArrayList(u8).init(allocator); | 90 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 96 | defer w_buf.deinit(); | 91 | defer w_buf.deinit(); |
| 97 | 92 | ||
| 98 | var w_fields = mem.splitScalar(u8, field, ' '); | 93 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -111,33 +106,34 @@ test "Unicode normalization tests" { | |||
| 111 | continue; | 106 | continue; |
| 112 | } | 107 | } |
| 113 | } | 108 | } |
| 109 | } else |err| switch (err) { | ||
| 110 | error.EndOfStream => {}, | ||
| 111 | else => { | ||
| 112 | return err; | ||
| 113 | }, | ||
| 114 | } | 114 | } |
| 115 | } | 115 | } |
| 116 | 116 | ||
| 117 | test "Segmentation GraphemeIterator" { | 117 | test "Segmentation GraphemeIterator" { |
| 118 | const allocator = std.testing.allocator; | 118 | const allocator = std.testing.allocator; |
| 119 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); | ||
| 120 | defer file.close(); | ||
| 121 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 122 | var input_stream = buf_reader.reader(); | ||
| 123 | 119 | ||
| 120 | var reader = std.io.Reader.fixed(@embedFile("GraphemeBreakTest.txt")); | ||
| 124 | const graph = try Graphemes.init(allocator); | 121 | const graph = try Graphemes.init(allocator); |
| 125 | defer graph.deinit(allocator); | 122 | defer graph.deinit(allocator); |
| 126 | 123 | ||
| 127 | var buf: [4096]u8 = undefined; | 124 | var line_iter: IterRead = .{ .read = &reader }; |
| 128 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 129 | 125 | ||
| 130 | while (try line_iter.next(&buf)) |raw| { | 126 | while (line_iter.next()) |raw| { |
| 131 | // Clean up. | 127 | // Clean up. |
| 132 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 128 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 133 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | 129 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| 134 | line = line[0..final]; | 130 | line = line[0..final]; |
| 135 | } | 131 | } |
| 136 | // Iterate over fields. | 132 | // Iterate over fields. |
| 137 | var want = std.ArrayList(Grapheme).init(allocator); | 133 | var want = std.array_list.Managed(Grapheme).init(allocator); |
| 138 | defer want.deinit(); | 134 | defer want.deinit(); |
| 139 | 135 | ||
| 140 | var all_bytes = std.ArrayList(u8).init(allocator); | 136 | var all_bytes = std.array_list.Managed(u8).init(allocator); |
| 141 | defer all_bytes.deinit(); | 137 | defer all_bytes.deinit(); |
| 142 | 138 | ||
| 143 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); | 139 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); |
| @@ -250,33 +246,33 @@ test "Segmentation GraphemeIterator" { | |||
| 250 | } | 246 | } |
| 251 | } | 247 | } |
| 252 | } | 248 | } |
| 249 | } else |err| switch (err) { | ||
| 250 | error.EndOfStream => {}, | ||
| 251 | else => { | ||
| 252 | return err; | ||
| 253 | }, | ||
| 253 | } | 254 | } |
| 254 | } | 255 | } |
| 255 | 256 | ||
| 256 | test "Segmentation Word Iterator" { | 257 | test "Segmentation Word Iterator" { |
| 257 | const allocator = std.testing.allocator; | 258 | const allocator = std.testing.allocator; |
| 258 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); | 259 | var reader = std.io.Reader.fixed(@embedFile("WordBreakTest.txt")); |
| 259 | defer file.close(); | ||
| 260 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 261 | var input_stream = buf_reader.reader(); | ||
| 262 | |||
| 263 | const wb = try Words.init(allocator); | 260 | const wb = try Words.init(allocator); |
| 264 | defer wb.deinit(allocator); | 261 | defer wb.deinit(allocator); |
| 265 | 262 | ||
| 266 | var buf: [4096]u8 = undefined; | 263 | var line_iter: IterRead = .{ .read = &reader }; |
| 267 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 268 | 264 | ||
| 269 | while (try line_iter.next(&buf)) |raw| { | 265 | while (line_iter.next()) |raw| { |
| 270 | // Clean up. | 266 | // Clean up. |
| 271 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 267 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 272 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | 268 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| 273 | line = line[0..final]; | 269 | line = line[0..final]; |
| 274 | } | 270 | } |
| 275 | // Iterate over fields. | 271 | // Iterate over fields. |
| 276 | var want = std.ArrayList(Word).init(allocator); | 272 | var want = std.array_list.Managed(Word).init(allocator); |
| 277 | defer want.deinit(); | 273 | defer want.deinit(); |
| 278 | 274 | ||
| 279 | var all_bytes = std.ArrayList(u8).init(allocator); | 275 | var all_bytes = std.array_list.Managed(u8).init(allocator); |
| 280 | defer all_bytes.deinit(); | 276 | defer all_bytes.deinit(); |
| 281 | 277 | ||
| 282 | var words = std.mem.splitSequence(u8, line, " ÷ "); | 278 | var words = std.mem.splitSequence(u8, line, " ÷ "); |
| @@ -439,26 +435,27 @@ test "Segmentation Word Iterator" { | |||
| 439 | if (idx == 0) break; | 435 | if (idx == 0) break; |
| 440 | } | 436 | } |
| 441 | } | 437 | } |
| 438 | } else |err| switch (err) { | ||
| 439 | error.EndOfStream => {}, | ||
| 440 | else => { | ||
| 441 | return err; | ||
| 442 | }, | ||
| 442 | } | 443 | } |
| 443 | } | 444 | } |
| 444 | 445 | ||
| 445 | const IterRead = struct { | 446 | const IterRead = struct { |
| 446 | read: *Reader, | 447 | read: *io.Reader, |
| 447 | line: usize = 0, | 448 | line: usize = 0, |
| 448 | 449 | ||
| 449 | pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { | 450 | pub fn next(iter: *IterRead) anyerror![]const u8 { |
| 450 | defer iter.line += 1; | 451 | iter.line += 1; |
| 451 | const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); | 452 | const took = try iter.read.takeDelimiterInclusive('\n'); |
| 452 | if (maybe_line) |this_line| { | 453 | const this_line = std.mem.trimRight(u8, took, "\n"); |
| 453 | try iter.read.skipUntilDelimiterOrEof('\n'); | 454 | if (this_line.len == 0 or this_line[0] == '@' or this_line[0] == '#') { |
| 454 | if (this_line.len == 0 or this_line[0] == '@') { | 455 | // comment, next line |
| 455 | // comment, next line | 456 | return iter.next(); |
| 456 | return iter.next(buf); | ||
| 457 | } else { | ||
| 458 | return this_line; | ||
| 459 | } | ||
| 460 | } else { | 457 | } else { |
| 461 | return null; | 458 | return this_line; |
| 462 | } | 459 | } |
| 463 | } | 460 | } |
| 464 | }; | 461 | }; |
| @@ -467,7 +464,6 @@ const std = @import("std"); | |||
| 467 | const fmt = std.fmt; | 464 | const fmt = std.fmt; |
| 468 | const fs = std.fs; | 465 | const fs = std.fs; |
| 469 | const io = std.io; | 466 | const io = std.io; |
| 470 | const Reader = io.BufferedReader(4096, fs.File.Reader).Reader; | ||
| 471 | const heap = std.heap; | 467 | const heap = std.heap; |
| 472 | const mem = std.mem; | 468 | const mem = std.mem; |
| 473 | const debug = std.debug; | 469 | const debug = std.debug; |