diff options
Diffstat (limited to 'src/unicode_tests.zig')
| -rw-r--r-- | src/unicode_tests.zig | 88 |
1 files changed, 47 insertions, 41 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ae177a9..ff49b2a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -3,35 +3,34 @@ const dbg_print = false; | |||
| 3 | test "Unicode normalization tests" { | 3 | test "Unicode normalization tests" { |
| 4 | var arena = heap.ArenaAllocator.init(testing.allocator); | 4 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| 5 | defer arena.deinit(); | 5 | defer arena.deinit(); |
| 6 | var allocator = arena.allocator(); | 6 | const allocator = arena.allocator(); |
| 7 | 7 | ||
| 8 | const n = try Normalize.init(allocator); | 8 | const n = try Normalize.init(allocator); |
| 9 | defer n.deinit(allocator); | 9 | defer n.deinit(allocator); |
| 10 | 10 | ||
| 11 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 11 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
| 12 | defer file.close(); | 12 | defer file.close(); |
| 13 | var buf_reader = io.bufferedReader(file.reader()); | ||
| 14 | var input_stream = buf_reader.reader(); | ||
| 15 | |||
| 16 | var buf: [4096]u8 = undefined; | 13 | var buf: [4096]u8 = undefined; |
| 14 | var reader = file.reader(&buf); | ||
| 15 | |||
| 17 | var cp_buf: [4]u8 = undefined; | 16 | var cp_buf: [4]u8 = undefined; |
| 18 | 17 | ||
| 19 | var line_iter: IterRead = .{ .read = &input_stream }; | 18 | var line_iter: IterRead = .{ .read = &reader.interface }; |
| 20 | 19 | ||
| 21 | while (try line_iter.next(&buf)) |line| { | 20 | while (line_iter.next()) |line| { |
| 22 | // Iterate over fields. | 21 | // Iterate over fields. |
| 23 | var fields = mem.splitScalar(u8, line, ';'); | 22 | var fields = mem.splitScalar(u8, line, ';'); |
| 24 | var field_index: usize = 0; | 23 | var field_index: usize = 0; |
| 25 | var input: []u8 = undefined; | 24 | var input: []u8 = undefined; |
| 26 | defer allocator.free(input); | 25 | if (dbg_print) std.debug.print("Line: {s}\n", .{line}); |
| 27 | |||
| 28 | while (fields.next()) |field| : (field_index += 1) { | 26 | while (fields.next()) |field| : (field_index += 1) { |
| 29 | if (field_index == 0) { | 27 | if (field_index == 0) { |
| 30 | var i_buf = std.ArrayList(u8).init(allocator); | 28 | var i_buf = std.array_list.Managed(u8).init(allocator); |
| 31 | defer i_buf.deinit(); | 29 | defer i_buf.deinit(); |
| 32 | 30 | ||
| 33 | var i_fields = mem.splitScalar(u8, field, ' '); | 31 | var i_fields = mem.splitScalar(u8, field, ' '); |
| 34 | while (i_fields.next()) |s| { | 32 | while (i_fields.next()) |s| { |
| 33 | if (dbg_print) std.debug.print("Debug: {s}\n", .{s}); | ||
| 35 | const icp = try fmt.parseInt(u21, s, 16); | 34 | const icp = try fmt.parseInt(u21, s, 16); |
| 36 | const len = try unicode.utf8Encode(icp, &cp_buf); | 35 | const len = try unicode.utf8Encode(icp, &cp_buf); |
| 37 | try i_buf.appendSlice(cp_buf[0..len]); | 36 | try i_buf.appendSlice(cp_buf[0..len]); |
| @@ -41,7 +40,7 @@ test "Unicode normalization tests" { | |||
| 41 | } else if (field_index == 1) { | 40 | } else if (field_index == 1) { |
| 42 | if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); | 41 | if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); |
| 43 | // NFC, time to test. | 42 | // NFC, time to test. |
| 44 | var w_buf = std.ArrayList(u8).init(allocator); | 43 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 45 | defer w_buf.deinit(); | 44 | defer w_buf.deinit(); |
| 46 | 45 | ||
| 47 | var w_fields = mem.splitScalar(u8, field, ' '); | 46 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -58,7 +57,7 @@ test "Unicode normalization tests" { | |||
| 58 | try testing.expectEqualStrings(want, got.slice); | 57 | try testing.expectEqualStrings(want, got.slice); |
| 59 | } else if (field_index == 2) { | 58 | } else if (field_index == 2) { |
| 60 | // NFD, time to test. | 59 | // NFD, time to test. |
| 61 | var w_buf = std.ArrayList(u8).init(allocator); | 60 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 62 | defer w_buf.deinit(); | 61 | defer w_buf.deinit(); |
| 63 | 62 | ||
| 64 | var w_fields = mem.splitScalar(u8, field, ' '); | 63 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -75,7 +74,7 @@ test "Unicode normalization tests" { | |||
| 75 | try testing.expectEqualStrings(want, got.slice); | 74 | try testing.expectEqualStrings(want, got.slice); |
| 76 | } else if (field_index == 3) { | 75 | } else if (field_index == 3) { |
| 77 | // NFKC, time to test. | 76 | // NFKC, time to test. |
| 78 | var w_buf = std.ArrayList(u8).init(allocator); | 77 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 79 | defer w_buf.deinit(); | 78 | defer w_buf.deinit(); |
| 80 | 79 | ||
| 81 | var w_fields = mem.splitScalar(u8, field, ' '); | 80 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -92,7 +91,7 @@ test "Unicode normalization tests" { | |||
| 92 | try testing.expectEqualStrings(want, got.slice); | 91 | try testing.expectEqualStrings(want, got.slice); |
| 93 | } else if (field_index == 4) { | 92 | } else if (field_index == 4) { |
| 94 | // NFKD, time to test. | 93 | // NFKD, time to test. |
| 95 | var w_buf = std.ArrayList(u8).init(allocator); | 94 | var w_buf = std.array_list.Managed(u8).init(allocator); |
| 96 | defer w_buf.deinit(); | 95 | defer w_buf.deinit(); |
| 97 | 96 | ||
| 98 | var w_fields = mem.splitScalar(u8, field, ' '); | 97 | var w_fields = mem.splitScalar(u8, field, ' '); |
| @@ -111,6 +110,11 @@ test "Unicode normalization tests" { | |||
| 111 | continue; | 110 | continue; |
| 112 | } | 111 | } |
| 113 | } | 112 | } |
| 113 | } else |err| switch (err) { | ||
| 114 | error.EndOfStream => {}, | ||
| 115 | else => { | ||
| 116 | return err; | ||
| 117 | }, | ||
| 114 | } | 118 | } |
| 115 | } | 119 | } |
| 116 | 120 | ||
| @@ -118,26 +122,25 @@ test "Segmentation GraphemeIterator" { | |||
| 118 | const allocator = std.testing.allocator; | 122 | const allocator = std.testing.allocator; |
| 119 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); | 123 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); |
| 120 | defer file.close(); | 124 | defer file.close(); |
| 121 | var buf_reader = std.io.bufferedReader(file.reader()); | 125 | var buf: [4096]u8 = undefined; |
| 122 | var input_stream = buf_reader.reader(); | 126 | var reader = file.reader(&buf); |
| 123 | 127 | ||
| 124 | const graph = try Graphemes.init(allocator); | 128 | const graph = try Graphemes.init(allocator); |
| 125 | defer graph.deinit(allocator); | 129 | defer graph.deinit(allocator); |
| 126 | 130 | ||
| 127 | var buf: [4096]u8 = undefined; | 131 | var line_iter: IterRead = .{ .read = &reader.interface }; |
| 128 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 129 | 132 | ||
| 130 | while (try line_iter.next(&buf)) |raw| { | 133 | while (line_iter.next()) |raw| { |
| 131 | // Clean up. | 134 | // Clean up. |
| 132 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 135 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 133 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | 136 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| 134 | line = line[0..final]; | 137 | line = line[0..final]; |
| 135 | } | 138 | } |
| 136 | // Iterate over fields. | 139 | // Iterate over fields. |
| 137 | var want = std.ArrayList(Grapheme).init(allocator); | 140 | var want = std.array_list.Managed(Grapheme).init(allocator); |
| 138 | defer want.deinit(); | 141 | defer want.deinit(); |
| 139 | 142 | ||
| 140 | var all_bytes = std.ArrayList(u8).init(allocator); | 143 | var all_bytes = std.array_list.Managed(u8).init(allocator); |
| 141 | defer all_bytes.deinit(); | 144 | defer all_bytes.deinit(); |
| 142 | 145 | ||
| 143 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); | 146 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); |
| @@ -250,6 +253,11 @@ test "Segmentation GraphemeIterator" { | |||
| 250 | } | 253 | } |
| 251 | } | 254 | } |
| 252 | } | 255 | } |
| 256 | } else |err| switch (err) { | ||
| 257 | error.EndOfStream => {}, | ||
| 258 | else => { | ||
| 259 | return err; | ||
| 260 | }, | ||
| 253 | } | 261 | } |
| 254 | } | 262 | } |
| 255 | 263 | ||
| @@ -257,26 +265,25 @@ test "Segmentation Word Iterator" { | |||
| 257 | const allocator = std.testing.allocator; | 265 | const allocator = std.testing.allocator; |
| 258 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); | 266 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); |
| 259 | defer file.close(); | 267 | defer file.close(); |
| 260 | var buf_reader = std.io.bufferedReader(file.reader()); | 268 | var buf: [4096]u8 = undefined; |
| 261 | var input_stream = buf_reader.reader(); | 269 | var reader = file.reader(&buf); |
| 262 | 270 | ||
| 263 | const wb = try Words.init(allocator); | 271 | const wb = try Words.init(allocator); |
| 264 | defer wb.deinit(allocator); | 272 | defer wb.deinit(allocator); |
| 265 | 273 | ||
| 266 | var buf: [4096]u8 = undefined; | 274 | var line_iter: IterRead = .{ .read = &reader.interface }; |
| 267 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 268 | 275 | ||
| 269 | while (try line_iter.next(&buf)) |raw| { | 276 | while (line_iter.next()) |raw| { |
| 270 | // Clean up. | 277 | // Clean up. |
| 271 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 278 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 272 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | 279 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| 273 | line = line[0..final]; | 280 | line = line[0..final]; |
| 274 | } | 281 | } |
| 275 | // Iterate over fields. | 282 | // Iterate over fields. |
| 276 | var want = std.ArrayList(Word).init(allocator); | 283 | var want = std.array_list.Managed(Word).init(allocator); |
| 277 | defer want.deinit(); | 284 | defer want.deinit(); |
| 278 | 285 | ||
| 279 | var all_bytes = std.ArrayList(u8).init(allocator); | 286 | var all_bytes = std.array_list.Managed(u8).init(allocator); |
| 280 | defer all_bytes.deinit(); | 287 | defer all_bytes.deinit(); |
| 281 | 288 | ||
| 282 | var words = std.mem.splitSequence(u8, line, " ÷ "); | 289 | var words = std.mem.splitSequence(u8, line, " ÷ "); |
| @@ -439,26 +446,26 @@ test "Segmentation Word Iterator" { | |||
| 439 | if (idx == 0) break; | 446 | if (idx == 0) break; |
| 440 | } | 447 | } |
| 441 | } | 448 | } |
| 449 | } else |err| switch (err) { | ||
| 450 | error.EndOfStream => {}, | ||
| 451 | else => { | ||
| 452 | return err; | ||
| 453 | }, | ||
| 442 | } | 454 | } |
| 443 | } | 455 | } |
| 444 | 456 | ||
| 445 | const IterRead = struct { | 457 | const IterRead = struct { |
| 446 | read: *Reader, | 458 | read: *io.Reader, |
| 447 | line: usize = 0, | 459 | line: usize = 0, |
| 448 | 460 | ||
| 449 | pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { | 461 | pub fn next(iter: *IterRead) anyerror![]const u8 { |
| 450 | defer iter.line += 1; | 462 | iter.line += 1; |
| 451 | const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); | 463 | const this_line = try iter.read.takeDelimiterExclusive('\n'); |
| 452 | if (maybe_line) |this_line| { | 464 | if (this_line.len == 0 or this_line[0] == '@' or this_line[0] == '#') { |
| 453 | try iter.read.skipUntilDelimiterOrEof('\n'); | 465 | // comment, next line |
| 454 | if (this_line.len == 0 or this_line[0] == '@') { | 466 | return iter.next(); |
| 455 | // comment, next line | ||
| 456 | return iter.next(buf); | ||
| 457 | } else { | ||
| 458 | return this_line; | ||
| 459 | } | ||
| 460 | } else { | 467 | } else { |
| 461 | return null; | 468 | return this_line; |
| 462 | } | 469 | } |
| 463 | } | 470 | } |
| 464 | }; | 471 | }; |
| @@ -467,7 +474,6 @@ const std = @import("std"); | |||
| 467 | const fmt = std.fmt; | 474 | const fmt = std.fmt; |
| 468 | const fs = std.fs; | 475 | const fs = std.fs; |
| 469 | const io = std.io; | 476 | const io = std.io; |
| 470 | const Reader = io.BufferedReader(4096, fs.File.Reader).Reader; | ||
| 471 | const heap = std.heap; | 477 | const heap = std.heap; |
| 472 | const mem = std.mem; | 478 | const mem = std.mem; |
| 473 | const debug = std.debug; | 479 | const debug = std.debug; |