diff options
| author | 2024-03-28 22:19:50 -0400 | |
|---|---|---|
| committer | 2024-03-28 22:19:50 -0400 | |
| commit | a2c4b7a57fe6b64bdd7c71305d408e5030af3157 (patch) | |
| tree | c7af1ed4381ab0eeea52e2a9081cb19469b8c0e6 /src/unicode_tests.zig | |
| parent | Merged NumericData into PropsData (diff) | |
| download | zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.gz zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.xz zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.zip | |
Split out Unicode tests to separate file
Diffstat (limited to 'src/unicode_tests.zig')
| -rw-r--r-- | src/unicode_tests.zig | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig new file mode 100644 index 0000000..5442f63 --- /dev/null +++ b/src/unicode_tests.zig | |||
| @@ -0,0 +1,194 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const fmt = std.fmt; | ||
| 3 | const fs = std.fs; | ||
| 4 | const io = std.io; | ||
| 5 | const heap = std.heap; | ||
| 6 | const mem = std.mem; | ||
| 7 | const testing = std.testing; | ||
| 8 | const unicode = std.unicode; | ||
| 9 | |||
| 10 | const Grapheme = @import("grapheme").Grapheme; | ||
| 11 | const GraphemeData = @import("grapheme").GraphemeData; | ||
| 12 | const GraphemeIterator = @import("grapheme").Iterator; | ||
| 13 | const Normalize = @import("Normalize"); | ||
| 14 | |||
// Appends the UTF-8 encoding of every code point listed in `field` to `out`.
// `field` is expected to hold hexadecimal code points separated by single
// spaces. Fails if a token is not a valid hexadecimal `u21`, if the code
// point cannot be encoded as UTF-8, or if `out` cannot grow.
fn appendFieldAsUtf8(out: *std.ArrayList(u8), field: []const u8) !void {
    var cp_buf: [4]u8 = undefined;
    var code_points = mem.split(u8, field, " ");
    while (code_points.next()) |hex| {
        const cp = try fmt.parseInt(u21, hex, 16);
        const len = try unicode.utf8Encode(cp, &cp_buf);
        try out.appendSlice(cp_buf[0..len]);
    }
}

// Runs every data line of data/unicode/NormalizationTest.txt through the
// normalizer. Each line is split on ';': column 0 is the input string and
// columns 1-4 are compared against the output of nfc, nfd, nfkc and nfkd
// respectively. Any further columns are ignored.
test "Unicode normalization tests" {
    // All allocations go through an arena that is released at the end of the test.
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const data = try Normalize.NormData.init(allocator);
    const n = Normalize{ .norm_data = &data };

    const file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip empty lines and lines starting with '#' or '@'.
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // On failure, report which data line was being processed.
        errdefer std.debug.print("NormalizationTest.txt line {d}: {s}\n", .{ line_no, line });

        // Start from an empty slice so the deferred free is a harmless no-op
        // if parsing the first column fails before `input` is assigned.
        var input: []u8 = &[_]u8{};
        defer allocator.free(input);

        var fields = mem.split(u8, line, ";");
        var field_index: usize = 0;

        while (fields.next()) |field| : (field_index += 1) {
            // Only the first five columns are code point lists.
            if (field_index > 4) break;

            var encoded = std.ArrayList(u8).init(allocator);
            defer encoded.deinit();
            try appendFieldAsUtf8(&encoded, field);

            if (field_index == 0) {
                // Column 0 is the source string for the remaining columns.
                input = try encoded.toOwnedSlice();
                continue;
            }

            const got = switch (field_index) {
                1 => try n.nfc(allocator, input),
                2 => try n.nfd(allocator, input),
                3 => try n.nfkc(allocator, input),
                4 => try n.nfkd(allocator, input),
                else => unreachable, // guarded by the `field_index > 4` check above
            };
            defer got.deinit();

            try testing.expectEqualStrings(encoded.items, got.slice);
        }
    }
}
| 130 | |||
// Checks GraphemeIterator against data/unicode/auxiliary/GraphemeBreakTest.txt.
// For each data line, the text before " ÷\t#" is split on " ÷ " into expected
// clusters; inside a cluster, "×" tokens are skipped and the remaining tokens
// are parsed as hexadecimal code points. The iterator must yield exactly the
// expected clusters, in order, and then report the end of input.
test "Segmentation GraphemeIterator" {
    const allocator = testing.allocator;
    const file = try fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    const data = try GraphemeData.init(allocator);
    defer data.deinit();

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip empty lines and lines starting with '#' or '@'.
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // On failure, report which data line was being processed.
        errdefer std.debug.print("GraphemeBreakTest.txt line {d}: {s}\n", .{ line_no, raw });

        // Strip the leading break marker and everything from " ÷\t#" onward.
        var line = mem.trimLeft(u8, raw, "÷ ");
        if (mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }

        // Expected clusters, expressed as (length, offset) into `all_bytes`.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of the whole sample string.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        var graphemes = mem.split(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (graphemes.next()) |field| {
            var code_points = mem.split(u8, field, " ");
            var cp_buf: [4]u8 = undefined;
            var gc_len: u8 = 0;

            while (code_points.next()) |code_point| {
                // "×" tokens carry no code point.
                if (mem.eql(u8, code_point, "×")) continue;
                const cp = try fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                gc_len += len;
            }

            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
            bytes_index += gc_len;
        }

        var iter = GraphemeIterator.init(all_bytes.items, &data);

        // Check each expected cluster against what the iterator yields.
        for (want.items) |want_gc| {
            // Fail the test (rather than panic) if the iterator ends early.
            const got_gc = iter.next() orelse return error.TooFewGraphemes;
            try testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            );
        }

        // The iterator must not yield anything beyond the expected clusters.
        try testing.expect(iter.next() == null);
    }
}