diff options
Diffstat (limited to 'src/unicode_tests.zig')
| -rw-r--r-- | src/unicode_tests.zig | 102 |
1 files changed, 70 insertions, 32 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index ee259a3..7ce2b4e 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -1,31 +1,5 @@ | |||
| 1 | const dbg_print = false; | 1 | const dbg_print = false; |
| 2 | 2 | ||
| 3 | comptime { | ||
| 4 | testing.refAllDecls(grapheme); | ||
| 5 | } | ||
| 6 | |||
| 7 | test "Iterator.peek" { | ||
| 8 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 9 | const data = try Graphemes.init(std.testing.allocator); | ||
| 10 | defer data.deinit(std.testing.allocator); | ||
| 11 | |||
| 12 | var iter = data.iterator(peek_seq); | ||
| 13 | const peek_a = iter.peek().?; | ||
| 14 | const next_a = iter.next().?; | ||
| 15 | try std.testing.expectEqual(peek_a, next_a); | ||
| 16 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 17 | const peek_d1 = iter.peek().?; | ||
| 18 | const peek_d2 = iter.peek().?; | ||
| 19 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 20 | const next_d = iter.next().?; | ||
| 21 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 22 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 23 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 24 | try std.testing.expectEqual(null, iter.peek()); | ||
| 25 | try std.testing.expectEqual(null, iter.peek()); | ||
| 26 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 27 | } | ||
| 28 | |||
| 29 | test "Unicode normalization tests" { | 3 | test "Unicode normalization tests" { |
| 30 | var arena = heap.ArenaAllocator.init(testing.allocator); | 4 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| 31 | defer arena.deinit(); | 5 | defer arena.deinit(); |
| @@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" { | |||
| 147 | var buf_reader = std.io.bufferedReader(file.reader()); | 121 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 148 | var input_stream = buf_reader.reader(); | 122 | var input_stream = buf_reader.reader(); |
| 149 | 123 | ||
| 150 | const data = try Graphemes.init(allocator); | 124 | const graph = try Graphemes.init(allocator); |
| 151 | defer data.deinit(allocator); | 125 | defer graph.deinit(allocator); |
| 152 | 126 | ||
| 153 | var buf: [4096]u8 = undefined; | 127 | var buf: [4096]u8 = undefined; |
| 154 | var line_iter: IterRead = .{ .read = &input_stream }; | 128 | var line_iter: IterRead = .{ .read = &input_stream }; |
| 155 | 129 | ||
| 156 | while (try line_iter.next(&buf)) |raw| { | 130 | while (try line_iter.next(&buf)) |raw| { |
| 157 | // Skip comments or empty lines. | ||
| 158 | // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 159 | // Clean up. | 131 | // Clean up. |
| 160 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 132 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 161 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | 133 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| @@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" { | |||
| 190 | bytes_index += cp_index; | 162 | bytes_index += cp_index; |
| 191 | } | 163 | } |
| 192 | 164 | ||
| 193 | var iter = data.iterator(all_bytes.items); | 165 | var iter = graph.iterator(all_bytes.items); |
| 194 | 166 | ||
| 195 | // Check. | 167 | // Check. |
| 196 | for (want.items) |want_gc| { | 168 | for (want.items) |want_gc| { |
| @@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" { | |||
| 203 | } | 175 | } |
| 204 | } | 176 | } |
| 205 | 177 | ||
| 178 | test "Segmentation Word Iterator" { | ||
| 179 | const allocator = std.testing.allocator; | ||
| 180 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); | ||
| 181 | defer file.close(); | ||
| 182 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 183 | var input_stream = buf_reader.reader(); | ||
| 184 | |||
| 185 | const wb = try WordBreak.init(allocator); | ||
| 186 | defer wb.deinit(allocator); | ||
| 187 | |||
| 188 | var buf: [4096]u8 = undefined; | ||
| 189 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 190 | |||
| 191 | while (try line_iter.next(&buf)) |raw| { | ||
| 192 | // Clean up. | ||
| 193 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 194 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | ||
| 195 | line = line[0..final]; | ||
| 196 | } | ||
| 197 | // Iterate over fields. | ||
| 198 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 199 | defer want.deinit(); | ||
| 200 | |||
| 201 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 202 | defer all_bytes.deinit(); | ||
| 203 | |||
| 204 | var words = std.mem.splitSequence(u8, line, " ÷ "); | ||
| 205 | var bytes_index: u32 = 0; | ||
| 206 | |||
| 207 | while (words.next()) |field| { | ||
| 208 | var code_points = std.mem.splitScalar(u8, field, ' '); | ||
| 209 | var cp_buf: [4]u8 = undefined; | ||
| 210 | var cp_index: u32 = 0; | ||
| 211 | var gc_len: u8 = 0; | ||
| 212 | |||
| 213 | while (code_points.next()) |code_point| { | ||
| 214 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 215 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 216 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 217 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 218 | cp_index += len; | ||
| 219 | gc_len += len; | ||
| 220 | } | ||
| 221 | |||
| 222 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 223 | bytes_index += cp_index; | ||
| 224 | } | ||
| 225 | |||
| 226 | var iter = wb.iterator(all_bytes.items); | ||
| 227 | |||
| 228 | // Check. | ||
| 229 | for (want.items, 1..) |want_word, i| { | ||
| 230 | const got_word = (iter.next()).?; | ||
| 231 | std.testing.expectEqualSlices( | ||
| 232 | u8, | ||
| 233 | want_word.bytes(all_bytes.items), | ||
| 234 | got_word.bytes(all_bytes.items), | ||
| 235 | ) catch |err| { | ||
| 236 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i }); | ||
| 237 | return err; | ||
| 238 | }; | ||
| 239 | } | ||
| 240 | } | ||
| 241 | } | ||
| 242 | |||
| 206 | const IterRead = struct { | 243 | const IterRead = struct { |
| 207 | read: *Reader, | 244 | read: *Reader, |
| 208 | line: usize = 0, | 245 | line: usize = 0, |
| @@ -235,8 +272,9 @@ const debug = std.debug; | |||
| 235 | const testing = std.testing; | 272 | const testing = std.testing; |
| 236 | const unicode = std.unicode; | 273 | const unicode = std.unicode; |
| 237 | 274 | ||
| 238 | const grapheme = @import("Graphemes"); | ||
| 239 | const Grapheme = @import("Graphemes").Grapheme; | 275 | const Grapheme = @import("Graphemes").Grapheme; |
| 240 | const Graphemes = @import("Graphemes"); | 276 | const Graphemes = @import("Graphemes"); |
| 241 | const GraphemeIterator = @import("Graphemes").Iterator; | 277 | const GraphemeIterator = @import("Graphemes").Iterator; |
| 242 | const Normalize = @import("Normalize"); | 278 | const Normalize = @import("Normalize"); |
| 279 | |||
| 280 | const WordBreak = @import("WordBreak"); | ||