diff options
Diffstat (limited to 'src/unicode_tests.zig')
| -rw-r--r-- | src/unicode_tests.zig | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..828559a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -219,3 +219,77 @@ test "Segmentation GraphemeIterator" { | |||
| 219 | } | 219 | } |
| 220 | } | 220 | } |
| 221 | } | 221 | } |
| 222 | |||
// Drives the reverse grapheme iterator with every case in
// GraphemeBreakTest.txt: each data line is decoded into its UTF-8 bytes plus
// the expected cluster boundaries, then the bytes are walked back-to-front
// and every cluster returned by `prev()` is compared against the expectation.
// After the last expected cluster the iterator must report end of input.
test "Segmentation ReverseGraphemeIterator" {
    const allocator = std.testing.allocator;
    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();

    const data = try Graphemes.init(allocator);
    defer data.deinit(allocator);

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip empty lines and lines starting with '#' or '@'.
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // Strip the leading break marker / spaces and cut the line at the
        // final " ÷\t#" so only the code point fields remain.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }

        // Expected clusters, in forward order, as offsets into `all_bytes`.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of every code point on the line.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        // " ÷ " separates clusters; "×" joins code points within a cluster.
        var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (graphemes.next()) |field| {
            var code_points = std.mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
            var gc_len: u8 = 0;

            while (code_points.next()) |code_point| {
                if (std.mem.eql(u8, code_point, "×")) continue;
                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                gc_len += len;
            }

            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
            // The next cluster starts right after this one's bytes.
            bytes_index += gc_len;
        }

        var iter = data.reverseIterator(all_bytes.items);

        // Check: consume expectations from last to first, since the iterator
        // walks the text backwards.
        var i: usize = want.items.len;
        while (i > 0) {
            i -= 1;
            const want_gc = want.items[i];
            const got_gc = iter.prev() orelse {
                std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc });
                return error.TestExpectedEqual;
            };
            std.testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            ) catch |err| {
                std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc });
                return err;
            };
        }

        // Every expected cluster has been matched, so the iterator must now
        // be exhausted; anything further is a spurious extra cluster.
        if (iter.prev()) |extra_gc| {
            std.debug.print("line {d}: expected end of input found {any}\n", .{ line_no, extra_gc });
            return error.TestExpectedEqual;
        }
    }
}