diff options
| -rw-r--r-- | build.zig | 45 | ||||
| -rw-r--r-- | src/Normalize.zig | 121 | ||||
| -rw-r--r-- | src/grapheme.zig | 65 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 194 |
4 files changed, 209 insertions, 216 deletions
| @@ -196,13 +196,13 @@ pub fn build(b: *std.Build) void { | |||
| 196 | }); | 196 | }); |
| 197 | 197 | ||
| 198 | // Fixed pitch font display width | 198 | // Fixed pitch font display width |
| 199 | const dw_data = b.createModule(.{ | 199 | const width_data = b.createModule(.{ |
| 200 | .root_source_file = .{ .path = "src/WidthData.zig" }, | 200 | .root_source_file = .{ .path = "src/WidthData.zig" }, |
| 201 | .target = target, | 201 | .target = target, |
| 202 | .optimize = optimize, | 202 | .optimize = optimize, |
| 203 | }); | 203 | }); |
| 204 | dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); | 204 | width_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); |
| 205 | dw_data.addImport("GraphemeData", grapheme_data); | 205 | width_data.addImport("GraphemeData", grapheme_data); |
| 206 | 206 | ||
| 207 | const display_width = b.addModule("DisplayWidth", .{ | 207 | const display_width = b.addModule("DisplayWidth", .{ |
| 208 | .root_source_file = .{ .path = "src/DisplayWidth.zig" }, | 208 | .root_source_file = .{ .path = "src/DisplayWidth.zig" }, |
| @@ -212,7 +212,7 @@ pub fn build(b: *std.Build) void { | |||
| 212 | display_width.addImport("ascii", ascii); | 212 | display_width.addImport("ascii", ascii); |
| 213 | display_width.addImport("code_point", code_point); | 213 | display_width.addImport("code_point", code_point); |
| 214 | display_width.addImport("grapheme", grapheme); | 214 | display_width.addImport("grapheme", grapheme); |
| 215 | display_width.addImport("DisplayWidthData", dw_data); | 215 | display_width.addImport("DisplayWidthData", width_data); |
| 216 | 216 | ||
| 217 | // Normalization | 217 | // Normalization |
| 218 | const ccc_data = b.createModule(.{ | 218 | const ccc_data = b.createModule(.{ |
| @@ -324,34 +324,17 @@ pub fn build(b: *std.Build) void { | |||
| 324 | props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); | 324 | props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); |
| 325 | props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); | 325 | props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); |
| 326 | 326 | ||
| 327 | // Tests | 327 | // Unicode Tests |
| 328 | const exe_unit_tests = b.addTest(.{ | 328 | const unicode_tests = b.addTest(.{ |
| 329 | .root_source_file = .{ .path = "src/PropsData.zig" }, | 329 | .root_source_file = .{ .path = "src/unicode_tests.zig" }, |
| 330 | .target = target, | 330 | .target = target, |
| 331 | .optimize = optimize, | 331 | .optimize = optimize, |
| 332 | }); | 332 | }); |
| 333 | // exe_unit_tests.root_module.addImport("ascii", ascii); | 333 | unicode_tests.root_module.addImport("grapheme", grapheme); |
| 334 | // exe_unit_tests.root_module.addImport("code_point", code_point); | 334 | unicode_tests.root_module.addImport("Normalize", norm); |
| 335 | // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); | 335 | |
| 336 | // exe_unit_tests.root_module.addImport("grapheme", grapheme); | 336 | const run_unicode_tests = b.addRunArtifact(unicode_tests); |
| 337 | // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); | 337 | |
| 338 | // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); | 338 | const unicode_test_step = b.step("unicode-test", "Run Unicode tests"); |
| 339 | // exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); | 339 | unicode_test_step.dependOn(&run_unicode_tests.step); |
| 340 | // exe_unit_tests.root_module.addImport("NormData", norm_data); | ||
| 341 | // exe_unit_tests.root_module.addImport("Normalize", norm); | ||
| 342 | // exe_unit_tests.root_module.addImport("FoldData", fold_data); | ||
| 343 | // exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); | ||
| 344 | // exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); | ||
| 345 | // exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); | ||
| 346 | // exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); | ||
| 347 | // exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); | ||
| 348 | exe_unit_tests.root_module.addAnonymousImport("core_props", .{ .root_source_file = core_gen_out }); | ||
| 349 | exe_unit_tests.root_module.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); | ||
| 350 | exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); | ||
| 351 | // exe_unit_tests.filter = "nfd !ASCII"; | ||
| 352 | |||
| 353 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); | ||
| 354 | |||
| 355 | const test_step = b.step("test", "Run unit tests"); | ||
| 356 | test_step.dependOn(&run_exe_unit_tests.step); | ||
| 357 | } | 340 | } |
diff --git a/src/Normalize.zig b/src/Normalize.zig index daf774d..f437f4f 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -3,12 +3,10 @@ | |||
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const assert = std.debug.assert; | ||
| 7 | const debug = std.debug; | 6 | const debug = std.debug; |
| 7 | const assert = debug.assert; | ||
| 8 | const fmt = std.fmt; | 8 | const fmt = std.fmt; |
| 9 | const fs = std.fs; | ||
| 10 | const heap = std.heap; | 9 | const heap = std.heap; |
| 11 | const io = std.io; | ||
| 12 | const mem = std.mem; | 10 | const mem = std.mem; |
| 13 | const simd = std.simd; | 11 | const simd = std.simd; |
| 14 | const testing = std.testing; | 12 | const testing = std.testing; |
| @@ -615,123 +613,6 @@ test "isFcd" { | |||
| 615 | try testing.expect(!n.isFcd(not_fcd)); | 613 | try testing.expect(!n.isFcd(not_fcd)); |
| 616 | } | 614 | } |
| 617 | 615 | ||
| 618 | test "Unicode normalization tests" { | ||
| 619 | var arena = heap.ArenaAllocator.init(testing.allocator); | ||
| 620 | defer arena.deinit(); | ||
| 621 | var allocator = arena.allocator(); | ||
| 622 | |||
| 623 | const data = try NormData.init(allocator); | ||
| 624 | defer data.deinit(); | ||
| 625 | const n = Self{ .norm_data = &data }; | ||
| 626 | |||
| 627 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | ||
| 628 | defer file.close(); | ||
| 629 | var buf_reader = io.bufferedReader(file.reader()); | ||
| 630 | const input_stream = buf_reader.reader(); | ||
| 631 | |||
| 632 | var line_no: usize = 0; | ||
| 633 | var buf: [4096]u8 = undefined; | ||
| 634 | var cp_buf: [4]u8 = undefined; | ||
| 635 | |||
| 636 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 637 | line_no += 1; | ||
| 638 | // Skip comments or empty lines. | ||
| 639 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | ||
| 640 | // Iterate over fields. | ||
| 641 | var fields = mem.split(u8, line, ";"); | ||
| 642 | var field_index: usize = 0; | ||
| 643 | var input: []u8 = undefined; | ||
| 644 | defer allocator.free(input); | ||
| 645 | |||
| 646 | while (fields.next()) |field| : (field_index += 1) { | ||
| 647 | if (field_index == 0) { | ||
| 648 | var i_buf = std.ArrayList(u8).init(allocator); | ||
| 649 | defer i_buf.deinit(); | ||
| 650 | |||
| 651 | var i_fields = mem.split(u8, field, " "); | ||
| 652 | while (i_fields.next()) |s| { | ||
| 653 | const icp = try fmt.parseInt(u21, s, 16); | ||
| 654 | const len = try unicode.utf8Encode(icp, &cp_buf); | ||
| 655 | try i_buf.appendSlice(cp_buf[0..len]); | ||
| 656 | } | ||
| 657 | |||
| 658 | input = try i_buf.toOwnedSlice(); | ||
| 659 | } else if (field_index == 1) { | ||
| 660 | //debug.print("\n*** {s} ***\n", .{line}); | ||
| 661 | // NFC, time to test. | ||
| 662 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 663 | defer w_buf.deinit(); | ||
| 664 | |||
| 665 | var w_fields = mem.split(u8, field, " "); | ||
| 666 | while (w_fields.next()) |s| { | ||
| 667 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 668 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 669 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 670 | } | ||
| 671 | |||
| 672 | const want = w_buf.items; | ||
| 673 | var got = try n.nfc(allocator, input); | ||
| 674 | defer got.deinit(); | ||
| 675 | |||
| 676 | try testing.expectEqualStrings(want, got.slice); | ||
| 677 | } else if (field_index == 2) { | ||
| 678 | // NFD, time to test. | ||
| 679 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 680 | defer w_buf.deinit(); | ||
| 681 | |||
| 682 | var w_fields = mem.split(u8, field, " "); | ||
| 683 | while (w_fields.next()) |s| { | ||
| 684 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 685 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 686 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 687 | } | ||
| 688 | |||
| 689 | const want = w_buf.items; | ||
| 690 | var got = try n.nfd(allocator, input); | ||
| 691 | defer got.deinit(); | ||
| 692 | |||
| 693 | try testing.expectEqualStrings(want, got.slice); | ||
| 694 | } else if (field_index == 3) { | ||
| 695 | // NFKC, time to test. | ||
| 696 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 697 | defer w_buf.deinit(); | ||
| 698 | |||
| 699 | var w_fields = mem.split(u8, field, " "); | ||
| 700 | while (w_fields.next()) |s| { | ||
| 701 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 702 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 703 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 704 | } | ||
| 705 | |||
| 706 | const want = w_buf.items; | ||
| 707 | var got = try n.nfkc(allocator, input); | ||
| 708 | defer got.deinit(); | ||
| 709 | |||
| 710 | try testing.expectEqualStrings(want, got.slice); | ||
| 711 | } else if (field_index == 4) { | ||
| 712 | // NFKD, time to test. | ||
| 713 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 714 | defer w_buf.deinit(); | ||
| 715 | |||
| 716 | var w_fields = mem.split(u8, field, " "); | ||
| 717 | while (w_fields.next()) |s| { | ||
| 718 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 719 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 720 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 721 | } | ||
| 722 | |||
| 723 | const want = w_buf.items; | ||
| 724 | const got = try n.nfkd(allocator, input); | ||
| 725 | defer got.deinit(); | ||
| 726 | |||
| 727 | try testing.expectEqualStrings(want, got.slice); | ||
| 728 | } else { | ||
| 729 | continue; | ||
| 730 | } | ||
| 731 | } | ||
| 732 | } | ||
| 733 | } | ||
| 734 | |||
| 735 | /// Returns true if `str` only contains Latin-1 Supplement | 616 | /// Returns true if `str` only contains Latin-1 Supplement |
| 736 | /// code points. Uses SIMD if possible. | 617 | /// code points. Uses SIMD if possible. |
| 737 | pub fn isLatin1Only(str: []const u8) bool { | 618 | pub fn isLatin1Only(str: []const u8) bool { |
diff --git a/src/grapheme.zig b/src/grapheme.zig index ad43cfd..f4cc68c 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -230,71 +230,6 @@ pub fn graphemeBreak( | |||
| 230 | return true; | 230 | return true; |
| 231 | } | 231 | } |
| 232 | 232 | ||
| 233 | test "Segmentation GraphemeIterator" { | ||
| 234 | const allocator = std.testing.allocator; | ||
| 235 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); | ||
| 236 | defer file.close(); | ||
| 237 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 238 | var input_stream = buf_reader.reader(); | ||
| 239 | |||
| 240 | const data = try GraphemeData.init(allocator); | ||
| 241 | defer data.deinit(); | ||
| 242 | |||
| 243 | var buf: [4096]u8 = undefined; | ||
| 244 | var line_no: usize = 1; | ||
| 245 | |||
| 246 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | ||
| 247 | // Skip comments or empty lines. | ||
| 248 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 249 | |||
| 250 | // Clean up. | ||
| 251 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 252 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | ||
| 253 | line = line[0..octo]; | ||
| 254 | } | ||
| 255 | // Iterate over fields. | ||
| 256 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 257 | defer want.deinit(); | ||
| 258 | |||
| 259 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 260 | defer all_bytes.deinit(); | ||
| 261 | |||
| 262 | var graphemes = std.mem.split(u8, line, " ÷ "); | ||
| 263 | var bytes_index: u32 = 0; | ||
| 264 | |||
| 265 | while (graphemes.next()) |field| { | ||
| 266 | var code_points = std.mem.split(u8, field, " "); | ||
| 267 | var cp_buf: [4]u8 = undefined; | ||
| 268 | var cp_index: u32 = 0; | ||
| 269 | var gc_len: u8 = 0; | ||
| 270 | |||
| 271 | while (code_points.next()) |code_point| { | ||
| 272 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 273 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 274 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 275 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 276 | cp_index += len; | ||
| 277 | gc_len += len; | ||
| 278 | } | ||
| 279 | |||
| 280 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 281 | bytes_index += cp_index; | ||
| 282 | } | ||
| 283 | |||
| 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | ||
| 285 | var iter = Iterator.init(all_bytes.items, &data); | ||
| 286 | |||
| 287 | // Check. | ||
| 288 | for (want.items) |want_gc| { | ||
| 289 | const got_gc = (iter.next()).?; | ||
| 290 | try std.testing.expectEqualStrings( | ||
| 291 | want_gc.bytes(all_bytes.items), | ||
| 292 | got_gc.bytes(all_bytes.items), | ||
| 293 | ); | ||
| 294 | } | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 233 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 299 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 234 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| 300 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 235 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig new file mode 100644 index 0000000..5442f63 --- /dev/null +++ b/src/unicode_tests.zig | |||
| @@ -0,0 +1,194 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const fmt = std.fmt; | ||
| 3 | const fs = std.fs; | ||
| 4 | const io = std.io; | ||
| 5 | const heap = std.heap; | ||
| 6 | const mem = std.mem; | ||
| 7 | const testing = std.testing; | ||
| 8 | const unicode = std.unicode; | ||
| 9 | |||
| 10 | const Grapheme = @import("grapheme").Grapheme; | ||
| 11 | const GraphemeData = @import("grapheme").GraphemeData; | ||
| 12 | const GraphemeIterator = @import("grapheme").Iterator; | ||
| 13 | const Normalize = @import("Normalize"); | ||
| 14 | |||
/// Decodes one field of a Unicode test data file, a space-separated list of
/// hexadecimal code points, into UTF-8 bytes.
/// Caller owns the returned slice and frees it with `allocator`.
/// Fails if a token is not valid hexadecimal, does not fit in `u21`, or
/// cannot be encoded as UTF-8.
fn parseCodePoints(allocator: mem.Allocator, field: []const u8) ![]u8 {
    var bytes = std.ArrayList(u8).init(allocator);
    errdefer bytes.deinit();

    var cp_buf: [4]u8 = undefined;
    var tokens = mem.splitScalar(u8, field, ' ');
    while (tokens.next()) |token| {
        const cp = try fmt.parseInt(u21, token, 16);
        const len = try unicode.utf8Encode(cp, &cp_buf);
        try bytes.appendSlice(cp_buf[0..len]);
    }

    return bytes.toOwnedSlice();
}

/// Compares a normalization result against `want` and then releases it.
/// `got` is the value returned by `nfc`/`nfd`/`nfkc`/`nfkd`; only its `slice`
/// field and `deinit` method are used here.
fn expectNormalized(want: []const u8, got: anytype) !void {
    defer got.deinit();
    try testing.expectEqualStrings(want, got.slice);
}

// Runs every data line of data/unicode/NormalizationTest.txt through the four
// normalization forms. Each data line is a ';'-separated record: column 0 is
// the source string, columns 1-4 are its expected NFC, NFD, NFKC and NFKD
// forms, and anything after that is a trailing comment.
test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Allocated from the arena, so `arena.deinit()` above releases it.
    const data = try Normalize.NormData.init(allocator);
    const n = Normalize{ .norm_data = &data };

    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip empty lines, comments ('#') and part headers ('@').
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // On any failure below, report which data line was being checked.
        errdefer std.debug.print("NormalizationTest.txt line {d}: {s}\n", .{ line_no, line });

        var fields = mem.splitScalar(u8, line, ';');

        // Parse the source column before registering its `defer`, so a parse
        // failure can never free an uninitialized slice.
        const input = try parseCodePoints(allocator, fields.next() orelse continue);
        defer allocator.free(input);

        var column: usize = 1;
        while (fields.next()) |field| : (column += 1) {
            // Everything past the NFKD column is the trailing comment.
            if (column > 4) break;

            const want = try parseCodePoints(allocator, field);
            defer allocator.free(want);

            switch (column) {
                1 => try expectNormalized(want, try n.nfc(allocator, input)),
                2 => try expectNormalized(want, try n.nfd(allocator, input)),
                3 => try expectNormalized(want, try n.nfkc(allocator, input)),
                4 => try expectNormalized(want, try n.nfkd(allocator, input)),
                else => unreachable, // guarded by the `column > 4` break above
            }
        }
    }
}
| 130 | |||
// Checks the grapheme iterator against every data line of
// data/unicode/auxiliary/GraphemeBreakTest.txt. In that file '÷' marks a
// cluster boundary and '×' marks "no boundary" between two code points, which
// are written as hexadecimal numbers.
test "Segmentation GraphemeIterator" {
    const allocator = testing.allocator;

    var file = try fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    const data = try GraphemeData.init(allocator);
    defer data.deinit();

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip empty lines, comments ('#') and headers ('@').
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // On any failure below, report which data line was being checked.
        errdefer std.debug.print("GraphemeBreakTest.txt line {d}: {s}\n", .{ line_no, raw });

        // Strip the leading boundary marker and the trailing " ÷\t# ..." comment.
        var line = mem.trimLeft(u8, raw, "÷ ");
        if (mem.indexOf(u8, line, " ÷\t#")) |comment_start| {
            line = line[0..comment_start];
        }

        // Expected clusters, as (length, offset) views into `all_bytes`.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of every code point on the line, in order.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        var clusters = mem.splitSequence(u8, line, " ÷ ");
        var offset: u32 = 0;

        while (clusters.next()) |cluster| {
            var cp_buf: [4]u8 = undefined;
            var cluster_len: u8 = 0;

            var tokens = mem.splitScalar(u8, cluster, ' ');
            while (tokens.next()) |token| {
                if (mem.eql(u8, token, "×")) continue;
                const cp = try fmt.parseInt(u21, token, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                cluster_len += len;
            }

            try want.append(Grapheme{ .len = cluster_len, .offset = offset });
            offset += cluster_len;
        }

        var iter = GraphemeIterator.init(all_bytes.items, &data);

        // Check each expected cluster. A short iterator fails the test with an
        // error instead of panicking on a null unwrap.
        for (want.items) |want_gc| {
            const got_gc = iter.next() orelse return error.MissingGrapheme;
            try testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            );
        }

        // The iterator must not produce clusters beyond the expected ones.
        try testing.expect(iter.next() == null);
    }
}