diff options
Diffstat (limited to 'src/Normalize.zig')
| -rw-r--r-- | src/Normalize.zig | 121 |
1 files changed, 1 insertions, 120 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig index daf774d..f437f4f 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -3,12 +3,10 @@ | |||
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const assert = std.debug.assert; | ||
| 7 | const debug = std.debug; | 6 | const debug = std.debug; |
| 7 | const assert = debug.assert; | ||
| 8 | const fmt = std.fmt; | 8 | const fmt = std.fmt; |
| 9 | const fs = std.fs; | ||
| 10 | const heap = std.heap; | 9 | const heap = std.heap; |
| 11 | const io = std.io; | ||
| 12 | const mem = std.mem; | 10 | const mem = std.mem; |
| 13 | const simd = std.simd; | 11 | const simd = std.simd; |
| 14 | const testing = std.testing; | 12 | const testing = std.testing; |
| @@ -615,123 +613,6 @@ test "isFcd" { | |||
| 615 | try testing.expect(!n.isFcd(not_fcd)); | 613 | try testing.expect(!n.isFcd(not_fcd)); |
| 616 | } | 614 | } |
| 617 | 615 | ||
| 618 | test "Unicode normalization tests" { | ||
| 619 | var arena = heap.ArenaAllocator.init(testing.allocator); | ||
| 620 | defer arena.deinit(); | ||
| 621 | var allocator = arena.allocator(); | ||
| 622 | |||
| 623 | const data = try NormData.init(allocator); | ||
| 624 | defer data.deinit(); | ||
| 625 | const n = Self{ .norm_data = &data }; | ||
| 626 | |||
| 627 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | ||
| 628 | defer file.close(); | ||
| 629 | var buf_reader = io.bufferedReader(file.reader()); | ||
| 630 | const input_stream = buf_reader.reader(); | ||
| 631 | |||
| 632 | var line_no: usize = 0; | ||
| 633 | var buf: [4096]u8 = undefined; | ||
| 634 | var cp_buf: [4]u8 = undefined; | ||
| 635 | |||
| 636 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 637 | line_no += 1; | ||
| 638 | // Skip comments or empty lines. | ||
| 639 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | ||
| 640 | // Iterate over fields. | ||
| 641 | var fields = mem.split(u8, line, ";"); | ||
| 642 | var field_index: usize = 0; | ||
| 643 | var input: []u8 = undefined; | ||
| 644 | defer allocator.free(input); | ||
| 645 | |||
| 646 | while (fields.next()) |field| : (field_index += 1) { | ||
| 647 | if (field_index == 0) { | ||
| 648 | var i_buf = std.ArrayList(u8).init(allocator); | ||
| 649 | defer i_buf.deinit(); | ||
| 650 | |||
| 651 | var i_fields = mem.split(u8, field, " "); | ||
| 652 | while (i_fields.next()) |s| { | ||
| 653 | const icp = try fmt.parseInt(u21, s, 16); | ||
| 654 | const len = try unicode.utf8Encode(icp, &cp_buf); | ||
| 655 | try i_buf.appendSlice(cp_buf[0..len]); | ||
| 656 | } | ||
| 657 | |||
| 658 | input = try i_buf.toOwnedSlice(); | ||
| 659 | } else if (field_index == 1) { | ||
| 660 | //debug.print("\n*** {s} ***\n", .{line}); | ||
| 661 | // NFC, time to test. | ||
| 662 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 663 | defer w_buf.deinit(); | ||
| 664 | |||
| 665 | var w_fields = mem.split(u8, field, " "); | ||
| 666 | while (w_fields.next()) |s| { | ||
| 667 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 668 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 669 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 670 | } | ||
| 671 | |||
| 672 | const want = w_buf.items; | ||
| 673 | var got = try n.nfc(allocator, input); | ||
| 674 | defer got.deinit(); | ||
| 675 | |||
| 676 | try testing.expectEqualStrings(want, got.slice); | ||
| 677 | } else if (field_index == 2) { | ||
| 678 | // NFD, time to test. | ||
| 679 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 680 | defer w_buf.deinit(); | ||
| 681 | |||
| 682 | var w_fields = mem.split(u8, field, " "); | ||
| 683 | while (w_fields.next()) |s| { | ||
| 684 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 685 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 686 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 687 | } | ||
| 688 | |||
| 689 | const want = w_buf.items; | ||
| 690 | var got = try n.nfd(allocator, input); | ||
| 691 | defer got.deinit(); | ||
| 692 | |||
| 693 | try testing.expectEqualStrings(want, got.slice); | ||
| 694 | } else if (field_index == 3) { | ||
| 695 | // NFKC, time to test. | ||
| 696 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 697 | defer w_buf.deinit(); | ||
| 698 | |||
| 699 | var w_fields = mem.split(u8, field, " "); | ||
| 700 | while (w_fields.next()) |s| { | ||
| 701 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 702 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 703 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 704 | } | ||
| 705 | |||
| 706 | const want = w_buf.items; | ||
| 707 | var got = try n.nfkc(allocator, input); | ||
| 708 | defer got.deinit(); | ||
| 709 | |||
| 710 | try testing.expectEqualStrings(want, got.slice); | ||
| 711 | } else if (field_index == 4) { | ||
| 712 | // NFKD, time to test. | ||
| 713 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 714 | defer w_buf.deinit(); | ||
| 715 | |||
| 716 | var w_fields = mem.split(u8, field, " "); | ||
| 717 | while (w_fields.next()) |s| { | ||
| 718 | const wcp = try fmt.parseInt(u21, s, 16); | ||
| 719 | const len = try unicode.utf8Encode(wcp, &cp_buf); | ||
| 720 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 721 | } | ||
| 722 | |||
| 723 | const want = w_buf.items; | ||
| 724 | const got = try n.nfkd(allocator, input); | ||
| 725 | defer got.deinit(); | ||
| 726 | |||
| 727 | try testing.expectEqualStrings(want, got.slice); | ||
| 728 | } else { | ||
| 729 | continue; | ||
| 730 | } | ||
| 731 | } | ||
| 732 | } | ||
| 733 | } | ||
| 734 | |||
| 735 | /// Returns true if `str` only contains Latin-1 Supplement | 616 | /// Returns true if `str` only contains Latin-1 Supplement |
| 736 | /// code points. Uses SIMD if possible. | 617 | /// code points. Uses SIMD if possible. |
| 737 | pub fn isLatin1Only(str: []const u8) bool { | 618 | pub fn isLatin1Only(str: []const u8) bool { |