summaryrefslogtreecommitdiff
path: root/src/Normalize.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/Normalize.zig')
-rw-r--r--src/Normalize.zig121
1 files changed, 1 insertions, 120 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig
index daf774d..f437f4f 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,12 +3,10 @@
3//! NFKC, NFD, and NFKD normalization forms. 3//! NFKC, NFD, and NFKD normalization forms.
4 4
5const std = @import("std"); 5const std = @import("std");
6const assert = std.debug.assert;
7const debug = std.debug; 6const debug = std.debug;
7const assert = debug.assert;
8const fmt = std.fmt; 8const fmt = std.fmt;
9const fs = std.fs;
10const heap = std.heap; 9const heap = std.heap;
11const io = std.io;
12const mem = std.mem; 10const mem = std.mem;
13const simd = std.simd; 11const simd = std.simd;
14const testing = std.testing; 12const testing = std.testing;
@@ -615,123 +613,6 @@ test "isFcd" {
615 try testing.expect(!n.isFcd(not_fcd)); 613 try testing.expect(!n.isFcd(not_fcd));
616} 614}
617 615
/// Decodes one field of `NormalizationTest.txt` — hexadecimal code points
/// separated by single spaces — into a newly allocated UTF-8 byte string.
///
/// Caller owns the returned slice and must free it with `allocator`.
/// Fails if a token is not a valid hexadecimal `u21`, if a code point
/// cannot be encoded as UTF-8, or if allocation fails.
fn decodeNormalizationTestField(allocator: mem.Allocator, field: []const u8) ![]u8 {
    var bytes = std.ArrayList(u8).init(allocator);
    // Only release the partial buffer when we bail out early; on success
    // ownership moves to the caller through `toOwnedSlice`.
    errdefer bytes.deinit();

    var cp_buf: [4]u8 = undefined;
    var code_points = mem.split(u8, field, " ");
    while (code_points.next()) |hex| {
        const cp = try fmt.parseInt(u21, hex, 16);
        const len = try unicode.utf8Encode(cp, &cp_buf);
        try bytes.appendSlice(cp_buf[0..len]);
    }

    return bytes.toOwnedSlice();
}

/// Compares a normalization result against the expected bytes and then
/// releases the result. `got` is any value exposing a `slice` field and a
/// `deinit()` method, as returned by `nfc`, `nfd`, `nfkc` and `nfkd`.
fn expectNormalizationResult(want: []const u8, got: anytype) !void {
    defer got.deinit();
    try testing.expectEqualStrings(want, got.slice);
}

test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const data = try NormData.init(allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip comments, part headers (`@Part…`) and empty lines.
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // On any failure below, report which data line was being checked.
        errdefer debug.print("NormalizationTest.txt:{d}: {s}\n", .{ line_no, line });

        var fields = mem.split(u8, line, ";");

        // Field 0 is the source string. It is decoded *before* its `defer`
        // is registered so a parse failure never frees an undefined slice.
        const input = try decodeNormalizationTestField(allocator, fields.next() orelse continue);
        defer allocator.free(input);

        // Fields 1..4 hold the expected NFC, NFD, NFKC and NFKD forms of
        // the source string, in that order. Anything after them (the
        // trailing comment) is ignored.
        var field_index: usize = 1;
        while (fields.next()) |field| : (field_index += 1) {
            if (field_index > 4) break;

            const want = try decodeNormalizationTestField(allocator, field);
            defer allocator.free(want);

            switch (field_index) {
                1 => try expectNormalizationResult(want, try n.nfc(allocator, input)),
                2 => try expectNormalizationResult(want, try n.nfd(allocator, input)),
                3 => try expectNormalizationResult(want, try n.nfkc(allocator, input)),
                4 => try expectNormalizationResult(want, try n.nfkd(allocator, input)),
                else => unreachable, // guarded by the `field_index > 4` break above
            }
        }
    }
}
734
735/// Returns true if `str` only contains Latin-1 Supplement 616/// Returns true if `str` only contains Latin-1 Supplement
736/// code points. Uses SIMD if possible. 617/// code points. Uses SIMD if possible.
737pub fn isLatin1Only(str: []const u8) bool { 618pub fn isLatin1Only(str: []const u8) bool {