4 files changed, 209 insertions, 216 deletions
diff --git a/build.zig b/build.zig
index 5f5680c..c05b4a1 100644
--- a/build.zig
+++ b/build.zig
@@ -196,13 +196,13 @@ pub fn build(b: *std.Build) void {
    });
    // Fixed pitch font display width
-    const dw_data = b.createModule(.{
+    const width_data = b.createModule(.{
        .root_source_file = .{ .path = "src/WidthData.zig" },
        .target = target,
        .optimize = optimize,
    });
-    dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
+    width_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
-    dw_data.addImport("GraphemeData", grapheme_data);
+    width_data.addImport("GraphemeData", grapheme_data);
    const display_width = b.addModule("DisplayWidth", .{
        .root_source_file = .{ .path = "src/DisplayWidth.zig" },
@@ -212,7 +212,7 @@ pub fn build(b: *std.Build) void {
    display_width.addImport("ascii", ascii);
    display_width.addImport("code_point", code_point);
    display_width.addImport("grapheme", grapheme);
-    display_width.addImport("DisplayWidthData", dw_data);
+    display_width.addImport("DisplayWidthData", width_data);
    // Normalization
    const ccc_data = b.createModule(.{
@@ -324,34 +324,17 @@ pub fn build(b: *std.Build) void {
    props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out });
    props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
-    // Tests
+    // Unicode Tests
-    const exe_unit_tests = b.addTest(.{
+    const unicode_tests = b.addTest(.{
-        .root_source_file = .{ .path = "src/PropsData.zig" },
+        .root_source_file = .{ .path = "src/unicode_tests.zig" },
        .target = target,
        .optimize = optimize,
    });
-    // exe_unit_tests.root_module.addImport("ascii", ascii);
+    unicode_tests.root_module.addImport("grapheme", grapheme);
-    // exe_unit_tests.root_module.addImport("code_point", code_point);
+    unicode_tests.root_module.addImport("Normalize", norm);
-    // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data);
-    // exe_unit_tests.root_module.addImport("grapheme", grapheme);
+    const run_unicode_tests = b.addRunArtifact(unicode_tests);
-    // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));
-    // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out });
+    const unicode_test_step = b.step("unicode-test", "Run Unicode tests");
-    // exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data);
+    unicode_test_step.dependOn(&run_unicode_tests.step);
-    // exe_unit_tests.root_module.addImport("NormData", norm_data);
-    // exe_unit_tests.root_module.addImport("Normalize", norm);
-    // exe_unit_tests.root_module.addImport("FoldData", fold_data);
-    // exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
-    // exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
-    // exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
-    // exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
-    // exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out });
-    exe_unit_tests.root_module.addAnonymousImport("core_props", .{ .root_source_file = core_gen_out });
-    exe_unit_tests.root_module.addAnonymousImport("props", .{ .root_source_file = props_gen_out });
-    exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
-    // exe_unit_tests.filter = "nfd !ASCII";
-    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
-    const test_step = b.step("test", "Run unit tests");
-    test_step.dependOn(&run_exe_unit_tests.step);
 }
diff --git a/src/Normalize.zig b/src/Normalize.zig
index daf774d..f437f4f 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,12 +3,10 @@
 //! NFKC, NFD, and NFKD normalization forms.
 const std = @import("std");
-const assert = std.debug.assert;
 const debug = std.debug;
+const assert = debug.assert;
 const fmt = std.fmt;
-const fs = std.fs;
 const heap = std.heap;
-const io = std.io;
 const mem = std.mem;
 const simd = std.simd;
 const testing = std.testing;
@@ -615,123 +613,6 @@ test "isFcd" {
    try testing.expect(!n.isFcd(not_fcd));
 }
-test "Unicode normalization tests" {
-    var arena = heap.ArenaAllocator.init(testing.allocator);
-    defer arena.deinit();
-    var allocator = arena.allocator();
-    const data = try NormData.init(allocator);
-    defer data.deinit();
-    const n = Self{ .norm_data = &data };
-    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
-    defer file.close();
-    var buf_reader = io.bufferedReader(file.reader());
-    const input_stream = buf_reader.reader();
-    var line_no: usize = 0;
-    var buf: [4096]u8 = undefined;
-    var cp_buf: [4]u8 = undefined;
-    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
-        line_no += 1;
-        // Skip comments or empty lines.
-        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
-        // Iterate over fields.
-        var fields = mem.split(u8, line, ";");
-        var field_index: usize = 0;
-        var input: []u8 = undefined;
-        defer allocator.free(input);
-        while (fields.next()) |field| : (field_index += 1) {
-            if (field_index == 0) {
-                var i_buf = std.ArrayList(u8).init(allocator);
-                defer i_buf.deinit();
-                var i_fields = mem.split(u8, field, " ");
-                while (i_fields.next()) |s| {
-                    const icp = try fmt.parseInt(u21, s, 16);
-                    const len = try unicode.utf8Encode(icp, &cp_buf);
-                    try i_buf.appendSlice(cp_buf[0..len]);
-                }
-                input = try i_buf.toOwnedSlice();
-            } else if (field_index == 1) {
-                //debug.print("\n*** {s} ***\n", .{line});
-                // NFC, time to test.
-                var w_buf = std.ArrayList(u8).init(allocator);
-                defer w_buf.deinit();
-                var w_fields = mem.split(u8, field, " ");
-                while (w_fields.next()) |s| {
-                    const wcp = try fmt.parseInt(u21, s, 16);
-                    const len = try unicode.utf8Encode(wcp, &cp_buf);
-                    try w_buf.appendSlice(cp_buf[0..len]);
-                }
-                const want = w_buf.items;
-                var got = try n.nfc(allocator, input);
-                defer got.deinit();
-                try testing.expectEqualStrings(want, got.slice);
-            } else if (field_index == 2) {
-                // NFD, time to test.
-                var w_buf = std.ArrayList(u8).init(allocator);
-                defer w_buf.deinit();
-                var w_fields = mem.split(u8, field, " ");
-                while (w_fields.next()) |s| {
-                    const wcp = try fmt.parseInt(u21, s, 16);
-                    const len = try unicode.utf8Encode(wcp, &cp_buf);
-                    try w_buf.appendSlice(cp_buf[0..len]);
-                }
-                const want = w_buf.items;
-                var got = try n.nfd(allocator, input);
-                defer got.deinit();
-                try testing.expectEqualStrings(want, got.slice);
-            } else if (field_index == 3) {
-                // NFKC, time to test.
-                var w_buf = std.ArrayList(u8).init(allocator);
-                defer w_buf.deinit();
-                var w_fields = mem.split(u8, field, " ");
-                while (w_fields.next()) |s| {
-                    const wcp = try fmt.parseInt(u21, s, 16);
-                    const len = try unicode.utf8Encode(wcp, &cp_buf);
-                    try w_buf.appendSlice(cp_buf[0..len]);
-                }
-                const want = w_buf.items;
-                var got = try n.nfkc(allocator, input);
-                defer got.deinit();
-                try testing.expectEqualStrings(want, got.slice);
-            } else if (field_index == 4) {
-                // NFKD, time to test.
-                var w_buf = std.ArrayList(u8).init(allocator);
-                defer w_buf.deinit();
-                var w_fields = mem.split(u8, field, " ");
-                while (w_fields.next()) |s| {
-                    const wcp = try fmt.parseInt(u21, s, 16);
-                    const len = try unicode.utf8Encode(wcp, &cp_buf);
-                    try w_buf.appendSlice(cp_buf[0..len]);
-                }
-                const want = w_buf.items;
-                const got = try n.nfkd(allocator, input);
-                defer got.deinit();
-                try testing.expectEqualStrings(want, got.slice);
-            } else {
-                continue;
-            }
-        }
-    }
-}
 /// Returns true if `str` only contains Latin-1 Supplement
 /// code points. Uses SIMD if possible.
 pub fn isLatin1Only(str: []const u8) bool {
diff --git a/src/grapheme.zig b/src/grapheme.zig
index ad43cfd..f4cc68c 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -230,71 +230,6 @@ pub fn graphemeBreak(
    return true;
 }
-test "Segmentation GraphemeIterator" {
-    const allocator = std.testing.allocator;
-    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
-    defer file.close();
-    var buf_reader = std.io.bufferedReader(file.reader());
-    var input_stream = buf_reader.reader();
-    const data = try GraphemeData.init(allocator);
-    defer data.deinit();
-    var buf: [4096]u8 = undefined;
-    var line_no: usize = 1;
-    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
-        // Skip comments or empty lines.
-        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
-        // Clean up.
-        var line = std.mem.trimLeft(u8, raw, "÷ ");
-        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
-            line = line[0..octo];
-        }
-        // Iterate over fields.
-        var want = std.ArrayList(Grapheme).init(allocator);
-        defer want.deinit();
-        var all_bytes = std.ArrayList(u8).init(allocator);
-        defer all_bytes.deinit();
-        var graphemes = std.mem.split(u8, line, " ÷ ");
-        var bytes_index: u32 = 0;
-        while (graphemes.next()) |field| {
-            var code_points = std.mem.split(u8, field, " ");
-            var cp_buf: [4]u8 = undefined;
-            var cp_index: u32 = 0;
-            var gc_len: u8 = 0;
-            while (code_points.next()) |code_point| {
-                if (std.mem.eql(u8, code_point, "×")) continue;
-                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
-                const len = try unicode.utf8Encode(cp, &cp_buf);
-                try all_bytes.appendSlice(cp_buf[0..len]);
-                cp_index += len;
-                gc_len += len;
-            }
-            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
-            bytes_index += cp_index;
-        }
-        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
-        var iter = Iterator.init(all_bytes.items, &data);
-        // Chaeck.
-        for (want.items) |want_gc| {
-            const got_gc = (iter.next()).?;
-            try std.testing.expectEqualStrings(
-                want_gc.bytes(all_bytes.items),
-                got_gc.bytes(all_bytes.items),
-            );
-        }
-    }
-}
 test "Segmentation ZWJ and ZWSP emoji sequences" {
    const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
    const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
new file mode 100644
index 0000000..5442f63
--- /dev/null
+++ b/src/unicode_tests.zig
@@ -0,0 +1,194 @@
+const std = @import("std");
+const fmt = std.fmt;
+const fs = std.fs;
+const io = std.io;
+const heap = std.heap;
+const mem = std.mem;
+const testing = std.testing;
+const unicode = std.unicode;
+const Grapheme = @import("grapheme").Grapheme;
+const GraphemeData = @import("grapheme").GraphemeData;
+const GraphemeIterator = @import("grapheme").Iterator;
+const Normalize = @import("Normalize");
+test "Unicode normalization tests" {
+    var arena = heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    var allocator = arena.allocator();
+    const data = try Normalize.NormData.init(allocator);
+    const n = Normalize{ .norm_data = &data };
+    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
+    defer file.close();
+    var buf_reader = io.bufferedReader(file.reader());
+    const input_stream = buf_reader.reader();
+    var line_no: usize = 0;
+    var buf: [4096]u8 = undefined;
+    var cp_buf: [4]u8 = undefined;
+    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
+        line_no += 1;
+        // Skip comments or empty lines.
+        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
+        // Iterate over fields.
+        var fields = mem.split(u8, line, ";");
+        var field_index: usize = 0;
+        var input: []u8 = undefined;
+        defer allocator.free(input);
+        while (fields.next()) |field| : (field_index += 1) {
+            if (field_index == 0) {
+                var i_buf = std.ArrayList(u8).init(allocator);
+                defer i_buf.deinit();
+                var i_fields = mem.split(u8, field, " ");
+                while (i_fields.next()) |s| {
+                    const icp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(icp, &cp_buf);
+                    try i_buf.appendSlice(cp_buf[0..len]);
+                }
+                input = try i_buf.toOwnedSlice();
+            } else if (field_index == 1) {
+                //debug.print("\n*** {s} ***\n", .{line});
+                // NFC, time to test.
+                var w_buf = std.ArrayList(u8).init(allocator);
+                defer w_buf.deinit();
+                var w_fields = mem.split(u8, field, " ");
+                while (w_fields.next()) |s| {
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
+                    try w_buf.appendSlice(cp_buf[0..len]);
+                }
+                const want = w_buf.items;
+                var got = try n.nfc(allocator, input);
+                defer got.deinit();
+                try testing.expectEqualStrings(want, got.slice);
+            } else if (field_index == 2) {
+                // NFD, time to test.
+                var w_buf = std.ArrayList(u8).init(allocator);
+                defer w_buf.deinit();
+                var w_fields = mem.split(u8, field, " ");
+                while (w_fields.next()) |s| {
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
+                    try w_buf.appendSlice(cp_buf[0..len]);
+                }
+                const want = w_buf.items;
+                var got = try n.nfd(allocator, input);
+                defer got.deinit();
+                try testing.expectEqualStrings(want, got.slice);
+            } else if (field_index == 3) {
+                // NFKC, time to test.
+                var w_buf = std.ArrayList(u8).init(allocator);
+                defer w_buf.deinit();
+                var w_fields = mem.split(u8, field, " ");
+                while (w_fields.next()) |s| {
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
+                    try w_buf.appendSlice(cp_buf[0..len]);
+                }
+                const want = w_buf.items;
+                var got = try n.nfkc(allocator, input);
+                defer got.deinit();
+                try testing.expectEqualStrings(want, got.slice);
+            } else if (field_index == 4) {
+                // NFKD, time to test.
+                var w_buf = std.ArrayList(u8).init(allocator);
+                defer w_buf.deinit();
+                var w_fields = mem.split(u8, field, " ");
+                while (w_fields.next()) |s| {
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
+                    try w_buf.appendSlice(cp_buf[0..len]);
+                }
+                const want = w_buf.items;
+                const got = try n.nfkd(allocator, input);
+                defer got.deinit();
+                try testing.expectEqualStrings(want, got.slice);
+            } else {
+                continue;
+            }
+        }
+    }
+}
+test "Segmentation GraphemeIterator" {
+    const allocator = std.testing.allocator;
+    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
+    defer file.close();
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var input_stream = buf_reader.reader();
+    const data = try GraphemeData.init(allocator);
+    defer data.deinit();
+    var buf: [4096]u8 = undefined;
+    var line_no: usize = 1;
+    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
+        // Skip comments or empty lines.
+        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
+        // Clean up.
+        var line = std.mem.trimLeft(u8, raw, "÷ ");
+        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
+            line = line[0..octo];
+        }
+        // Iterate over fields.
+        var want = std.ArrayList(Grapheme).init(allocator);
+        defer want.deinit();
+        var all_bytes = std.ArrayList(u8).init(allocator);
+        defer all_bytes.deinit();
+        var graphemes = std.mem.split(u8, line, " ÷ ");
+        var bytes_index: u32 = 0;
+        while (graphemes.next()) |field| {
+            var code_points = std.mem.split(u8, field, " ");
+            var cp_buf: [4]u8 = undefined;
+            var cp_index: u32 = 0;
+            var gc_len: u8 = 0;
+            while (code_points.next()) |code_point| {
+                if (std.mem.eql(u8, code_point, "×")) continue;
+                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
+                const len = try unicode.utf8Encode(cp, &cp_buf);
+                try all_bytes.appendSlice(cp_buf[0..len]);
+                cp_index += len;
+                gc_len += len;
+            }
+            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
+            bytes_index += cp_index;
+        }
+        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
+        var iter = GraphemeIterator.init(all_bytes.items, &data);
+        // Chaeck.
+        for (want.items) |want_gc| {
+            const got_gc = (iter.next()).?;
+            try std.testing.expectEqualStrings(
+                want_gc.bytes(all_bytes.items),
+                got_gc.bytes(all_bytes.items),
+            );
+        }
+    }
+}

diff --git a/build.zig b/build.zig index 5f5680c..c05b4a1 100644 --- a/build.zig +++ b/build.zig
@@ -196,13 +196,13 @@ pub fn build(b: *std.Build) void {
196	});	196	});
197		197
198	// Fixed pitch font display width	198	// Fixed pitch font display width
199	const dw_data = b.createModule(.{	199	const width_data = b.createModule(.{
200	.root_source_file = .{ .path = "src/WidthData.zig" },	200	.root_source_file = .{ .path = "src/WidthData.zig" },
201	.target = target,	201	.target = target,
202	.optimize = optimize,	202	.optimize = optimize,
203	});	203	});
204	dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });	204	width_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
205	dw_data.addImport("GraphemeData", grapheme_data);	205	width_data.addImport("GraphemeData", grapheme_data);
206		206
207	const display_width = b.addModule("DisplayWidth", .{	207	const display_width = b.addModule("DisplayWidth", .{
208	.root_source_file = .{ .path = "src/DisplayWidth.zig" },	208	.root_source_file = .{ .path = "src/DisplayWidth.zig" },
@@ -212,7 +212,7 @@ pub fn build(b: *std.Build) void {
212	display_width.addImport("ascii", ascii);	212	display_width.addImport("ascii", ascii);
213	display_width.addImport("code_point", code_point);	213	display_width.addImport("code_point", code_point);
214	display_width.addImport("grapheme", grapheme);	214	display_width.addImport("grapheme", grapheme);
215	display_width.addImport("DisplayWidthData", dw_data);	215	display_width.addImport("DisplayWidthData", width_data);
216		216
217	// Normalization	217	// Normalization
218	const ccc_data = b.createModule(.{	218	const ccc_data = b.createModule(.{
@@ -324,34 +324,17 @@ pub fn build(b: *std.Build) void {
324	props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out });	324	props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out });
325	props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });	325	props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
326		326
327	// Tests	327	// Unicode Tests
328	const exe_unit_tests = b.addTest(.{	328	const unicode_tests = b.addTest(.{
329	.root_source_file = .{ .path = "src/PropsData.zig" },	329	.root_source_file = .{ .path = "src/unicode_tests.zig" },
330	.target = target,	330	.target = target,
331	.optimize = optimize,	331	.optimize = optimize,
332	});	332	});
333	// exe_unit_tests.root_module.addImport("ascii", ascii);	333	unicode_tests.root_module.addImport("grapheme", grapheme);
334	// exe_unit_tests.root_module.addImport("code_point", code_point);	334	unicode_tests.root_module.addImport("Normalize", norm);
335	// exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data);	335
336	// exe_unit_tests.root_module.addImport("grapheme", grapheme);	336	const run_unicode_tests = b.addRunArtifact(unicode_tests);
337	// exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));	337
338	// exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out });	338	const unicode_test_step = b.step("unicode-test", "Run Unicode tests");
339	// exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data);	339	unicode_test_step.dependOn(&run_unicode_tests.step);
340	// exe_unit_tests.root_module.addImport("NormData", norm_data);
341	// exe_unit_tests.root_module.addImport("Normalize", norm);
342	// exe_unit_tests.root_module.addImport("FoldData", fold_data);
343	// exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
344	// exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
345	// exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
346	// exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
347	// exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out });
348	exe_unit_tests.root_module.addAnonymousImport("core_props", .{ .root_source_file = core_gen_out });
349	exe_unit_tests.root_module.addAnonymousImport("props", .{ .root_source_file = props_gen_out });
350	exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
351	// exe_unit_tests.filter = "nfd !ASCII";
352
353	const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
354
355	const test_step = b.step("test", "Run unit tests");
356	test_step.dependOn(&run_exe_unit_tests.step);
357	}	340	}


diff --git a/src/Normalize.zig b/src/Normalize.zig index daf774d..f437f4f 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig
@@ -3,12 +3,10 @@
3	//! NFKC, NFD, and NFKD normalization forms.	3	//! NFKC, NFD, and NFKD normalization forms.
4		4
5	const std = @import("std");	5	const std = @import("std");
6	const assert = std.debug.assert;
7	const debug = std.debug;	6	const debug = std.debug;
		7	const assert = debug.assert;
8	const fmt = std.fmt;	8	const fmt = std.fmt;
9	const fs = std.fs;
10	const heap = std.heap;	9	const heap = std.heap;
11	const io = std.io;
12	const mem = std.mem;	10	const mem = std.mem;
13	const simd = std.simd;	11	const simd = std.simd;
14	const testing = std.testing;	12	const testing = std.testing;
@@ -615,123 +613,6 @@ test "isFcd" {
615	try testing.expect(!n.isFcd(not_fcd));	613	try testing.expect(!n.isFcd(not_fcd));
616	}	614	}
617		615
618	test "Unicode normalization tests" {
619	var arena = heap.ArenaAllocator.init(testing.allocator);
620	defer arena.deinit();
621	var allocator = arena.allocator();
622
623	const data = try NormData.init(allocator);
624	defer data.deinit();
625	const n = Self{ .norm_data = &data };
626
627	var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
628	defer file.close();
629	var buf_reader = io.bufferedReader(file.reader());
630	const input_stream = buf_reader.reader();
631
632	var line_no: usize = 0;
633	var buf: [4096]u8 = undefined;
634	var cp_buf: [4]u8 = undefined;
635
636	while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
637	line_no += 1;
638	// Skip comments or empty lines.
639	if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
640	// Iterate over fields.
641	var fields = mem.split(u8, line, ";");
642	var field_index: usize = 0;
643	var input: []u8 = undefined;
644	defer allocator.free(input);
645
646	while (fields.next()) |field| : (field_index += 1) {
647	if (field_index == 0) {
648	var i_buf = std.ArrayList(u8).init(allocator);
649	defer i_buf.deinit();
650
651	var i_fields = mem.split(u8, field, " ");
652	while (i_fields.next()) |s| {
653	const icp = try fmt.parseInt(u21, s, 16);
654	const len = try unicode.utf8Encode(icp, &cp_buf);
655	try i_buf.appendSlice(cp_buf[0..len]);
656	}
657
658	input = try i_buf.toOwnedSlice();
659	} else if (field_index == 1) {
660	//debug.print("\n*** {s} ***\n", .{line});
661	// NFC, time to test.
662	var w_buf = std.ArrayList(u8).init(allocator);
663	defer w_buf.deinit();
664
665	var w_fields = mem.split(u8, field, " ");
666	while (w_fields.next()) |s| {
667	const wcp = try fmt.parseInt(u21, s, 16);
668	const len = try unicode.utf8Encode(wcp, &cp_buf);
669	try w_buf.appendSlice(cp_buf[0..len]);
670	}
671
672	const want = w_buf.items;
673	var got = try n.nfc(allocator, input);
674	defer got.deinit();
675
676	try testing.expectEqualStrings(want, got.slice);
677	} else if (field_index == 2) {
678	// NFD, time to test.
679	var w_buf = std.ArrayList(u8).init(allocator);
680	defer w_buf.deinit();
681
682	var w_fields = mem.split(u8, field, " ");
683	while (w_fields.next()) |s| {
684	const wcp = try fmt.parseInt(u21, s, 16);
685	const len = try unicode.utf8Encode(wcp, &cp_buf);
686	try w_buf.appendSlice(cp_buf[0..len]);
687	}
688
689	const want = w_buf.items;
690	var got = try n.nfd(allocator, input);
691	defer got.deinit();
692
693	try testing.expectEqualStrings(want, got.slice);
694	} else if (field_index == 3) {
695	// NFKC, time to test.
696	var w_buf = std.ArrayList(u8).init(allocator);
697	defer w_buf.deinit();
698
699	var w_fields = mem.split(u8, field, " ");
700	while (w_fields.next()) \|s\| {
701	const wcp = try fmt.parseInt(u21, s, 16);
702	const len = try unicode.utf8Encode(wcp, &cp_buf);
703	try w_buf.appendSlice(cp_buf[0..len]);
704	}
705
706	const want = w_buf.items;
707	var got = try n.nfkc(allocator, input);
708	defer got.deinit();
709
710	try testing.expectEqualStrings(want, got.slice);
711	} else if (field_index == 4) {
712	// NFKD, time to test.
713	var w_buf = std.ArrayList(u8).init(allocator);
714	defer w_buf.deinit();
715
716	var w_fields = mem.split(u8, field, " ");
717	while (w_fields.next()) |s| {
718	const wcp = try fmt.parseInt(u21, s, 16);
719	const len = try unicode.utf8Encode(wcp, &cp_buf);
720	try w_buf.appendSlice(cp_buf[0..len]);
721	}
722
723	const want = w_buf.items;
724	const got = try n.nfkd(allocator, input);
725	defer got.deinit();
726
727	try testing.expectEqualStrings(want, got.slice);
728	} else {
729	continue;
730	}
731	}
732	}
733	}
734
735	/// Returns true if `str` only contains Latin-1 Supplement	616	/// Returns true if `str` only contains Latin-1 Supplement
736	/// code points. Uses SIMD if possible.	617	/// code points. Uses SIMD if possible.
737	pub fn isLatin1Only(str: []const u8) bool {	618	pub fn isLatin1Only(str: []const u8) bool {


diff --git a/src/grapheme.zig b/src/grapheme.zig index ad43cfd..f4cc68c 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig
@@ -230,71 +230,6 @@ pub fn graphemeBreak(
230	return true;	230	return true;
231	}	231	}
232		232
233	test "Segmentation GraphemeIterator" {
234	const allocator = std.testing.allocator;
235	var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
236	defer file.close();
237	var buf_reader = std.io.bufferedReader(file.reader());
238	var input_stream = buf_reader.reader();
239
240	const data = try GraphemeData.init(allocator);
241	defer data.deinit();
242
243	var buf: [4096]u8 = undefined;
244	var line_no: usize = 1;
245
246	while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
247	// Skip comments or empty lines.
248	if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
249
250	// Clean up.
251	var line = std.mem.trimLeft(u8, raw, "÷ ");
252	if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
253	line = line[0..octo];
254	}
255	// Iterate over fields.
256	var want = std.ArrayList(Grapheme).init(allocator);
257	defer want.deinit();
258
259	var all_bytes = std.ArrayList(u8).init(allocator);
260	defer all_bytes.deinit();
261
262	var graphemes = std.mem.split(u8, line, " ÷ ");
263	var bytes_index: u32 = 0;
264
265	while (graphemes.next()) |field| {
266	var code_points = std.mem.split(u8, field, " ");
267	var cp_buf: [4]u8 = undefined;
268	var cp_index: u32 = 0;
269	var gc_len: u8 = 0;
270
271	while (code_points.next()) |code_point| {
272	if (std.mem.eql(u8, code_point, "×")) continue;
273	const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
274	const len = try unicode.utf8Encode(cp, &cp_buf);
275	try all_bytes.appendSlice(cp_buf[0..len]);
276	cp_index += len;
277	gc_len += len;
278	}
279
280	try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
281	bytes_index += cp_index;
282	}
283
284	// std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
285	var iter = Iterator.init(all_bytes.items, &data);
286
287	// Chaeck.
288	for (want.items) |want_gc| {
289	const got_gc = (iter.next()).?;
290	try std.testing.expectEqualStrings(
291	want_gc.bytes(all_bytes.items),
292	got_gc.bytes(all_bytes.items),
293	);
294	}
295	}
296	}
297
298	test "Segmentation ZWJ and ZWSP emoji sequences" {	233	test "Segmentation ZWJ and ZWSP emoji sequences" {
299	const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";	234	const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
300	const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";	235	const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";


diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig new file mode 100644 index 0000000..5442f63 --- /dev/null +++ b/src/unicode_tests.zig
@@ -0,0 +1,194 @@
		1	const std = @import("std");
		2	const fmt = std.fmt;
		3	const fs = std.fs;
		4	const io = std.io;
		5	const heap = std.heap;
		6	const mem = std.mem;
		7	const testing = std.testing;
		8	const unicode = std.unicode;
		9
		10	const Grapheme = @import("grapheme").Grapheme;
		11	const GraphemeData = @import("grapheme").GraphemeData;
		12	const GraphemeIterator = @import("grapheme").Iterator;
		13	const Normalize = @import("Normalize");
		14
		15	test "Unicode normalization tests" {
		16	var arena = heap.ArenaAllocator.init(testing.allocator);
		17	defer arena.deinit();
		18	var allocator = arena.allocator();
		19
		20	const data = try Normalize.NormData.init(allocator);
		21	const n = Normalize{ .norm_data = &data };
		22
		23	var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
		24	defer file.close();
		25	var buf_reader = io.bufferedReader(file.reader());
		26	const input_stream = buf_reader.reader();
		27
		28	var line_no: usize = 0;
		29	var buf: [4096]u8 = undefined;
		30	var cp_buf: [4]u8 = undefined;
		31
		32	while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) \|line\| {
		33	line_no += 1;
		34	// Skip comments or empty lines.
		35	if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
		36	// Iterate over fields.
		37	var fields = mem.split(u8, line, ";");
		38	var field_index: usize = 0;
		39	var input: []u8 = undefined;
		40	defer allocator.free(input);
		41
		42	while (fields.next()) \|field\| : (field_index += 1) {
		43	if (field_index == 0) {
		44	var i_buf = std.ArrayList(u8).init(allocator);
		45	defer i_buf.deinit();
		46
		47	var i_fields = mem.split(u8, field, " ");
		48	while (i_fields.next()) \|s\| {
		49	const icp = try fmt.parseInt(u21, s, 16);
		50	const len = try unicode.utf8Encode(icp, &cp_buf);
		51	try i_buf.appendSlice(cp_buf[0..len]);
		52	}
		53
		54	input = try i_buf.toOwnedSlice();
		55	} else if (field_index == 1) {
		56	//debug.print("\n* {s} *\n", .{line});
		57	// NFC, time to test.
		58	var w_buf = std.ArrayList(u8).init(allocator);
		59	defer w_buf.deinit();
		60
		61	var w_fields = mem.split(u8, field, " ");
		62	while (w_fields.next()) \|s\| {
		63	const wcp = try fmt.parseInt(u21, s, 16);
		64	const len = try unicode.utf8Encode(wcp, &cp_buf);
		65	try w_buf.appendSlice(cp_buf[0..len]);
		66	}
		67
		68	const want = w_buf.items;
		69	var got = try n.nfc(allocator, input);
		70	defer got.deinit();
		71
		72	try testing.expectEqualStrings(want, got.slice);
		73	} else if (field_index == 2) {
		74	// NFD, time to test.
		75	var w_buf = std.ArrayList(u8).init(allocator);
		76	defer w_buf.deinit();
		77
		78	var w_fields = mem.split(u8, field, " ");
		79	while (w_fields.next()) \|s\| {
		80	const wcp = try fmt.parseInt(u21, s, 16);
		81	const len = try unicode.utf8Encode(wcp, &cp_buf);
		82	try w_buf.appendSlice(cp_buf[0..len]);
		83	}
		84
		85	const want = w_buf.items;
		86	var got = try n.nfd(allocator, input);
		87	defer got.deinit();
		88
		89	try testing.expectEqualStrings(want, got.slice);
		90	} else if (field_index == 3) {
		91	// NFKC, time to test.
		92	var w_buf = std.ArrayList(u8).init(allocator);
		93	defer w_buf.deinit();
		94
		95	var w_fields = mem.split(u8, field, " ");
		96	while (w_fields.next()) \|s\| {
		97	const wcp = try fmt.parseInt(u21, s, 16);
		98	const len = try unicode.utf8Encode(wcp, &cp_buf);
		99	try w_buf.appendSlice(cp_buf[0..len]);
		100	}
		101
		102	const want = w_buf.items;
		103	var got = try n.nfkc(allocator, input);
		104	defer got.deinit();
		105
		106	try testing.expectEqualStrings(want, got.slice);
		107	} else if (field_index == 4) {
		108	// NFKD, time to test.
		109	var w_buf = std.ArrayList(u8).init(allocator);
		110	defer w_buf.deinit();
		111
		112	var w_fields = mem.split(u8, field, " ");
		113	while (w_fields.next()) \|s\| {
		114	const wcp = try fmt.parseInt(u21, s, 16);
		115	const len = try unicode.utf8Encode(wcp, &cp_buf);
		116	try w_buf.appendSlice(cp_buf[0..len]);
		117	}
		118
		119	const want = w_buf.items;
		120	const got = try n.nfkd(allocator, input);
		121	defer got.deinit();
		122
		123	try testing.expectEqualStrings(want, got.slice);
		124	} else {
		125	continue;
		126	}
		127	}
		128	}
		129	}
		130
		131	test "Segmentation GraphemeIterator" {
		132	const allocator = std.testing.allocator;
		133	var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
		134	defer file.close();
		135	var buf_reader = std.io.bufferedReader(file.reader());
		136	var input_stream = buf_reader.reader();
		137
		138	const data = try GraphemeData.init(allocator);
		139	defer data.deinit();
		140
		141	var buf: [4096]u8 = undefined;
		142	var line_no: usize = 1;
		143
		144	while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) \|raw\| : (line_no += 1) {
		145	// Skip comments or empty lines.
		146	if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
		147
		148	// Clean up.
		149	var line = std.mem.trimLeft(u8, raw, "÷ ");
		150	if (std.mem.indexOf(u8, line, " ÷\t#")) \|octo\| {
		151	line = line[0..octo];
		152	}
		153	// Iterate over fields.
		154	var want = std.ArrayList(Grapheme).init(allocator);
		155	defer want.deinit();
		156
		157	var all_bytes = std.ArrayList(u8).init(allocator);
		158	defer all_bytes.deinit();
		159
		160	var graphemes = std.mem.split(u8, line, " ÷ ");
		161	var bytes_index: u32 = 0;
		162
		163	while (graphemes.next()) \|field\| {
		164	var code_points = std.mem.split(u8, field, " ");
		165	var cp_buf: [4]u8 = undefined;
		166	var cp_index: u32 = 0;
		167	var gc_len: u8 = 0;
		168
		169	while (code_points.next()) \|code_point\| {
		170	if (std.mem.eql(u8, code_point, "×")) continue;
		171	const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
		172	const len = try unicode.utf8Encode(cp, &cp_buf);
		173	try all_bytes.appendSlice(cp_buf[0..len]);
		174	cp_index += len;
		175	gc_len += len;
		176	}
		177
		178	try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
		179	bytes_index += cp_index;
		180	}
		181
		182	// std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
		183	var iter = GraphemeIterator.init(all_bytes.items, &data);
		184
		185	// Chaeck.
		186	for (want.items) \|want_gc\| {
		187	const got_gc = (iter.next()).?;
		188	try std.testing.expectEqualStrings(
		189	want_gc.bytes(all_bytes.items),
		190	got_gc.bytes(all_bytes.items),
		191	);
		192	}
		193	}
		194	}