summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- build.zig 45
-rw-r--r-- src/Normalize.zig 121
-rw-r--r-- src/grapheme.zig 65
-rw-r--r-- src/unicode_tests.zig 194
4 files changed, 209 insertions, 216 deletions
diff --git a/build.zig b/build.zig
index 5f5680c..c05b4a1 100644
--- a/build.zig
+++ b/build.zig
@@ -196,13 +196,13 @@ pub fn build(b: *std.Build) void {
196 }); 196 });
197 197
198 // Fixed pitch font display width 198 // Fixed pitch font display width
199 const dw_data = b.createModule(.{ 199 const width_data = b.createModule(.{
200 .root_source_file = .{ .path = "src/WidthData.zig" }, 200 .root_source_file = .{ .path = "src/WidthData.zig" },
201 .target = target, 201 .target = target,
202 .optimize = optimize, 202 .optimize = optimize,
203 }); 203 });
204 dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); 204 width_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
205 dw_data.addImport("GraphemeData", grapheme_data); 205 width_data.addImport("GraphemeData", grapheme_data);
206 206
207 const display_width = b.addModule("DisplayWidth", .{ 207 const display_width = b.addModule("DisplayWidth", .{
208 .root_source_file = .{ .path = "src/DisplayWidth.zig" }, 208 .root_source_file = .{ .path = "src/DisplayWidth.zig" },
@@ -212,7 +212,7 @@ pub fn build(b: *std.Build) void {
212 display_width.addImport("ascii", ascii); 212 display_width.addImport("ascii", ascii);
213 display_width.addImport("code_point", code_point); 213 display_width.addImport("code_point", code_point);
214 display_width.addImport("grapheme", grapheme); 214 display_width.addImport("grapheme", grapheme);
215 display_width.addImport("DisplayWidthData", dw_data); 215 display_width.addImport("DisplayWidthData", width_data);
216 216
217 // Normalization 217 // Normalization
218 const ccc_data = b.createModule(.{ 218 const ccc_data = b.createModule(.{
@@ -324,34 +324,17 @@ pub fn build(b: *std.Build) void {
324 props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); 324 props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out });
325 props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); 325 props_data.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
326 326
327 // Tests 327 // Unicode Tests
328 const exe_unit_tests = b.addTest(.{ 328 const unicode_tests = b.addTest(.{
329 .root_source_file = .{ .path = "src/PropsData.zig" }, 329 .root_source_file = .{ .path = "src/unicode_tests.zig" },
330 .target = target, 330 .target = target,
331 .optimize = optimize, 331 .optimize = optimize,
332 }); 332 });
333 // exe_unit_tests.root_module.addImport("ascii", ascii); 333 unicode_tests.root_module.addImport("grapheme", grapheme);
334 // exe_unit_tests.root_module.addImport("code_point", code_point); 334 unicode_tests.root_module.addImport("Normalize", norm);
335 // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); 335
336 // exe_unit_tests.root_module.addImport("grapheme", grapheme); 336 const run_unicode_tests = b.addRunArtifact(unicode_tests);
337 // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 337
338 // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); 338 const unicode_test_step = b.step("unicode-test", "Run Unicode tests");
339 // exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); 339 unicode_test_step.dependOn(&run_unicode_tests.step);
340 // exe_unit_tests.root_module.addImport("NormData", norm_data);
341 // exe_unit_tests.root_module.addImport("Normalize", norm);
342 // exe_unit_tests.root_module.addImport("FoldData", fold_data);
343 // exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
344 // exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
345 // exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
346 // exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
347 // exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out });
348 exe_unit_tests.root_module.addAnonymousImport("core_props", .{ .root_source_file = core_gen_out });
349 exe_unit_tests.root_module.addAnonymousImport("props", .{ .root_source_file = props_gen_out });
350 exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
351 // exe_unit_tests.filter = "nfd !ASCII";
352
353 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
354
355 const test_step = b.step("test", "Run unit tests");
356 test_step.dependOn(&run_exe_unit_tests.step);
357} 340}
diff --git a/src/Normalize.zig b/src/Normalize.zig
index daf774d..f437f4f 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,12 +3,10 @@
3//! NFKC, NFD, and NFKD normalization forms. 3//! NFKC, NFD, and NFKD normalization forms.
4 4
5const std = @import("std"); 5const std = @import("std");
6const assert = std.debug.assert;
7const debug = std.debug; 6const debug = std.debug;
7const assert = debug.assert;
8const fmt = std.fmt; 8const fmt = std.fmt;
9const fs = std.fs;
10const heap = std.heap; 9const heap = std.heap;
11const io = std.io;
12const mem = std.mem; 10const mem = std.mem;
13const simd = std.simd; 11const simd = std.simd;
14const testing = std.testing; 12const testing = std.testing;
@@ -615,123 +613,6 @@ test "isFcd" {
615 try testing.expect(!n.isFcd(not_fcd)); 613 try testing.expect(!n.isFcd(not_fcd));
616} 614}
617 615
618test "Unicode normalization tests" {
619 var arena = heap.ArenaAllocator.init(testing.allocator);
620 defer arena.deinit();
621 var allocator = arena.allocator();
622
623 const data = try NormData.init(allocator);
624 defer data.deinit();
625 const n = Self{ .norm_data = &data };
626
627 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
628 defer file.close();
629 var buf_reader = io.bufferedReader(file.reader());
630 const input_stream = buf_reader.reader();
631
632 var line_no: usize = 0;
633 var buf: [4096]u8 = undefined;
634 var cp_buf: [4]u8 = undefined;
635
636 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
637 line_no += 1;
638 // Skip comments or empty lines.
639 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
640 // Iterate over fields.
641 var fields = mem.split(u8, line, ";");
642 var field_index: usize = 0;
643 var input: []u8 = undefined;
644 defer allocator.free(input);
645
646 while (fields.next()) |field| : (field_index += 1) {
647 if (field_index == 0) {
648 var i_buf = std.ArrayList(u8).init(allocator);
649 defer i_buf.deinit();
650
651 var i_fields = mem.split(u8, field, " ");
652 while (i_fields.next()) |s| {
653 const icp = try fmt.parseInt(u21, s, 16);
654 const len = try unicode.utf8Encode(icp, &cp_buf);
655 try i_buf.appendSlice(cp_buf[0..len]);
656 }
657
658 input = try i_buf.toOwnedSlice();
659 } else if (field_index == 1) {
660 //debug.print("\n*** {s} ***\n", .{line});
661 // NFC, time to test.
662 var w_buf = std.ArrayList(u8).init(allocator);
663 defer w_buf.deinit();
664
665 var w_fields = mem.split(u8, field, " ");
666 while (w_fields.next()) |s| {
667 const wcp = try fmt.parseInt(u21, s, 16);
668 const len = try unicode.utf8Encode(wcp, &cp_buf);
669 try w_buf.appendSlice(cp_buf[0..len]);
670 }
671
672 const want = w_buf.items;
673 var got = try n.nfc(allocator, input);
674 defer got.deinit();
675
676 try testing.expectEqualStrings(want, got.slice);
677 } else if (field_index == 2) {
678 // NFD, time to test.
679 var w_buf = std.ArrayList(u8).init(allocator);
680 defer w_buf.deinit();
681
682 var w_fields = mem.split(u8, field, " ");
683 while (w_fields.next()) |s| {
684 const wcp = try fmt.parseInt(u21, s, 16);
685 const len = try unicode.utf8Encode(wcp, &cp_buf);
686 try w_buf.appendSlice(cp_buf[0..len]);
687 }
688
689 const want = w_buf.items;
690 var got = try n.nfd(allocator, input);
691 defer got.deinit();
692
693 try testing.expectEqualStrings(want, got.slice);
694 } else if (field_index == 3) {
695 // NFKC, time to test.
696 var w_buf = std.ArrayList(u8).init(allocator);
697 defer w_buf.deinit();
698
699 var w_fields = mem.split(u8, field, " ");
700 while (w_fields.next()) |s| {
701 const wcp = try fmt.parseInt(u21, s, 16);
702 const len = try unicode.utf8Encode(wcp, &cp_buf);
703 try w_buf.appendSlice(cp_buf[0..len]);
704 }
705
706 const want = w_buf.items;
707 var got = try n.nfkc(allocator, input);
708 defer got.deinit();
709
710 try testing.expectEqualStrings(want, got.slice);
711 } else if (field_index == 4) {
712 // NFKD, time to test.
713 var w_buf = std.ArrayList(u8).init(allocator);
714 defer w_buf.deinit();
715
716 var w_fields = mem.split(u8, field, " ");
717 while (w_fields.next()) |s| {
718 const wcp = try fmt.parseInt(u21, s, 16);
719 const len = try unicode.utf8Encode(wcp, &cp_buf);
720 try w_buf.appendSlice(cp_buf[0..len]);
721 }
722
723 const want = w_buf.items;
724 const got = try n.nfkd(allocator, input);
725 defer got.deinit();
726
727 try testing.expectEqualStrings(want, got.slice);
728 } else {
729 continue;
730 }
731 }
732 }
733}
734
735/// Returns true if `str` only contains Latin-1 Supplement 616/// Returns true if `str` only contains Latin-1 Supplement
736/// code points. Uses SIMD if possible. 617/// code points. Uses SIMD if possible.
737pub fn isLatin1Only(str: []const u8) bool { 618pub fn isLatin1Only(str: []const u8) bool {
diff --git a/src/grapheme.zig b/src/grapheme.zig
index ad43cfd..f4cc68c 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -230,71 +230,6 @@ pub fn graphemeBreak(
230 return true; 230 return true;
231} 231}
232 232
233test "Segmentation GraphemeIterator" {
234 const allocator = std.testing.allocator;
235 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
236 defer file.close();
237 var buf_reader = std.io.bufferedReader(file.reader());
238 var input_stream = buf_reader.reader();
239
240 const data = try GraphemeData.init(allocator);
241 defer data.deinit();
242
243 var buf: [4096]u8 = undefined;
244 var line_no: usize = 1;
245
246 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
247 // Skip comments or empty lines.
248 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
249
250 // Clean up.
251 var line = std.mem.trimLeft(u8, raw, "÷ ");
252 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
253 line = line[0..octo];
254 }
255 // Iterate over fields.
256 var want = std.ArrayList(Grapheme).init(allocator);
257 defer want.deinit();
258
259 var all_bytes = std.ArrayList(u8).init(allocator);
260 defer all_bytes.deinit();
261
262 var graphemes = std.mem.split(u8, line, " ÷ ");
263 var bytes_index: u32 = 0;
264
265 while (graphemes.next()) |field| {
266 var code_points = std.mem.split(u8, field, " ");
267 var cp_buf: [4]u8 = undefined;
268 var cp_index: u32 = 0;
269 var gc_len: u8 = 0;
270
271 while (code_points.next()) |code_point| {
272 if (std.mem.eql(u8, code_point, "×")) continue;
273 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
274 const len = try unicode.utf8Encode(cp, &cp_buf);
275 try all_bytes.appendSlice(cp_buf[0..len]);
276 cp_index += len;
277 gc_len += len;
278 }
279
280 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
281 bytes_index += cp_index;
282 }
283
284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
285 var iter = Iterator.init(all_bytes.items, &data);
286
287 // Check.
288 for (want.items) |want_gc| {
289 const got_gc = (iter.next()).?;
290 try std.testing.expectEqualStrings(
291 want_gc.bytes(all_bytes.items),
292 got_gc.bytes(all_bytes.items),
293 );
294 }
295 }
296}
297
298test "Segmentation ZWJ and ZWSP emoji sequences" { 233test "Segmentation ZWJ and ZWSP emoji sequences" {
299 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 234 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
300 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 235 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
new file mode 100644
index 0000000..5442f63
--- /dev/null
+++ b/src/unicode_tests.zig
@@ -0,0 +1,194 @@
1const std = @import("std");
2const fmt = std.fmt;
3const fs = std.fs;
4const io = std.io;
5const heap = std.heap;
6const mem = std.mem;
7const testing = std.testing;
8const unicode = std.unicode;
9
10const Grapheme = @import("grapheme").Grapheme;
11const GraphemeData = @import("grapheme").GraphemeData;
12const GraphemeIterator = @import("grapheme").Iterator;
13const Normalize = @import("Normalize");
14
15test "Unicode normalization tests" {
16 var arena = heap.ArenaAllocator.init(testing.allocator);
17 defer arena.deinit();
18 var allocator = arena.allocator();
19
20 const data = try Normalize.NormData.init(allocator);
21 const n = Normalize{ .norm_data = &data };
22
23 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
24 defer file.close();
25 var buf_reader = io.bufferedReader(file.reader());
26 const input_stream = buf_reader.reader();
27
28 var line_no: usize = 0;
29 var buf: [4096]u8 = undefined;
30 var cp_buf: [4]u8 = undefined;
31
32 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
33 line_no += 1;
34 // Skip comments or empty lines.
35 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
36 // Iterate over fields.
37 var fields = mem.split(u8, line, ";");
38 var field_index: usize = 0;
39 var input: []u8 = undefined;
40 defer allocator.free(input);
41
42 while (fields.next()) |field| : (field_index += 1) {
43 if (field_index == 0) {
44 var i_buf = std.ArrayList(u8).init(allocator);
45 defer i_buf.deinit();
46
47 var i_fields = mem.split(u8, field, " ");
48 while (i_fields.next()) |s| {
49 const icp = try fmt.parseInt(u21, s, 16);
50 const len = try unicode.utf8Encode(icp, &cp_buf);
51 try i_buf.appendSlice(cp_buf[0..len]);
52 }
53
54 input = try i_buf.toOwnedSlice();
55 } else if (field_index == 1) {
56 //debug.print("\n*** {s} ***\n", .{line});
57 // NFC, time to test.
58 var w_buf = std.ArrayList(u8).init(allocator);
59 defer w_buf.deinit();
60
61 var w_fields = mem.split(u8, field, " ");
62 while (w_fields.next()) |s| {
63 const wcp = try fmt.parseInt(u21, s, 16);
64 const len = try unicode.utf8Encode(wcp, &cp_buf);
65 try w_buf.appendSlice(cp_buf[0..len]);
66 }
67
68 const want = w_buf.items;
69 var got = try n.nfc(allocator, input);
70 defer got.deinit();
71
72 try testing.expectEqualStrings(want, got.slice);
73 } else if (field_index == 2) {
74 // NFD, time to test.
75 var w_buf = std.ArrayList(u8).init(allocator);
76 defer w_buf.deinit();
77
78 var w_fields = mem.split(u8, field, " ");
79 while (w_fields.next()) |s| {
80 const wcp = try fmt.parseInt(u21, s, 16);
81 const len = try unicode.utf8Encode(wcp, &cp_buf);
82 try w_buf.appendSlice(cp_buf[0..len]);
83 }
84
85 const want = w_buf.items;
86 var got = try n.nfd(allocator, input);
87 defer got.deinit();
88
89 try testing.expectEqualStrings(want, got.slice);
90 } else if (field_index == 3) {
91 // NFKC, time to test.
92 var w_buf = std.ArrayList(u8).init(allocator);
93 defer w_buf.deinit();
94
95 var w_fields = mem.split(u8, field, " ");
96 while (w_fields.next()) |s| {
97 const wcp = try fmt.parseInt(u21, s, 16);
98 const len = try unicode.utf8Encode(wcp, &cp_buf);
99 try w_buf.appendSlice(cp_buf[0..len]);
100 }
101
102 const want = w_buf.items;
103 var got = try n.nfkc(allocator, input);
104 defer got.deinit();
105
106 try testing.expectEqualStrings(want, got.slice);
107 } else if (field_index == 4) {
108 // NFKD, time to test.
109 var w_buf = std.ArrayList(u8).init(allocator);
110 defer w_buf.deinit();
111
112 var w_fields = mem.split(u8, field, " ");
113 while (w_fields.next()) |s| {
114 const wcp = try fmt.parseInt(u21, s, 16);
115 const len = try unicode.utf8Encode(wcp, &cp_buf);
116 try w_buf.appendSlice(cp_buf[0..len]);
117 }
118
119 const want = w_buf.items;
120 const got = try n.nfkd(allocator, input);
121 defer got.deinit();
122
123 try testing.expectEqualStrings(want, got.slice);
124 } else {
125 continue;
126 }
127 }
128 }
129}
130
131test "Segmentation GraphemeIterator" {
132 const allocator = std.testing.allocator;
133 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
134 defer file.close();
135 var buf_reader = std.io.bufferedReader(file.reader());
136 var input_stream = buf_reader.reader();
137
138 const data = try GraphemeData.init(allocator);
139 defer data.deinit();
140
141 var buf: [4096]u8 = undefined;
142 var line_no: usize = 1;
143
144 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
145 // Skip comments or empty lines.
146 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
147
148 // Clean up.
149 var line = std.mem.trimLeft(u8, raw, "÷ ");
150 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
151 line = line[0..octo];
152 }
153 // Iterate over fields.
154 var want = std.ArrayList(Grapheme).init(allocator);
155 defer want.deinit();
156
157 var all_bytes = std.ArrayList(u8).init(allocator);
158 defer all_bytes.deinit();
159
160 var graphemes = std.mem.split(u8, line, " ÷ ");
161 var bytes_index: u32 = 0;
162
163 while (graphemes.next()) |field| {
164 var code_points = std.mem.split(u8, field, " ");
165 var cp_buf: [4]u8 = undefined;
166 var cp_index: u32 = 0;
167 var gc_len: u8 = 0;
168
169 while (code_points.next()) |code_point| {
170 if (std.mem.eql(u8, code_point, "×")) continue;
171 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
172 const len = try unicode.utf8Encode(cp, &cp_buf);
173 try all_bytes.appendSlice(cp_buf[0..len]);
174 cp_index += len;
175 gc_len += len;
176 }
177
178 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
179 bytes_index += cp_index;
180 }
181
182 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
183 var iter = GraphemeIterator.init(all_bytes.items, &data);
184
185 // Check.
186 for (want.items) |want_gc| {
187 const got_gc = (iter.next()).?;
188 try std.testing.expectEqualStrings(
189 want_gc.bytes(all_bytes.items),
190 got_gc.bytes(all_bytes.items),
191 );
192 }
193 }
194}