summaryrefslogtreecommitdiff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-28 22:19:50 -0400
committerGravatar Jose Colon Rodriguez2024-03-28 22:19:50 -0400
commita2c4b7a57fe6b64bdd7c71305d408e5030af3157 (patch)
treec7af1ed4381ab0eeea52e2a9081cb19469b8c0e6 /src/unicode_tests.zig
parentMerged NumericData into PropsData (diff)
downloadzg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.gz
zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.xz
zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.zip
Split out Unicode tests to separate file
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r--src/unicode_tests.zig194
1 files changed, 194 insertions, 0 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
new file mode 100644
index 0000000..5442f63
--- /dev/null
+++ b/src/unicode_tests.zig
@@ -0,0 +1,194 @@
1const std = @import("std");
2const fmt = std.fmt;
3const fs = std.fs;
4const io = std.io;
5const heap = std.heap;
6const mem = std.mem;
7const testing = std.testing;
8const unicode = std.unicode;
9
10const Grapheme = @import("grapheme").Grapheme;
11const GraphemeData = @import("grapheme").GraphemeData;
12const GraphemeIterator = @import("grapheme").Iterator;
13const Normalize = @import("Normalize");
14
/// Decodes one data-file field, a list of hexadecimal code points separated
/// by single spaces, into the UTF-8 encoding of those code points.
///
/// Propagates any error from `fmt.parseInt` (token is not a valid `u21` hex
/// number), `unicode.utf8Encode`, or the allocator.
/// Caller owns the returned slice and must free it with `allocator`.
fn decodeCodePointField(allocator: mem.Allocator, field: []const u8) ![]u8 {
    var bytes = std.ArrayList(u8).init(allocator);
    // After a successful `toOwnedSlice` the list is empty, so this only
    // releases memory on the error paths.
    defer bytes.deinit();

    var cp_buf: [4]u8 = undefined;
    var tokens = mem.splitScalar(u8, field, ' ');
    while (tokens.next()) |token| {
        const cp = try fmt.parseInt(u21, token, 16);
        const len = try unicode.utf8Encode(cp, &cp_buf);
        try bytes.appendSlice(cp_buf[0..len]);
    }

    return bytes.toOwnedSlice();
}

// Runs every record of data/unicode/NormalizationTest.txt through the four
// normalization forms. Each record is a ';'-separated line whose first column
// is the source string and whose next four columns are the expected NFC, NFD,
// NFKC and NFKD results; anything after the fifth column is ignored.
test "Unicode normalization tests" {
    // Everything allocated below is released in one go by the arena.
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const data = try Normalize.NormData.init(allocator);
    const n = Normalize{ .norm_data = &data };

    const file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| {
        line_no += 1;

        // Tolerate CRLF line endings in the data file.
        const line = mem.trimRight(u8, raw, "\r");

        // Skip comments, part headers ('@') and empty lines.
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // On any failure below, report which record was being processed.
        errdefer std.debug.print("NormalizationTest.txt:{d}: {s}\n", .{ line_no, line });

        var fields = mem.splitScalar(u8, line, ';');

        // Column 0: the source string. It is fully built before its release
        // is scheduled, so a parse error can never free an undefined slice.
        const source_field = fields.next() orelse continue;
        const input = try decodeCodePointField(allocator, source_field);
        defer allocator.free(input);

        // Columns 1..4: expected NFC, NFD, NFKC, NFKD.
        var field_index: usize = 1;
        while (fields.next()) |field| : (field_index += 1) {
            // Remaining columns hold the trailing comment.
            if (field_index > 4) break;

            const want = try decodeCodePointField(allocator, field);
            defer allocator.free(want);

            switch (field_index) {
                1 => {
                    const got = try n.nfc(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                2 => {
                    const got = try n.nfd(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                3 => {
                    const got = try n.nfkc(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                4 => {
                    const got = try n.nfkd(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                // field_index starts at 1 and the guard above stops at 4.
                else => unreachable,
            }
        }
    }
}
130
// Checks GraphemeIterator against data/unicode/auxiliary/GraphemeBreakTest.txt.
// Each record lists hexadecimal code points separated by "÷" (break) or
// "×" (no break) markers, followed by a tab and a '#' comment. The expected
// grapheme clusters are the runs of code points between "÷" markers.
test "Segmentation GraphemeIterator" {
    const allocator = testing.allocator;

    const file = try fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    const data = try GraphemeData.init(allocator);
    defer data.deinit();

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Tolerate CRLF line endings in the data file.
        const record = mem.trimRight(u8, raw, "\r");

        // Skip comments or empty lines.
        if (record.len == 0 or record[0] == '#' or record[0] == '@') continue;

        // On any failure below, report which record was being processed.
        errdefer std.debug.print("GraphemeBreakTest.txt:{d}: {s}\n", .{ line_no, record });

        // Drop the leading break marker and the trailing " ÷\t# comment" part.
        var line = mem.trimLeft(u8, record, "÷ ");
        if (mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }

        // Expected clusters, as (length, offset) views into `all_bytes`.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of every code point in the record, in order.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        var graphemes = mem.splitSequence(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (graphemes.next()) |field| {
            var code_points = mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
            var gc_len: u8 = 0;

            while (code_points.next()) |code_point| {
                // "×" marks "no break here"; it carries no code point.
                if (mem.eql(u8, code_point, "×")) continue;
                const cp = try fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                gc_len += len;
            }

            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
            bytes_index += gc_len;
        }

        var iter = GraphemeIterator.init(all_bytes.items, &data);

        // Check: every expected cluster must be produced, in order.
        for (want.items) |want_gc| {
            // Fail the test (instead of panicking) if the iterator ends early.
            const got_gc = iter.next() orelse return error.TooFewGraphemes;
            try testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            );
        }

        // ...and nothing may be left over once all expected clusters are seen.
        try testing.expect(iter.next() == null);
    }
}