summaryrefslogtreecommitdiff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r--src/unicode_tests.zig102
1 files changed, 70 insertions, 32 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index ee259a3..7ce2b4e 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,31 +1,5 @@
1const dbg_print = false; 1const dbg_print = false;
2 2
// Reference every declaration of `grapheme` (the "Graphemes" import) at compile
// time so those declarations are semantically analyzed when this file is built.
3comptime {
4 testing.refAllDecls(grapheme);
5}
6
// Verifies that `peek` reports the same item the following `next` returns,
// that repeated peeks do not advance the iterator, and that both keep
// returning null once the input is exhausted.
test "Iterator.peek" {
    const t = std.testing;
    const gpa = t.allocator;
    const text = "aΔ👨🏻‍🌾→";

    const graphemes = try Graphemes.init(gpa);
    defer graphemes.deinit(gpa);

    var it = graphemes.iterator(text);

    // First item: peek and next must agree, and it must span exactly "a".
    const first_peeked = it.peek().?;
    const first_taken = it.next().?;
    try t.expectEqual(first_peeked, first_taken);
    try t.expectEqualStrings("a", first_peeked.bytes(text));

    // Second item: peeking twice yields the same value, which next then returns.
    const second_peek_a = it.peek().?;
    const second_peek_b = it.peek().?;
    try t.expectEqual(second_peek_a, second_peek_b);
    const second_taken = it.next().?;
    try t.expectEqual(second_peek_b, second_taken);

    // Next two positions: peek is always evaluated before next, as in a
    // left-to-right argument list.
    for (0..2) |_| {
        const peeked = it.peek();
        const taken = it.next();
        try t.expectEqual(peeked, taken);
    }

    // Past the end: peek stays null no matter how often it is called...
    for (0..2) |_| {
        try t.expectEqual(null, it.peek());
    }

    // ...and next agrees with it.
    const end_peeked = it.peek();
    const end_taken = it.next();
    try t.expectEqual(end_peeked, end_taken);
}
28
29test "Unicode normalization tests" { 3test "Unicode normalization tests" {
30 var arena = heap.ArenaAllocator.init(testing.allocator); 4 var arena = heap.ArenaAllocator.init(testing.allocator);
31 defer arena.deinit(); 5 defer arena.deinit();
@@ -147,15 +121,13 @@ test "Segmentation GraphemeIterator" {
147 var buf_reader = std.io.bufferedReader(file.reader()); 121 var buf_reader = std.io.bufferedReader(file.reader());
148 var input_stream = buf_reader.reader(); 122 var input_stream = buf_reader.reader();
149 123
150 const data = try Graphemes.init(allocator); 124 const graph = try Graphemes.init(allocator);
151 defer data.deinit(allocator); 125 defer graph.deinit(allocator);
152 126
153 var buf: [4096]u8 = undefined; 127 var buf: [4096]u8 = undefined;
154 var line_iter: IterRead = .{ .read = &input_stream }; 128 var line_iter: IterRead = .{ .read = &input_stream };
155 129
156 while (try line_iter.next(&buf)) |raw| { 130 while (try line_iter.next(&buf)) |raw| {
157 // Skip comments or empty lines.
158 // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
159 // Clean up. 131 // Clean up.
160 var line = std.mem.trimLeft(u8, raw, "÷ "); 132 var line = std.mem.trimLeft(u8, raw, "÷ ");
161 if (std.mem.indexOf(u8, line, " ÷\t")) |final| { 133 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
@@ -190,7 +162,7 @@ test "Segmentation GraphemeIterator" {
190 bytes_index += cp_index; 162 bytes_index += cp_index;
191 } 163 }
192 164
193 var iter = data.iterator(all_bytes.items); 165 var iter = graph.iterator(all_bytes.items);
194 166
195 // Check. 167 // Check.
196 for (want.items) |want_gc| { 168 for (want.items) |want_gc| {
@@ -203,6 +175,71 @@ test "Segmentation GraphemeIterator" {
203 } 175 }
204} 176}
205 177
// Conformance test for the word iterator, driven by the Unicode
// `WordBreakTest.txt` data file.
//
// Each line read through `IterRead` lists hexadecimal code points separated by
// "÷" (a boundary) or "×" (no boundary). The line is re-encoded as UTF-8, the
// expected word spans are recorded, and the iterator output is compared span
// by span. A failure reports the test-file line and the 1-based word index.
test "Segmentation Word Iterator" {
    const allocator = std.testing.allocator;

    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();

    const wb = try WordBreak.init(allocator);
    defer wb.deinit(allocator);

    var buf: [4096]u8 = undefined;
    var line_iter: IterRead = .{ .read = &input_stream };

    while (try line_iter.next(&buf)) |raw| {
        // Strip the leading "÷ " and everything from the final " ÷\t" onward.
        // `trimLeft` treats its argument as a byte set, which is sufficient
        // here because the remaining text starts with an ASCII hex digit.
        const trimmed = std.mem.trimLeft(u8, raw, "÷ ");
        const line = if (std.mem.indexOf(u8, trimmed, " ÷\t")) |final|
            trimmed[0..final]
        else
            trimmed;

        // Expected word spans (byte offset + byte length) into `all_bytes`.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // UTF-8 encoding of every code point on the line, in order.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        var words = std.mem.splitSequence(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (words.next()) |field| {
            var code_points = std.mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
            var word_len: u8 = 0;

            while (code_points.next()) |code_point| {
                // "×" only marks the absence of a break inside the word.
                if (std.mem.eql(u8, code_point, "×")) continue;
                const cp = try std.fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                word_len += len;
            }

            try want.append(Grapheme{ .len = word_len, .offset = bytes_index });
            bytes_index += word_len;
        }

        var iter = wb.iterator(all_bytes.items);

        // Check every expected word against what the iterator produces.
        for (want.items, 1..) |want_word, i| {
            // Report a short iterator as a located test failure, not a bare panic.
            const got_word = iter.next() orelse {
                debug.print(
                    "Error on line {d}, #{d}: iterator ended before all expected words were produced\n",
                    .{ line_iter.line, i },
                );
                return error.TestUnexpectedResult;
            };
            std.testing.expectEqualSlices(
                u8,
                want_word.bytes(all_bytes.items),
                got_word.bytes(all_bytes.items),
            ) catch |err| {
                debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, i });
                return err;
            };
        }

        // The data file lists every boundary, so any further output is a bug.
        if (iter.next() != null) {
            debug.print(
                "Error on line {d}: iterator produced more than the expected {d} words\n",
                .{ line_iter.line, want.items.len },
            );
            return error.TestUnexpectedResult;
        }
    }
}
242
206const IterRead = struct { 243const IterRead = struct {
207 read: *Reader, 244 read: *Reader,
208 line: usize = 0, 245 line: usize = 0,
@@ -235,8 +272,9 @@ const debug = std.debug;
235const testing = std.testing; 272const testing = std.testing;
236const unicode = std.unicode; 273const unicode = std.unicode;
237 274
238const grapheme = @import("Graphemes");
239const Grapheme = @import("Graphemes").Grapheme; 275const Grapheme = @import("Graphemes").Grapheme;
240const Graphemes = @import("Graphemes"); 276const Graphemes = @import("Graphemes");
241const GraphemeIterator = @import("Graphemes").Iterator; 277const GraphemeIterator = @import("Graphemes").Iterator;
242const Normalize = @import("Normalize"); 278const Normalize = @import("Normalize");
279
280const WordBreak = @import("WordBreak");