summaryrefslogtreecommitdiff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
author: Sam Atman, 2025-07-08 12:15:32 -0400
committer: Sam Atman, 2025-07-08 12:15:32 -0400
commit: 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree: 2607c185fd8053b84d60041fadc35c05a0225d34 /src/unicode_tests.zig
parentMerge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parentAdd Words.zig example to README (diff)
downloadzg-master.tar.gz
zg-master.tar.xz
zg-master.zip
Merge branch 'develop-next' (HEAD, v0.14.1, master)
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r--src/unicode_tests.zig398
1 files changed, 331 insertions, 67 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 2249007..ae177a9 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,43 +1,4 @@
1const std = @import("std"); 1const dbg_print = false;
2const fmt = std.fmt;
3const fs = std.fs;
4const io = std.io;
5const heap = std.heap;
6const mem = std.mem;
7const testing = std.testing;
8const unicode = std.unicode;
9
10const grapheme = @import("Graphemes");
11const Grapheme = @import("Graphemes").Grapheme;
12const Graphemes = @import("Graphemes");
13const GraphemeIterator = @import("Graphemes").Iterator;
14const Normalize = @import("Normalize");
15
16comptime {
17 testing.refAllDecls(grapheme);
18}
19
20test "Iterator.peek" {
21 const peek_seq = "aΔ👨🏻‍🌾→";
22 const data = try Graphemes.init(std.testing.allocator);
23 defer data.deinit(std.testing.allocator);
24
25 var iter = data.iterator(peek_seq);
26 const peek_a = iter.peek().?;
27 const next_a = iter.next().?;
28 try std.testing.expectEqual(peek_a, next_a);
29 try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq));
30 const peek_d1 = iter.peek().?;
31 const peek_d2 = iter.peek().?;
32 try std.testing.expectEqual(peek_d1, peek_d2);
33 const next_d = iter.next().?;
34 try std.testing.expectEqual(peek_d2, next_d);
35 try std.testing.expectEqual(iter.peek(), iter.next());
36 try std.testing.expectEqual(iter.peek(), iter.next());
37 try std.testing.expectEqual(null, iter.peek());
38 try std.testing.expectEqual(null, iter.peek());
39 try std.testing.expectEqual(iter.peek(), iter.next());
40}
41 2
42test "Unicode normalization tests" { 3test "Unicode normalization tests" {
43 var arena = heap.ArenaAllocator.init(testing.allocator); 4 var arena = heap.ArenaAllocator.init(testing.allocator);
@@ -50,16 +11,14 @@ test "Unicode normalization tests" {
50 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 11 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
51 defer file.close(); 12 defer file.close();
52 var buf_reader = io.bufferedReader(file.reader()); 13 var buf_reader = io.bufferedReader(file.reader());
53 const input_stream = buf_reader.reader(); 14 var input_stream = buf_reader.reader();
54 15
55 var line_no: usize = 0;
56 var buf: [4096]u8 = undefined; 16 var buf: [4096]u8 = undefined;
57 var cp_buf: [4]u8 = undefined; 17 var cp_buf: [4]u8 = undefined;
58 18
59 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { 19 var line_iter: IterRead = .{ .read = &input_stream };
60 line_no += 1; 20
61 // Skip comments or empty lines. 21 while (try line_iter.next(&buf)) |line| {
62 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
63 // Iterate over fields. 22 // Iterate over fields.
64 var fields = mem.splitScalar(u8, line, ';'); 23 var fields = mem.splitScalar(u8, line, ';');
65 var field_index: usize = 0; 24 var field_index: usize = 0;
@@ -80,7 +39,7 @@ test "Unicode normalization tests" {
80 39
81 input = try i_buf.toOwnedSlice(); 40 input = try i_buf.toOwnedSlice();
82 } else if (field_index == 1) { 41 } else if (field_index == 1) {
83 //debug.print("\n*** {s} ***\n", .{line}); 42 if (dbg_print) debug.print("\n*** {s} ***\n", .{line});
84 // NFC, time to test. 43 // NFC, time to test.
85 var w_buf = std.ArrayList(u8).init(allocator); 44 var w_buf = std.ArrayList(u8).init(allocator);
86 defer w_buf.deinit(); 45 defer w_buf.deinit();
@@ -162,20 +121,17 @@ test "Segmentation GraphemeIterator" {
162 var buf_reader = std.io.bufferedReader(file.reader()); 121 var buf_reader = std.io.bufferedReader(file.reader());
163 var input_stream = buf_reader.reader(); 122 var input_stream = buf_reader.reader();
164 123
165 const data = try Graphemes.init(allocator); 124 const graph = try Graphemes.init(allocator);
166 defer data.deinit(allocator); 125 defer graph.deinit(allocator);
167 126
168 var buf: [4096]u8 = undefined; 127 var buf: [4096]u8 = undefined;
169 var line_no: usize = 1; 128 var line_iter: IterRead = .{ .read = &input_stream };
170
171 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
172 // Skip comments or empty lines.
173 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
174 129
130 while (try line_iter.next(&buf)) |raw| {
175 // Clean up. 131 // Clean up.
176 var line = std.mem.trimLeft(u8, raw, "÷ "); 132 var line = std.mem.trimLeft(u8, raw, "÷ ");
177 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { 133 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
178 line = line[0..octo]; 134 line = line[0..final];
179 } 135 }
180 // Iterate over fields. 136 // Iterate over fields.
181 var want = std.ArrayList(Grapheme).init(allocator); 137 var want = std.ArrayList(Grapheme).init(allocator);
@@ -185,12 +141,12 @@ test "Segmentation GraphemeIterator" {
185 defer all_bytes.deinit(); 141 defer all_bytes.deinit();
186 142
187 var graphemes = std.mem.splitSequence(u8, line, " ÷ "); 143 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
188 var bytes_index: u32 = 0; 144 var bytes_index: uoffset = 0;
189 145
190 while (graphemes.next()) |field| { 146 while (graphemes.next()) |field| {
191 var code_points = std.mem.splitScalar(u8, field, ' '); 147 var code_points = std.mem.splitScalar(u8, field, ' ');
192 var cp_buf: [4]u8 = undefined; 148 var cp_buf: [4]u8 = undefined;
193 var cp_index: u32 = 0; 149 var cp_index: uoffset = 0;
194 var gc_len: u8 = 0; 150 var gc_len: u8 = 0;
195 151
196 while (code_points.next()) |code_point| { 152 while (code_points.next()) |code_point| {
@@ -206,16 +162,324 @@ test "Segmentation GraphemeIterator" {
206 bytes_index += cp_index; 162 bytes_index += cp_index;
207 } 163 }
208 164
209 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); 165 const this_str = all_bytes.items;
210 var iter = data.iterator(all_bytes.items); 166
167 {
168 var iter = graph.iterator(this_str);
169
170 // Check.
171 for (want.items, 1..) |want_gc, idx| {
172 const got_gc = (iter.next()).?;
173 try std.testing.expectEqualStrings(
174 want_gc.bytes(this_str),
175 got_gc.bytes(this_str),
176 );
177 for (got_gc.offset..got_gc.offset + got_gc.len) |i| {
178 const this_gc = graph.graphemeAtIndex(this_str, i);
179 std.testing.expectEqualSlices(
180 u8,
181 got_gc.bytes(this_str),
182 this_gc.bytes(this_str),
183 ) catch |err| {
184 debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
185 return err;
186 };
187 }
188 var after_iter = graph.iterateAfterGrapheme(this_str, got_gc);
189 if (after_iter.next()) |next_gc| {
190 if (iter.peek()) |next_peek| {
191 std.testing.expectEqualSlices(
192 u8,
193 next_gc.bytes(this_str),
194 next_peek.bytes(this_str),
195 ) catch |err| {
196 debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx });
197 return err;
198 };
199 } else {
200 debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx });
201 try testing.expect(false);
202 }
203 } else {
204 try testing.expectEqual(null, iter.peek());
205 }
206 }
207 }
208 {
209 var iter = graph.reverseIterator(this_str);
210
211 // Check.
212 var i: usize = want.items.len;
213 while (i > 0) {
214 i -= 1;
215 const want_gc = want.items[i];
216 const got_gc = iter.prev() orelse {
217 std.debug.print(
218 "line {d} grapheme {d}: expected {any} found null\n",
219 .{ line_iter.line, i, want_gc },
220 );
221 return error.TestExpectedEqual;
222 };
223 std.testing.expectEqualStrings(
224 want_gc.bytes(this_str),
225 got_gc.bytes(this_str),
226 ) catch |err| {
227 std.debug.print(
228 "line {d} grapheme {d}: expected {any} found {any}\n",
229 .{ line_iter.line, i, want_gc, got_gc },
230 );
231 return err;
232 };
233 var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc);
234 if (before_iter.prev()) |prev_gc| {
235 if (iter.peek()) |prev_peek| {
236 std.testing.expectEqualSlices(
237 u8,
238 prev_gc.bytes(this_str),
239 prev_peek.bytes(this_str),
240 ) catch |err| {
241 debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i });
242 return err;
243 };
244 } else {
245 debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i });
246 try testing.expect(false);
247 }
248 } else {
249 try testing.expectEqual(null, iter.peek());
250 }
251 }
252 }
253 }
254}
255
256test "Segmentation Word Iterator" {
257 const allocator = std.testing.allocator;
258 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});
259 defer file.close();
260 var buf_reader = std.io.bufferedReader(file.reader());
261 var input_stream = buf_reader.reader();
262
263 const wb = try Words.init(allocator);
264 defer wb.deinit(allocator);
265
266 var buf: [4096]u8 = undefined;
267 var line_iter: IterRead = .{ .read = &input_stream };
268
269 while (try line_iter.next(&buf)) |raw| {
270 // Clean up.
271 var line = std.mem.trimLeft(u8, raw, "÷ ");
272 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
273 line = line[0..final];
274 }
275 // Iterate over fields.
276 var want = std.ArrayList(Word).init(allocator);
277 defer want.deinit();
278
279 var all_bytes = std.ArrayList(u8).init(allocator);
280 defer all_bytes.deinit();
281
282 var words = std.mem.splitSequence(u8, line, " ÷ ");
283 var bytes_index: uoffset = 0;
284
285 while (words.next()) |field| {
286 var code_points = std.mem.splitScalar(u8, field, ' ');
287 var cp_buf: [4]u8 = undefined;
288 var cp_index: uoffset = 0;
289 var gc_len: u8 = 0;
211 290
212 // Check. 291 while (code_points.next()) |code_point| {
213 for (want.items) |want_gc| { 292 if (std.mem.eql(u8, code_point, "×")) continue;
214 const got_gc = (iter.next()).?; 293 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
215 try std.testing.expectEqualStrings( 294 const len = try unicode.utf8Encode(cp, &cp_buf);
216 want_gc.bytes(all_bytes.items), 295 try all_bytes.appendSlice(cp_buf[0..len]);
217 got_gc.bytes(all_bytes.items), 296 cp_index += len;
218 ); 297 gc_len += len;
298 }
299
300 try want.append(Word{ .len = gc_len, .offset = bytes_index });
301 bytes_index += cp_index;
302 }
303 const this_str = all_bytes.items;
304
305 {
306 var iter = wb.iterator(this_str);
307 var peeked: ?Word = iter.peek();
308
309 // Check.
310 for (want.items, 1..) |want_word, idx| {
311 const got_word = (iter.next()).?;
312 std.testing.expectEqualStrings(
313 want_word.bytes(this_str),
314 got_word.bytes(this_str),
315 ) catch |err| {
316 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx });
317 return err;
318 };
319 std.testing.expectEqualStrings(
320 peeked.?.bytes(this_str),
321 got_word.bytes(this_str),
322 ) catch |err| {
323 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx });
324 return err;
325 };
326 var r_iter = iter.reverseIterator();
327 const if_r_word = r_iter.prev();
328 if (if_r_word) |r_word| {
329 std.testing.expectEqualStrings(
330 want_word.bytes(this_str),
331 r_word.bytes(this_str),
332 ) catch |err| {
333 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
334 return err;
335 };
336 } else {
337 try testing.expect(false);
338 }
339 var peek_iter = wb.iterateAfterWord(this_str, got_word);
340 const peek_1 = peek_iter.next();
341 if (peek_1) |p1| {
342 const peek_2 = iter.peek();
343 if (peek_2) |p2| {
344 std.testing.expectEqualSlices(
345 u8,
346 p1.bytes(this_str),
347 p2.bytes(this_str),
348 ) catch |err| {
349 debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
350 return err;
351 };
352 } else {
353 try testing.expect(false);
354 }
355 } else {
356 try testing.expectEqual(null, iter.peek());
357 }
358 for (got_word.offset..got_word.offset + got_word.len) |i| {
359 const this_word = wb.wordAtIndex(this_str, i);
360 std.testing.expectEqualSlices(
361 u8,
362 got_word.bytes(this_str),
363 this_word.bytes(this_str),
364 ) catch |err| {
365 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i });
366 return err;
367 };
368 }
369 peeked = iter.peek();
370 }
371 }
372 {
373 var r_iter = wb.reverseIterator(this_str);
374 var peeked: ?Word = r_iter.peek();
375 var idx = want.items.len - 1;
376
377 while (true) : (idx -= 1) {
378 const want_word = want.items[idx];
379 const got_word = r_iter.prev().?;
380 std.testing.expectEqualSlices(
381 u8,
382 want_word.bytes(this_str),
383 got_word.bytes(this_str),
384 ) catch |err| {
385 debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 });
386 return err;
387 };
388 std.testing.expectEqualStrings(
389 peeked.?.bytes(this_str),
390 got_word.bytes(this_str),
391 ) catch |err| {
392 debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 });
393 return err;
394 };
395 var f_iter = r_iter.forwardIterator();
396 const if_f_word = f_iter.next();
397 if (if_f_word) |f_word| {
398 std.testing.expectEqualStrings(
399 want_word.bytes(this_str),
400 f_word.bytes(this_str),
401 ) catch |err| {
402 debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx });
403 return err;
404 };
405 } else {
406 try testing.expect(false);
407 }
408 var peek_iter = wb.iterateBeforeWord(this_str, got_word);
409 const peek_1 = peek_iter.prev();
410 if (peek_1) |p1| {
411 const peek_2 = r_iter.peek();
412 if (peek_2) |p2| {
413 std.testing.expectEqualSlices(
414 u8,
415 p1.bytes(this_str),
416 p2.bytes(this_str),
417 ) catch |err| {
418 debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx });
419 return err;
420 };
421 } else {
422 try testing.expect(false);
423 }
424 } else {
425 try testing.expectEqual(null, r_iter.peek());
426 }
427 for (got_word.offset..got_word.offset + got_word.len) |i| {
428 const this_word = wb.wordAtIndex(this_str, i);
429 std.testing.expectEqualSlices(
430 u8,
431 got_word.bytes(this_str),
432 this_word.bytes(this_str),
433 ) catch |err| {
434 debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i });
435 return err;
436 };
437 }
438 peeked = r_iter.peek();
439 if (idx == 0) break;
440 }
219 } 441 }
220 } 442 }
221} 443}
444
/// Cursor over a buffered file reader that hands back, one record at a time,
/// the text that precedes the `#` comment marker of each record.
const IterRead = struct {
    /// Reader the records are pulled from.
    read: *Reader,
    /// Number of read attempts made so far. Every attempt counts: records
    /// that are skipped, the final end-of-stream attempt, and attempts that
    /// fail with an error.
    line: usize = 0,

    /// Returns the next record that is neither empty nor starts with `@`,
    /// or `null` once the reader is exhausted.
    ///
    /// A record is everything up to (not including) the next `#`; whatever
    /// follows the `#` up to the end of that line is discarded. The returned
    /// slice is produced by reading into `buf`, so it is only meaningful
    /// until `buf` is reused. Errors from the underlying reader calls are
    /// propagated to the caller.
    pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 {
        while (true) {
            // Runs at the end of every pass, on return and on error alike.
            defer iter.line += 1;

            const text = (try iter.read.readUntilDelimiterOrEof(buf, '#')) orelse return null;
            // Drop the trailing comment so the next read starts on a new line.
            try iter.read.skipUntilDelimiterOrEof('\n');

            const is_skippable = text.len == 0 or text[0] == '@';
            if (!is_skippable) return text;
            // Comment-only or `@` directive record: try the following one.
        }
    }
};
465
466const std = @import("std");
467const fmt = std.fmt;
468const fs = std.fs;
469const io = std.io;
470const Reader = io.BufferedReader(4096, fs.File.Reader).Reader;
471const heap = std.heap;
472const mem = std.mem;
473const debug = std.debug;
474const testing = std.testing;
475const unicode = std.unicode;
476
477const uoffset = @FieldType(Word, "offset");
478
479const Grapheme = @import("Graphemes").Grapheme;
480const Graphemes = @import("Graphemes");
481const GraphemeIterator = @import("Graphemes").Iterator;
482const Normalize = @import("Normalize");
483
484const Words = @import("Words");
485const Word = Words.Word;