summaryrefslogtreecommitdiff
path: root/src/unicode_tests.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-11 17:26:50 -0400
committerGravatar Sam Atman2025-05-15 15:31:15 -0400
commit470e896483300d099c7650f9cd8a13e236c63864 (patch)
tree84b833525430b5603698b3096121a188b3bfe409 /src/unicode_tests.zig
parentAdd WordBreakPropertyData (diff)
downloadzg-470e896483300d099c7650f9cd8a13e236c63864.tar.gz
zg-470e896483300d099c7650f9cd8a13e236c63864.tar.xz
zg-470e896483300d099c7650f9cd8a13e236c63864.zip
Refactor in unicode_tests
The comments in the WordBreak and SentenceBreak tests get really long, so the provided buffer would be inadequate. This change therefore provides a sub-iterator which strips comments and comment lines, while keeping an eye on line numbers for any debugging.
Diffstat (limited to 'src/unicode_tests.zig')
-rw-r--r--src/unicode_tests.zig77
1 files changed, 49 insertions, 28 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 2249007..ee259a3 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -1,17 +1,4 @@
1const std = @import("std"); 1const dbg_print = false;
2const fmt = std.fmt;
3const fs = std.fs;
4const io = std.io;
5const heap = std.heap;
6const mem = std.mem;
7const testing = std.testing;
8const unicode = std.unicode;
9
10const grapheme = @import("Graphemes");
11const Grapheme = @import("Graphemes").Grapheme;
12const Graphemes = @import("Graphemes");
13const GraphemeIterator = @import("Graphemes").Iterator;
14const Normalize = @import("Normalize");
15 2
16comptime { 3comptime {
17 testing.refAllDecls(grapheme); 4 testing.refAllDecls(grapheme);
@@ -50,16 +37,14 @@ test "Unicode normalization tests" {
50 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 37 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
51 defer file.close(); 38 defer file.close();
52 var buf_reader = io.bufferedReader(file.reader()); 39 var buf_reader = io.bufferedReader(file.reader());
53 const input_stream = buf_reader.reader(); 40 var input_stream = buf_reader.reader();
54 41
55 var line_no: usize = 0;
56 var buf: [4096]u8 = undefined; 42 var buf: [4096]u8 = undefined;
57 var cp_buf: [4]u8 = undefined; 43 var cp_buf: [4]u8 = undefined;
58 44
59 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { 45 var line_iter: IterRead = .{ .read = &input_stream };
60 line_no += 1; 46
61 // Skip comments or empty lines. 47 while (try line_iter.next(&buf)) |line| {
62 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
63 // Iterate over fields. 48 // Iterate over fields.
64 var fields = mem.splitScalar(u8, line, ';'); 49 var fields = mem.splitScalar(u8, line, ';');
65 var field_index: usize = 0; 50 var field_index: usize = 0;
@@ -80,7 +65,7 @@ test "Unicode normalization tests" {
80 65
81 input = try i_buf.toOwnedSlice(); 66 input = try i_buf.toOwnedSlice();
82 } else if (field_index == 1) { 67 } else if (field_index == 1) {
83 //debug.print("\n*** {s} ***\n", .{line}); 68 if (dbg_print) debug.print("\n*** {s} ***\n", .{line});
84 // NFC, time to test. 69 // NFC, time to test.
85 var w_buf = std.ArrayList(u8).init(allocator); 70 var w_buf = std.ArrayList(u8).init(allocator);
86 defer w_buf.deinit(); 71 defer w_buf.deinit();
@@ -166,16 +151,15 @@ test "Segmentation GraphemeIterator" {
166 defer data.deinit(allocator); 151 defer data.deinit(allocator);
167 152
168 var buf: [4096]u8 = undefined; 153 var buf: [4096]u8 = undefined;
169 var line_no: usize = 1; 154 var line_iter: IterRead = .{ .read = &input_stream };
170 155
171 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { 156 while (try line_iter.next(&buf)) |raw| {
172 // Skip comments or empty lines. 157 // Skip comments or empty lines.
173 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; 158 // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
174
175 // Clean up. 159 // Clean up.
176 var line = std.mem.trimLeft(u8, raw, "÷ "); 160 var line = std.mem.trimLeft(u8, raw, "÷ ");
177 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { 161 if (std.mem.indexOf(u8, line, " ÷\t")) |final| {
178 line = line[0..octo]; 162 line = line[0..final];
179 } 163 }
180 // Iterate over fields. 164 // Iterate over fields.
181 var want = std.ArrayList(Grapheme).init(allocator); 165 var want = std.ArrayList(Grapheme).init(allocator);
@@ -206,7 +190,6 @@ test "Segmentation GraphemeIterator" {
206 bytes_index += cp_index; 190 bytes_index += cp_index;
207 } 191 }
208 192
209 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
210 var iter = data.iterator(all_bytes.items); 193 var iter = data.iterator(all_bytes.items);
211 194
212 // Check. 195 // Check.
@@ -219,3 +202,41 @@ test "Segmentation GraphemeIterator" {
219 } 202 }
220 } 203 }
221} 204}
205
/// Line iterator over a `#`-commented test data file.
///
/// Each call to `next` yields the data portion of one line: everything that
/// precedes the first `#` on that line (or the whole line when it carries no
/// comment). The comment text itself is discarded without being buffered, so
/// `buf` only has to be large enough for the data portion. Lines whose data
/// portion is empty, or starts with `@`, are skipped.
const IterRead = struct {
    /// Byte source the lines are pulled from.
    read: *Reader,
    /// Number of physical lines consumed so far, including skipped ones.
    /// After `next` returns a line, this is that line's 1-based number.
    line: usize = 0,

    /// Returns the next data line, or `null` once the input is exhausted.
    ///
    /// The returned slice points into `buf` and is only valid until `buf` is
    /// reused. Any whitespace between the data and the `#` is kept as-is.
    ///
    /// Fails with `error.StreamTooLong` when the data portion of a line does
    /// not fit in `buf`; errors from the underlying reader are passed through.
    pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 {
        // Loop rather than recurse, so that a long run of comment-only lines
        // cannot grow the stack.
        while (true) {
            var len: usize = 0;
            var saw_comment = false;
            var saw_newline = false;

            // Copy bytes until the comment marker, the end of the line, or
            // the end of the input. Stopping at '\n' as well as '#' keeps a
            // line that has no comment from swallowing the line after it.
            while (true) {
                const byte = iter.read.readByte() catch |err| switch (err) {
                    error.EndOfStream => break,
                    else => |e| return e,
                };
                if (byte == '#') {
                    saw_comment = true;
                    break;
                }
                if (byte == '\n') {
                    saw_newline = true;
                    break;
                }
                if (len == buf.len) return error.StreamTooLong;
                buf[len] = byte;
                len += 1;
            }

            // End of input with nothing pending: we are done.
            if (!saw_comment and !saw_newline and len == 0) return null;

            // Throw away the remainder of the comment, newline included.
            if (saw_comment) try iter.read.skipUntilDelimiterOrEof('\n');

            // Exactly one physical line has been consumed at this point.
            iter.line += 1;

            const data = buf[0..len];
            // Blank / comment-only lines and '@' directive lines carry no data.
            if (data.len == 0 or data[0] == '@') continue;
            return data;
        }
    }
};
226
227const std = @import("std");
228const fmt = std.fmt;
229const fs = std.fs;
230const io = std.io;
231const Reader = io.BufferedReader(4096, fs.File.Reader).Reader;
232const heap = std.heap;
233const mem = std.mem;
234const debug = std.debug;
235const testing = std.testing;
236const unicode = std.unicode;
237
238const grapheme = @import("Graphemes");
239const Grapheme = @import("Graphemes").Grapheme;
240const Graphemes = @import("Graphemes");
241const GraphemeIterator = @import("Graphemes").Iterator;
242const Normalize = @import("Normalize");