diff options
| author | 2025-05-11 17:26:50 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:15 -0400 | |
| commit | 470e896483300d099c7650f9cd8a13e236c63864 (patch) | |
| tree | 84b833525430b5603698b3096121a188b3bfe409 /src/unicode_tests.zig | |
| parent | Add WordBreakPropertyData (diff) | |
| download | zg-470e896483300d099c7650f9cd8a13e236c63864.tar.gz zg-470e896483300d099c7650f9cd8a13e236c63864.tar.xz zg-470e896483300d099c7650f9cd8a13e236c63864.zip | |
Refactor in unicode_tests
The comments in the WordBreak and SentenceBreak tests get really long, so
the provided buffer would be inadequate. This change adds a sub-iterator
which strips comments and comment lines, while keeping track of line
numbers for debugging.
Diffstat (limited to 'src/unicode_tests.zig')
| -rw-r--r-- | src/unicode_tests.zig | 77 |
1 files changed, 49 insertions, 28 deletions
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..ee259a3 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -1,17 +1,4 @@ | |||
| 1 | const std = @import("std"); | 1 | const dbg_print = false; |
| 2 | const fmt = std.fmt; | ||
| 3 | const fs = std.fs; | ||
| 4 | const io = std.io; | ||
| 5 | const heap = std.heap; | ||
| 6 | const mem = std.mem; | ||
| 7 | const testing = std.testing; | ||
| 8 | const unicode = std.unicode; | ||
| 9 | |||
| 10 | const grapheme = @import("Graphemes"); | ||
| 11 | const Grapheme = @import("Graphemes").Grapheme; | ||
| 12 | const Graphemes = @import("Graphemes"); | ||
| 13 | const GraphemeIterator = @import("Graphemes").Iterator; | ||
| 14 | const Normalize = @import("Normalize"); | ||
| 15 | 2 | ||
| 16 | comptime { | 3 | comptime { |
| 17 | testing.refAllDecls(grapheme); | 4 | testing.refAllDecls(grapheme); |
| @@ -50,16 +37,14 @@ test "Unicode normalization tests" { | |||
| 50 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 37 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
| 51 | defer file.close(); | 38 | defer file.close(); |
| 52 | var buf_reader = io.bufferedReader(file.reader()); | 39 | var buf_reader = io.bufferedReader(file.reader()); |
| 53 | const input_stream = buf_reader.reader(); | 40 | var input_stream = buf_reader.reader(); |
| 54 | 41 | ||
| 55 | var line_no: usize = 0; | ||
| 56 | var buf: [4096]u8 = undefined; | 42 | var buf: [4096]u8 = undefined; |
| 57 | var cp_buf: [4]u8 = undefined; | 43 | var cp_buf: [4]u8 = undefined; |
| 58 | 44 | ||
| 59 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { | 45 | var line_iter: IterRead = .{ .read = &input_stream }; |
| 60 | line_no += 1; | 46 | |
| 61 | // Skip comments or empty lines. | 47 | while (try line_iter.next(&buf)) |line| { |
| 62 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | ||
| 63 | // Iterate over fields. | 48 | // Iterate over fields. |
| 64 | var fields = mem.splitScalar(u8, line, ';'); | 49 | var fields = mem.splitScalar(u8, line, ';'); |
| 65 | var field_index: usize = 0; | 50 | var field_index: usize = 0; |
| @@ -80,7 +65,7 @@ test "Unicode normalization tests" { | |||
| 80 | 65 | ||
| 81 | input = try i_buf.toOwnedSlice(); | 66 | input = try i_buf.toOwnedSlice(); |
| 82 | } else if (field_index == 1) { | 67 | } else if (field_index == 1) { |
| 83 | //debug.print("\n*** {s} ***\n", .{line}); | 68 | if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); |
| 84 | // NFC, time to test. | 69 | // NFC, time to test. |
| 85 | var w_buf = std.ArrayList(u8).init(allocator); | 70 | var w_buf = std.ArrayList(u8).init(allocator); |
| 86 | defer w_buf.deinit(); | 71 | defer w_buf.deinit(); |
| @@ -166,16 +151,15 @@ test "Segmentation GraphemeIterator" { | |||
| 166 | defer data.deinit(allocator); | 151 | defer data.deinit(allocator); |
| 167 | 152 | ||
| 168 | var buf: [4096]u8 = undefined; | 153 | var buf: [4096]u8 = undefined; |
| 169 | var line_no: usize = 1; | 154 | var line_iter: IterRead = .{ .read = &input_stream }; |
| 170 | 155 | ||
| 171 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | 156 | while (try line_iter.next(&buf)) |raw| { |
| 172 | // Skip comments or empty lines. | 157 | // Skip comments or empty lines. |
| 173 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | 158 | // if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; |
| 174 | |||
| 175 | // Clean up. | 159 | // Clean up. |
| 176 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 160 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 177 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | 161 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| 178 | line = line[0..octo]; | 162 | line = line[0..final]; |
| 179 | } | 163 | } |
| 180 | // Iterate over fields. | 164 | // Iterate over fields. |
| 181 | var want = std.ArrayList(Grapheme).init(allocator); | 165 | var want = std.ArrayList(Grapheme).init(allocator); |
| @@ -206,7 +190,6 @@ test "Segmentation GraphemeIterator" { | |||
| 206 | bytes_index += cp_index; | 190 | bytes_index += cp_index; |
| 207 | } | 191 | } |
| 208 | 192 | ||
| 209 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | ||
| 210 | var iter = data.iterator(all_bytes.items); | 193 | var iter = data.iterator(all_bytes.items); |
| 211 | 194 | ||
| 212 | // Check. | 195 | // Check. |
| @@ -219,3 +202,41 @@ test "Segmentation GraphemeIterator" { | |||
| 219 | } | 202 | } |
| 220 | } | 203 | } |
| 221 | } | 204 | } |
| 205 | |||
| 206 | const IterRead = struct { | ||
| 207 | read: *Reader, | ||
| 208 | line: usize = 0, | ||
| 209 | |||
| 210 | pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { | ||
| 211 | defer iter.line += 1; | ||
| 212 | const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); | ||
| 213 | if (maybe_line) |this_line| { | ||
| 214 | try iter.read.skipUntilDelimiterOrEof('\n'); | ||
| 215 | if (this_line.len == 0 or this_line[0] == '@') { | ||
| 216 | // comment, next line | ||
| 217 | return iter.next(buf); | ||
| 218 | } else { | ||
| 219 | return this_line; | ||
| 220 | } | ||
| 221 | } else { | ||
| 222 | return null; | ||
| 223 | } | ||
| 224 | } | ||
| 225 | }; | ||
| 226 | |||
| 227 | const std = @import("std"); | ||
| 228 | const fmt = std.fmt; | ||
| 229 | const fs = std.fs; | ||
| 230 | const io = std.io; | ||
| 231 | const Reader = io.BufferedReader(4096, fs.File.Reader).Reader; | ||
| 232 | const heap = std.heap; | ||
| 233 | const mem = std.mem; | ||
| 234 | const debug = std.debug; | ||
| 235 | const testing = std.testing; | ||
| 236 | const unicode = std.unicode; | ||
| 237 | |||
| 238 | const grapheme = @import("Graphemes"); | ||
| 239 | const Grapheme = @import("Graphemes").Grapheme; | ||
| 240 | const Graphemes = @import("Graphemes"); | ||
| 241 | const GraphemeIterator = @import("Graphemes").Iterator; | ||
| 242 | const Normalize = @import("Normalize"); | ||