diff options
| author | 2024-03-28 22:19:50 -0400 | |
|---|---|---|
| committer | 2024-03-28 22:19:50 -0400 | |
| commit | a2c4b7a57fe6b64bdd7c71305d408e5030af3157 (patch) | |
| tree | c7af1ed4381ab0eeea52e2a9081cb19469b8c0e6 /src/grapheme.zig | |
| parent | Merged NumericData into PropsData (diff) | |
| download | zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.gz zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.tar.xz zg-a2c4b7a57fe6b64bdd7c71305d408e5030af3157.zip | |
Split out Unicode tests to separate file
Diffstat (limited to 'src/grapheme.zig')
| -rw-r--r-- | src/grapheme.zig | 65 |
1 file changed, 0 insertions, 65 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig index ad43cfd..f4cc68c 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -230,71 +230,6 @@ pub fn graphemeBreak( | |||
| 230 | return true; | 230 | return true; |
| 231 | } | 231 | } |
| 232 | 232 | ||
| 233 | test "Segmentation GraphemeIterator" { | ||
| 234 | const allocator = std.testing.allocator; | ||
| 235 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); | ||
| 236 | defer file.close(); | ||
| 237 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 238 | var input_stream = buf_reader.reader(); | ||
| 239 | |||
| 240 | const data = try GraphemeData.init(allocator); | ||
| 241 | defer data.deinit(); | ||
| 242 | |||
| 243 | var buf: [4096]u8 = undefined; | ||
| 244 | var line_no: usize = 1; | ||
| 245 | |||
| 246 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | ||
| 247 | // Skip comments or empty lines. | ||
| 248 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 249 | |||
| 250 | // Clean up. | ||
| 251 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 252 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | ||
| 253 | line = line[0..octo]; | ||
| 254 | } | ||
| 255 | // Iterate over fields. | ||
| 256 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 257 | defer want.deinit(); | ||
| 258 | |||
| 259 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 260 | defer all_bytes.deinit(); | ||
| 261 | |||
| 262 | var graphemes = std.mem.split(u8, line, " ÷ "); | ||
| 263 | var bytes_index: u32 = 0; | ||
| 264 | |||
| 265 | while (graphemes.next()) |field| { | ||
| 266 | var code_points = std.mem.split(u8, field, " "); | ||
| 267 | var cp_buf: [4]u8 = undefined; | ||
| 268 | var cp_index: u32 = 0; | ||
| 269 | var gc_len: u8 = 0; | ||
| 270 | |||
| 271 | while (code_points.next()) |code_point| { | ||
| 272 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 273 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 274 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 275 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 276 | cp_index += len; | ||
| 277 | gc_len += len; | ||
| 278 | } | ||
| 279 | |||
| 280 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 281 | bytes_index += cp_index; | ||
| 282 | } | ||
| 283 | |||
| 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | ||
| 285 | var iter = Iterator.init(all_bytes.items, &data); | ||
| 286 | |||
| 287 | // Check. | ||
| 288 | for (want.items) |want_gc| { | ||
| 289 | const got_gc = (iter.next()).?; | ||
| 290 | try std.testing.expectEqualStrings( | ||
| 291 | want_gc.bytes(all_bytes.items), | ||
| 292 | got_gc.bytes(all_bytes.items), | ||
| 293 | ); | ||
| 294 | } | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 233 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 299 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 234 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| 300 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 235 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |