From a2c4b7a57fe6b64bdd7c71305d408e5030af3157 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Thu, 28 Mar 2024 22:19:50 -0400 Subject: Split out Unicode tests to separate file --- src/grapheme.zig | 65 -------------------------------------------------------- 1 file changed, 65 deletions(-) (limited to 'src/grapheme.zig') diff --git a/src/grapheme.zig b/src/grapheme.zig index ad43cfd..f4cc68c 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig @@ -230,71 +230,6 @@ pub fn graphemeBreak( return true; } -test "Segmentation GraphemeIterator" { - const allocator = std.testing.allocator; - var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); - defer file.close(); - var buf_reader = std.io.bufferedReader(file.reader()); - var input_stream = buf_reader.reader(); - - const data = try GraphemeData.init(allocator); - defer data.deinit(); - - var buf: [4096]u8 = undefined; - var line_no: usize = 1; - - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { - // Skip comments or empty lines. - if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; - - // Clean up. - var line = std.mem.trimLeft(u8, raw, "÷ "); - if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { - line = line[0..octo]; - } - // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); - defer want.deinit(); - - var all_bytes = std.ArrayList(u8).init(allocator); - defer all_bytes.deinit(); - - var graphemes = std.mem.split(u8, line, " ÷ "); - var bytes_index: u32 = 0; - - while (graphemes.next()) |field| { - var code_points = std.mem.split(u8, field, " "); - var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; - var gc_len: u8 = 0; - - while (code_points.next()) |code_point| { - if (std.mem.eql(u8, code_point, "×")) continue; - const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); - const len = try unicode.utf8Encode(cp, &cp_buf); - try all_bytes.appendSlice(cp_buf[0..len]); - cp_index += len; - gc_len += len; - } - - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); - bytes_index += cp_index; - } - - // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = Iterator.init(all_bytes.items, &data); - - // Chaeck. - for (want.items) |want_gc| { - const got_gc = (iter.next()).?; - try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), - ); - } - } -} - test "Segmentation ZWJ and ZWSP emoji sequences" { const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; -- cgit v1.2.3