From a2c4b7a57fe6b64bdd7c71305d408e5030af3157 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Thu, 28 Mar 2024 22:19:50 -0400 Subject: Split out Unicode tests to separate file --- src/Normalize.zig | 121 +------------------------------ src/grapheme.zig | 65 ----------------- src/unicode_tests.zig | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 185 deletions(-) create mode 100644 src/unicode_tests.zig (limited to 'src') diff --git a/src/Normalize.zig b/src/Normalize.zig index daf774d..f437f4f 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -3,12 +3,10 @@ //! NFKC, NFD, and NFKD normalization forms. const std = @import("std"); -const assert = std.debug.assert; const debug = std.debug; +const assert = debug.assert; const fmt = std.fmt; -const fs = std.fs; const heap = std.heap; -const io = std.io; const mem = std.mem; const simd = std.simd; const testing = std.testing; @@ -615,123 +613,6 @@ test "isFcd" { try testing.expect(!n.isFcd(not_fcd)); } -test "Unicode normalization tests" { - var arena = heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var allocator = arena.allocator(); - - const data = try NormData.init(allocator); - defer data.deinit(); - const n = Self{ .norm_data = &data }; - - var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); - defer file.close(); - var buf_reader = io.bufferedReader(file.reader()); - const input_stream = buf_reader.reader(); - - var line_no: usize = 0; - var buf: [4096]u8 = undefined; - var cp_buf: [4]u8 = undefined; - - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { - line_no += 1; - // Skip comments or empty lines. - if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; - // Iterate over fields. - var fields = mem.split(u8, line, ";"); - var field_index: usize = 0; - var input: []u8 = undefined; - defer allocator.free(input); - - while (fields.next()) |field| : (field_index += 1) { - if (field_index == 0) { - var i_buf = std.ArrayList(u8).init(allocator); - defer i_buf.deinit(); - - var i_fields = mem.split(u8, field, " "); - while (i_fields.next()) |s| { - const icp = try fmt.parseInt(u21, s, 16); - const len = try unicode.utf8Encode(icp, &cp_buf); - try i_buf.appendSlice(cp_buf[0..len]); - } - - input = try i_buf.toOwnedSlice(); - } else if (field_index == 1) { - //debug.print("\n*** {s} ***\n", .{line}); - // NFC, time to test. - var w_buf = std.ArrayList(u8).init(allocator); - defer w_buf.deinit(); - - var w_fields = mem.split(u8, field, " "); - while (w_fields.next()) |s| { - const wcp = try fmt.parseInt(u21, s, 16); - const len = try unicode.utf8Encode(wcp, &cp_buf); - try w_buf.appendSlice(cp_buf[0..len]); - } - - const want = w_buf.items; - var got = try n.nfc(allocator, input); - defer got.deinit(); - - try testing.expectEqualStrings(want, got.slice); - } else if (field_index == 2) { - // NFD, time to test. - var w_buf = std.ArrayList(u8).init(allocator); - defer w_buf.deinit(); - - var w_fields = mem.split(u8, field, " "); - while (w_fields.next()) |s| { - const wcp = try fmt.parseInt(u21, s, 16); - const len = try unicode.utf8Encode(wcp, &cp_buf); - try w_buf.appendSlice(cp_buf[0..len]); - } - - const want = w_buf.items; - var got = try n.nfd(allocator, input); - defer got.deinit(); - - try testing.expectEqualStrings(want, got.slice); - } else if (field_index == 3) { - // NFKC, time to test. - var w_buf = std.ArrayList(u8).init(allocator); - defer w_buf.deinit(); - - var w_fields = mem.split(u8, field, " "); - while (w_fields.next()) |s| { - const wcp = try fmt.parseInt(u21, s, 16); - const len = try unicode.utf8Encode(wcp, &cp_buf); - try w_buf.appendSlice(cp_buf[0..len]); - } - - const want = w_buf.items; - var got = try n.nfkc(allocator, input); - defer got.deinit(); - - try testing.expectEqualStrings(want, got.slice); - } else if (field_index == 4) { - // NFKD, time to test. - var w_buf = std.ArrayList(u8).init(allocator); - defer w_buf.deinit(); - - var w_fields = mem.split(u8, field, " "); - while (w_fields.next()) |s| { - const wcp = try fmt.parseInt(u21, s, 16); - const len = try unicode.utf8Encode(wcp, &cp_buf); - try w_buf.appendSlice(cp_buf[0..len]); - } - - const want = w_buf.items; - const got = try n.nfkd(allocator, input); - defer got.deinit(); - - try testing.expectEqualStrings(want, got.slice); - } else { - continue; - } - } - } -} - /// Returns true if `str` only contains Latin-1 Supplement /// code points. Uses SIMD if possible. pub fn isLatin1Only(str: []const u8) bool { diff --git a/src/grapheme.zig b/src/grapheme.zig index ad43cfd..f4cc68c 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig @@ -230,71 +230,6 @@ pub fn graphemeBreak( return true; } -test "Segmentation GraphemeIterator" { - const allocator = std.testing.allocator; - var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); - defer file.close(); - var buf_reader = std.io.bufferedReader(file.reader()); - var input_stream = buf_reader.reader(); - - const data = try GraphemeData.init(allocator); - defer data.deinit(); - - var buf: [4096]u8 = undefined; - var line_no: usize = 1; - - while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { - // Skip comments or empty lines. - if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; - - // Clean up. - var line = std.mem.trimLeft(u8, raw, "÷ "); - if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { - line = line[0..octo]; - } - // Iterate over fields. - var want = std.ArrayList(Grapheme).init(allocator); - defer want.deinit(); - - var all_bytes = std.ArrayList(u8).init(allocator); - defer all_bytes.deinit(); - - var graphemes = std.mem.split(u8, line, " ÷ "); - var bytes_index: u32 = 0; - - while (graphemes.next()) |field| { - var code_points = std.mem.split(u8, field, " "); - var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; - var gc_len: u8 = 0; - - while (code_points.next()) |code_point| { - if (std.mem.eql(u8, code_point, "×")) continue; - const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); - const len = try unicode.utf8Encode(cp, &cp_buf); - try all_bytes.appendSlice(cp_buf[0..len]); - cp_index += len; - gc_len += len; - } - - try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); - bytes_index += cp_index; - } - - // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = Iterator.init(all_bytes.items, &data); - - // Chaeck. - for (want.items) |want_gc| { - const got_gc = (iter.next()).?; - try std.testing.expectEqualStrings( - want_gc.bytes(all_bytes.items), - got_gc.bytes(all_bytes.items), - ); - } - } -} - test "Segmentation ZWJ and ZWSP emoji sequences" { const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig new file mode 100644 index 0000000..5442f63 --- /dev/null +++ b/src/unicode_tests.zig @@ -0,0 +1,194 @@ +const std = @import("std"); +const fmt = std.fmt; +const fs = std.fs; +const io = std.io; +const heap = std.heap; +const mem = std.mem; +const testing = std.testing; +const unicode = std.unicode; + +const Grapheme = @import("grapheme").Grapheme; +const GraphemeData = @import("grapheme").GraphemeData; +const GraphemeIterator = @import("grapheme").Iterator; +const Normalize = @import("Normalize"); + +test "Unicode normalization tests" { + var arena = heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var allocator = arena.allocator(); + + const data = try Normalize.NormData.init(allocator); + const n = Normalize{ .norm_data = &data }; + + var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); + defer file.close(); + var buf_reader = io.bufferedReader(file.reader()); + const input_stream = buf_reader.reader(); + + var line_no: usize = 0; + var buf: [4096]u8 = undefined; + var cp_buf: [4]u8 = undefined; + + while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { + line_no += 1; + // Skip comments or empty lines. + if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; + // Iterate over fields. + var fields = mem.split(u8, line, ";"); + var field_index: usize = 0; + var input: []u8 = undefined; + defer allocator.free(input); + + while (fields.next()) |field| : (field_index += 1) { + if (field_index == 0) { + var i_buf = std.ArrayList(u8).init(allocator); + defer i_buf.deinit(); + + var i_fields = mem.split(u8, field, " "); + while (i_fields.next()) |s| { + const icp = try fmt.parseInt(u21, s, 16); + const len = try unicode.utf8Encode(icp, &cp_buf); + try i_buf.appendSlice(cp_buf[0..len]); + } + + input = try i_buf.toOwnedSlice(); + } else if (field_index == 1) { + //debug.print("\n*** {s} ***\n", .{line}); + // NFC, time to test. + var w_buf = std.ArrayList(u8).init(allocator); + defer w_buf.deinit(); + + var w_fields = mem.split(u8, field, " "); + while (w_fields.next()) |s| { + const wcp = try fmt.parseInt(u21, s, 16); + const len = try unicode.utf8Encode(wcp, &cp_buf); + try w_buf.appendSlice(cp_buf[0..len]); + } + + const want = w_buf.items; + var got = try n.nfc(allocator, input); + defer got.deinit(); + + try testing.expectEqualStrings(want, got.slice); + } else if (field_index == 2) { + // NFD, time to test. + var w_buf = std.ArrayList(u8).init(allocator); + defer w_buf.deinit(); + + var w_fields = mem.split(u8, field, " "); + while (w_fields.next()) |s| { + const wcp = try fmt.parseInt(u21, s, 16); + const len = try unicode.utf8Encode(wcp, &cp_buf); + try w_buf.appendSlice(cp_buf[0..len]); + } + + const want = w_buf.items; + var got = try n.nfd(allocator, input); + defer got.deinit(); + + try testing.expectEqualStrings(want, got.slice); + } else if (field_index == 3) { + // NFKC, time to test. + var w_buf = std.ArrayList(u8).init(allocator); + defer w_buf.deinit(); + + var w_fields = mem.split(u8, field, " "); + while (w_fields.next()) |s| { + const wcp = try fmt.parseInt(u21, s, 16); + const len = try unicode.utf8Encode(wcp, &cp_buf); + try w_buf.appendSlice(cp_buf[0..len]); + } + + const want = w_buf.items; + var got = try n.nfkc(allocator, input); + defer got.deinit(); + + try testing.expectEqualStrings(want, got.slice); + } else if (field_index == 4) { + // NFKD, time to test. + var w_buf = std.ArrayList(u8).init(allocator); + defer w_buf.deinit(); + + var w_fields = mem.split(u8, field, " "); + while (w_fields.next()) |s| { + const wcp = try fmt.parseInt(u21, s, 16); + const len = try unicode.utf8Encode(wcp, &cp_buf); + try w_buf.appendSlice(cp_buf[0..len]); + } + + const want = w_buf.items; + const got = try n.nfkd(allocator, input); + defer got.deinit(); + + try testing.expectEqualStrings(want, got.slice); + } else { + continue; + } + } + } +} + +test "Segmentation GraphemeIterator" { + const allocator = std.testing.allocator; + var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); + defer file.close(); + var buf_reader = std.io.bufferedReader(file.reader()); + var input_stream = buf_reader.reader(); + + const data = try GraphemeData.init(allocator); + defer data.deinit(); + + var buf: [4096]u8 = undefined; + var line_no: usize = 1; + + while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { + // Skip comments or empty lines. + if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; + + // Clean up. + var line = std.mem.trimLeft(u8, raw, "÷ "); + if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { + line = line[0..octo]; + } + // Iterate over fields. + var want = std.ArrayList(Grapheme).init(allocator); + defer want.deinit(); + + var all_bytes = std.ArrayList(u8).init(allocator); + defer all_bytes.deinit(); + + var graphemes = std.mem.split(u8, line, " ÷ "); + var bytes_index: u32 = 0; + + while (graphemes.next()) |field| { + var code_points = std.mem.split(u8, field, " "); + var cp_buf: [4]u8 = undefined; + var cp_index: u32 = 0; + var gc_len: u8 = 0; + + while (code_points.next()) |code_point| { + if (std.mem.eql(u8, code_point, "×")) continue; + const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); + const len = try unicode.utf8Encode(cp, &cp_buf); + try all_bytes.appendSlice(cp_buf[0..len]); + cp_index += len; + gc_len += len; + } + + try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); + bytes_index += cp_index; + } + + // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); + var iter = GraphemeIterator.init(all_bytes.items, &data); + + // Chaeck. + for (want.items) |want_gc| { + const got_gc = (iter.next()).?; + try std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ); + } + } +} -- cgit v1.2.3