From 1ecfd06469ed4c2503034796faf4e7dca4196238 Mon Sep 17 00:00:00 2001
From: Jose Colon Rodriguez
Date: Fri, 1 Mar 2024 20:31:52 -0400
Subject: Moved case fold stuff to src/Caser.zig

---
Review notes (commentary only, not part of the commit message):
  * Adds src/Caser.zig: a file-struct holding `fold_data: *const FoldData`
    with the helpers `caseFold` and `changesWhenCaseFolded`.
  * `Normalizer.eqlIgnoreCase` is removed and reappears as
    `Caser.compatCaselessMatch`, which takes the normalizer as an explicit
    `*const Normalizer` argument; its test moves along with it.
  * `nfxdCodePoints` and `nfkdCodePoints` become `pub` so Caser can call them,
    and `Normalizer.norm_data` becomes `*const NormData`.
  * NormData drops its `fold_data` field together with the matching init and
    deinit entries.
  * NOTE(review): line structure was restored from a whitespace-collapsed copy.
    Indentation and blank context lines are reconstructed so that every hunk
    satisfies its line counts and the diffstat; confirm against the repository
    before applying.

 src/Caser.zig      | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/NormData.zig   |   3 --
 src/Normalizer.zig |  98 ++---------------------------------------------
 src/main.zig       |  21 +++++++----
 4 files changed, 125 insertions(+), 106 deletions(-)
 create mode 100644 src/Caser.zig

(limited to 'src')

diff --git a/src/Caser.zig b/src/Caser.zig
new file mode 100644
index 0000000..d02370a
--- /dev/null
+++ b/src/Caser.zig
@@ -0,0 +1,109 @@
+const std = @import("std");
+const mem = std.mem;
+const testing = std.testing;
+
+const ascii = @import("ascii");
+pub const FoldData = @import("FoldData");
+const Normalizer = @import("Normalizer");
+
+fold_data: *const FoldData,
+
+const Self = @This();
+
+fn caseFold(
+    self: Self,
+    allocator: mem.Allocator,
+    cps: []const u21,
+) ![]const u21 {
+    var cfcps = std.ArrayList(u21).init(allocator);
+    defer cfcps.deinit();
+
+    for (cps) |cp| {
+        const cf = self.fold_data.caseFold(cp);
+
+        if (cf.len == 0) {
+            try cfcps.append(cp);
+        } else {
+            try cfcps.appendSlice(cf);
+        }
+    }
+
+    return try cfcps.toOwnedSlice();
+}
+
+fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
+    return for (cps) |cp| {
+        if (self.fold_data.changesWhenCaseFolded(cp)) break true;
+    } else false;
+}
+
+pub fn compatCaselessMatch(
+    self: Self,
+    allocator: mem.Allocator,
+    normalizer: *const Normalizer,
+    a: []const u8,
+    b: []const u8,
+) !bool {
+    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+
+    // Process a
+    const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd);
+    defer allocator.free(nfd_a);
+
+    var need_frr_cf_nfd_a = false;
+    var cf_nfd_a: []const u21 = nfd_a;
+    if (self.changesWhenCaseFolded(nfd_a)) {
+        cf_nfd_a = try self.caseFold(allocator, nfd_a);
+        need_frr_cf_nfd_a = true;
+    }
+    defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a);
+
+    const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a);
+    defer allocator.free(nfkd_cf_nfd_a);
+    const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a);
+    defer allocator.free(cf_nfkd_cf_nfd_a);
+    const nfkd_cf_nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
+    defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
+
+    // Process b
+    const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd);
+    defer allocator.free(nfd_b);
+
+    var need_frr_cf_nfd_b = false;
+    var cf_nfd_b: []const u21 = nfd_b;
+    if (self.changesWhenCaseFolded(nfd_b)) {
+        cf_nfd_b = try self.caseFold(allocator, nfd_b);
+        need_frr_cf_nfd_b = true;
+    }
+    defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b);
+
+    const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b);
+    defer allocator.free(nfkd_cf_nfd_b);
+    const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b);
+    defer allocator.free(cf_nfkd_cf_nfd_b);
+    const nfkd_cf_nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
+    defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
+
+    return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
+}
+
+test "compatCaselessMatch" {
+    const allocator = testing.allocator;
+
+    var norm_data = try Normalizer.NormData.init(allocator);
+    defer norm_data.deinit();
+    const n = Normalizer{ .norm_data = &norm_data };
+
+    var fold_data = try FoldData.init(allocator);
+    defer fold_data.deinit();
+    const caser = Self{ .fold_data = &fold_data };
+
+    try testing.expect(try caser.compatCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));
+
+    const a = "Héllo World! \u{3d3}";
+    const b = "He\u{301}llo World! \u{3a5}\u{301}";
+    try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, b));
+
+    const c = "He\u{301}llo World! \u{3d2}\u{301}";
+    try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c));
+}
diff --git a/src/NormData.zig b/src/NormData.zig
index 3c2f614..8a7fa49 100644
--- a/src/NormData.zig
+++ b/src/NormData.zig
@@ -13,7 +13,6 @@ ccc_data: CccData,
 compat_data: CompatData,
 hangul_data: HangulData,
 normp_data: NormPropsData,
-fold_data: FoldData,
 
 const Self = @This();
 
@@ -22,7 +21,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
         .canon_data = try CanonData.init(allocator),
         .ccc_data = try CccData.init(allocator),
         .compat_data = try CompatData.init(allocator),
-        .fold_data = try FoldData.init(allocator),
         .hangul_data = try HangulData.init(allocator),
         .normp_data = try NormPropsData.init(allocator),
     };
@@ -33,6 +31,5 @@ pub fn deinit(self: *Self) void {
     self.ccc_data.deinit();
     self.compat_data.deinit();
     self.hangul_data.deinit();
-    self.fold_data.deinit();
     self.normp_data.deinit();
 }
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 5a26dfa..3ff157c 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -18,7 +18,7 @@ const ascii = @import("ascii");
 const CodePointIterator = @import("code_point").Iterator;
 pub const NormData = @import("NormData");
 
-norm_data: *NormData,
+norm_data: *const NormData,
 
 const Self = @This();
 
@@ -255,7 +255,7 @@ pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
     return self.nfxd(allocator, str, .nfkd);
 }
 
-fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 {
+pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 {
     var dcp_list = std.ArrayList(u21).init(allocator);
     defer dcp_list.deinit();
 
@@ -343,28 +343,7 @@ test "nfkd !ASCII / alloc" {
     try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
 }
 
-fn caseFold(
-    self: Self,
-    allocator: mem.Allocator,
-    cps: []const u21,
-) ![]const u21 {
-    var cfcps = std.ArrayList(u21).init(allocator);
-    defer cfcps.deinit();
-
-    for (cps) |cp| {
-        const cf = self.norm_data.fold_data.caseFold(cp);
-
-        if (cf.len == 0) {
-            try cfcps.append(cp);
-        } else {
-            try cfcps.appendSlice(cf);
-        }
-    }
-
-    return try cfcps.toOwnedSlice();
-}
-
-fn nfkdCodePoints(
+pub fn nfkdCodePoints(
     self: Self,
     allocator: mem.Allocator,
     cps: []const u21,
@@ -389,77 +368,6 @@ fn nfkdCodePoints(
     return try dcp_list.toOwnedSlice();
 }
 
-fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
-    return for (cps) |cp| {
-        if (self.norm_data.fold_data.changesWhenCaseFolded(cp)) break true;
-    } else false;
-}
-
-pub fn eqlIgnoreCase(
-    self: Self,
-    allocator: mem.Allocator,
-    a: []const u8,
-    b: []const u8,
-) !bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
-
-    // Process a
-    const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd);
-    defer allocator.free(nfd_a);
-
-    var need_frr_cf_nfd_a = false;
-    var cf_nfd_a: []const u21 = nfd_a;
-    if (self.changesWhenCaseFolded(nfd_a)) {
-        cf_nfd_a = try self.caseFold(allocator, nfd_a);
-        need_frr_cf_nfd_a = true;
-    }
-    defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a);
-
-    const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a);
-    defer allocator.free(nfkd_cf_nfd_a);
-    const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a);
-    defer allocator.free(cf_nfkd_cf_nfd_a);
-    const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
-    defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
-
-    // Process b
-    const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd);
-    defer allocator.free(nfd_b);
-
-    var need_frr_cf_nfd_b = false;
-    var cf_nfd_b: []const u21 = nfd_b;
-    if (self.changesWhenCaseFolded(nfd_b)) {
-        cf_nfd_b = try self.caseFold(allocator, nfd_b);
-        need_frr_cf_nfd_b = true;
-    }
-    defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b);
-
-    const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b);
-    defer allocator.free(nfkd_cf_nfd_b);
-    const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b);
-    defer allocator.free(cf_nfkd_cf_nfd_b);
-    const nfkd_cf_nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
-    defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
-
-    return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
-}
-
-test "eqlIgnoreCase" {
-    const allocator = testing.allocator;
-    var data = try NormData.init(allocator);
-    defer data.deinit();
-    var n = Self{ .norm_data = &data };
-
-    try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!"));
-
-    const a = "Héllo World! \u{3d3}";
-    const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(try n.eqlIgnoreCase(allocator, a, b));
-
-    const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try n.eqlIgnoreCase(allocator, a, c));
-}
-
 // Composition (NFC, NFKC)
 
 fn isHangul(self: Self, cp: u21) bool {
diff --git a/src/main.zig b/src/main.zig
index a5afa66..46e7c9d 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -11,15 +11,16 @@ const std = @import("std");
 // const strWidth = @import("display_width").strWidth;
 
 // const CodePointIterator = @import("ziglyph").CodePointIterator;
-const CodePointIterator = @import("code_point").Iterator;
+// const CodePointIterator = @import("code_point").Iterator;
 
 // const ascii = @import("ascii");
 // const ascii = std.ascii;
 
 // const Normalizer = @import("ziglyph").Normalizer;
-const NormData = @import("Normalizer").NormData;
 const Normalizer = @import("Normalizer");
 
+const Caser = @import("Caser");
+
 // const GenCatData = @import("GenCatData");
 
 pub fn main() !void {
@@ -34,15 +35,19 @@ pub fn main() !void {
     const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32));
     defer allocator.free(input);
 
-    var data = try NormData.init(allocator);
-    defer data.deinit();
-    var n = Normalizer{ .norm_data = &data };
-    // var n = try Normalizer.init(allocator);
-    // defer n.deinit();
+    var norm_data = try Normalizer.NormData.init(allocator);
+    defer norm_data.deinit();
+    var norm = Normalizer{ .norm_data = &norm_data };
+    // var norm = try Normalizer.init(allocator);
+    // defer norm.deinit();
 
     // var gencat_data = try GenCatData.init(allocator);
     // defer gencat_data.deinit();
 
+    var fold_data = try Caser.FoldData.init(allocator);
+    defer fold_data.deinit();
+    var caser = Caser{ .fold_data = &fold_data };
+
     // var iter = GraphemeIterator.init(input, &data);
     // defer iter.deinit();
     // var iter = CodePointIterator{ .bytes = input };
@@ -66,7 +71,7 @@ pub fn main() !void {
     //     result += 1;
     // }
     while (iter.next()) |line| {
-        if (try n.eqlIgnoreCase(allocator, prev_line, line)) {
+        if (try caser.compatCaselessMatch(allocator, &norm, prev_line, line)) {
             result += line.len;
         }
         prev_line = line;
-- 
cgit v1.2.3