From ba5d9081b479e95ffa7f3baf751beedd370cec14 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 18:01:36 -0500 Subject: Normalization and case folding Both of which deserve some further attention. --- src/Normalize.zig | 119 +++++++++++++++++------------------------------------- 1 file changed, 38 insertions(+), 81 deletions(-) (limited to 'src/Normalize.zig') diff --git a/src/Normalize.zig b/src/Normalize.zig index 4a1bae8..3191a8c 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -3,64 +3,22 @@ //! NFKC, NFD, and NFKD normalization forms. canon_data: CanonData = undefined, -ccc_data: CccData = undefined, -compat_data: CompatData = undefined, -hangul_data: HangulData = undefined, -normp_data: NormPropsData = undefined, const Normalize = @This(); -pub fn init(allocator: Allocator) Allocator.Error!Normalize { +pub fn init(allocator: Allocator) !Normalize { var norm: Normalize = undefined; try norm.setup(allocator); return norm; } -pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void { - self.canon_data = CanonData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.canon_data.deinit(allocator); - self.ccc_data = CccData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.ccc_data.deinit(allocator); - self.compat_data = CompatData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.compat_data.deinit(allocator); - self.hangul_data = HangulData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.hangul_data.deinit(allocator); - self.normp_data = NormPropsData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; +pub fn setup(self: *Normalize, allocator: Allocator) !void { + self.canon_data = try CanonData.init(allocator); } pub fn deinit(norm: *const Normalize, allocator: Allocator) void { - // Reasonably safe (?) - var mut_norm = @constCast(norm); + const mut_norm = @constCast(norm); mut_norm.canon_data.deinit(allocator); - mut_norm.ccc_data.deinit(allocator); - mut_norm.compat_data.deinit(allocator); - mut_norm.hangul_data.deinit(allocator); - mut_norm.normp_data.deinit(allocator); } const SBase: u21 = 0xAC00; @@ -73,8 +31,8 @@ const TCount: u21 = 28; const NCount: u21 = 588; // VCount * TCount const SCount: u21 = 11172; // LCount * NCount -fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { - const kind = self.hangul_data.syllable(cp); +fn decomposeHangul(cp: u21, buf: []u21) ?Decomp { + const kind = HangulData.syllable(cp); if (kind != .LV and kind != .LVT) return null; const SIndex: u21 = cp - SBase; @@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { }, .nfkd => { - dc.cps = self.compat_data.toNfkd(cp); + dc.cps = CompatData.toNfkd(cp); if (dc.cps.len != 0) { dc.form = .nfkd; } else { @@ -170,13 +128,13 @@ fn decompose( // NFD / NFKD quick checks. switch (form) { - .nfd => if (self.normp_data.isNfd(cp)) return .{}, - .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, + .nfd => if (NormPropsData.isNfd(cp)) return .{}, + .nfkd => if (NormPropsData.isNfkd(cp)) return .{}, else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), } // Hangul precomposed syllable full decomposition. - if (self.decomposeHangul(cp, buf)) |dc| return dc; + if (decomposeHangul(cp, buf)) |dc| return dc; // Full decomposition. var dc = Decomp{ .form = form }; @@ -218,9 +176,8 @@ fn decompose( test "decompose" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); - var buf: [18]u21 = undefined; var dc = n.decompose('é', .nfd, &buf); @@ -280,17 +237,17 @@ pub const Result = struct { }; // Compares code points by Canonical Combining Class order. -fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { - return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); +fn cccLess(_: void, lhs: u21, rhs: u21) bool { + return CombiningData.ccc(lhs) < CombiningData.ccc(rhs); } // Applies the Canonical Sorting Algorithm. -fn canonicalSort(self: Normalize, cps: []u21) void { +fn canonicalSort(cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; - while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} - mem.sort(u21, cps[start..i], self, cccLess); + while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {} + mem.sort(u21, cps[start..i], {}, cccLess); } } @@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo } } - self.canonicalSort(dcp_list.items); + canonicalSort(dcp_list.items); return try dcp_list.toOwnedSlice(); } @@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo test "nfd ASCII / no-alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfd(allocator, "Hello World!"); @@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); @@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfkd(allocator, "Hello World!"); @@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); @@ -408,7 +365,7 @@ pub fn nfdCodePoints( } } - self.canonicalSort(dcp_list.items); + canonicalSort(dcp_list.items); return try dcp_list.toOwnedSlice(); } @@ -433,15 +390,15 @@ pub fn nfkdCodePoints( } } - self.canonicalSort(dcp_list.items); + canonicalSort(dcp_list.items); return try dcp_list.toOwnedSlice(); } // Composition (NFC, NFKC) -fn isHangul(self: Normalize, cp: u21) bool { - return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; +fn isHangul(cp: u21) bool { + return cp >= 0x1100 and HangulData.syllable(cp) != .none; } /// Normalizes `str` to NFC. @@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo block_check: while (i < dcps.len) : (i += 1) { const C = dcps[i]; if (C == tombstone) continue :block_check; - const cc_C = self.ccc_data.ccc(C); + const cc_C = CombiningData.ccc(C); var starter_index: ?usize = null; var j: usize = i; @@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo if (dcps[j] == tombstone) continue; // Check for starter. - if (self.ccc_data.isStarter(dcps[j])) { + if (CombiningData.isStarter(dcps[j])) { // Check for blocking conditions. for (dcps[(j + 1)..i]) |B| { if (B == tombstone) continue; - const cc_B = self.ccc_data.ccc(B); - if (cc_B != 0 and self.isHangul(C)) continue :block_check; + const cc_B = CombiningData.ccc(B); + if (cc_B != 0 and isHangul(C)) continue :block_check; if (cc_B >= cc_C) continue :block_check; } @@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo // If L and C are Hangul syllables, we can compose // them algorithmically if possible. - if (self.isHangul(L) and self.isHangul(C)) { + if (isHangul(L) and isHangul(C)) { // Get Hangul syllable types. - const l_stype = self.hangul_data.syllable(L); - const c_stype = self.hangul_data.syllable(C); + const l_stype = HangulData.syllable(L); + const c_stype = HangulData.syllable(C); if (l_stype == .LV and c_stype == .T) { // LV, T canonical composition. @@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo // Composition Exclusions (FCX) list, // preventing it from appearing in any // composed form (NFC, NFKC). - if (!self.normp_data.isFcx(P)) { + if (!NormPropsData.isFcx(P)) { dcps[sidx] = P; dcps[i] = tombstone; // Mark for deletion. deleted += 1; @@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo test "nfc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); @@ -588,7 +545,7 @@ test "nfc" { test "nfkc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); @@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) test "eql" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); @@ -666,13 +623,13 @@ const mem = std.mem; const simd = std.simd; const testing = std.testing; const unicode = std.unicode; -const Allocator = std.mem.Allocator; +const Allocator = mem.Allocator; const ascii = @import("ascii"); const CodePointIterator = @import("code_point").Iterator; const CanonData = @import("CanonData"); -const CccData = @import("CombiningData"); +const CombiningData = @import("CombiningData"); const CompatData = @import("CompatData"); const HangulData = @import("HangulData"); const NormPropsData = @import("NormPropsData"); -- cgit v1.2.3