From ba5d9081b479e95ffa7f3baf751beedd370cec14 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Wed, 4 Feb 2026 18:01:36 -0500
Subject: Normalization and case folding

Both normalization and case folding deserve some further attention.
---
 src/Normalize.zig | 119 +++++++++++++++++-------------------------------------
 1 file changed, 38 insertions(+), 81 deletions(-)

(limited to 'src/Normalize.zig')

diff --git a/src/Normalize.zig b/src/Normalize.zig
index 4a1bae8..3191a8c 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,64 +3,22 @@
 //! NFKC, NFD, and NFKD normalization forms.
 
 canon_data: CanonData = undefined,
-ccc_data: CccData = undefined,
-compat_data: CompatData = undefined,
-hangul_data: HangulData = undefined,
-normp_data: NormPropsData = undefined,
 
 const Normalize = @This();
 
-pub fn init(allocator: Allocator) Allocator.Error!Normalize {
+pub fn init(allocator: Allocator) !Normalize {
     var norm: Normalize = undefined;
     try norm.setup(allocator);
     return norm;
 }
 
-pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void {
-    self.canon_data = CanonData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.canon_data.deinit(allocator);
-    self.ccc_data = CccData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.ccc_data.deinit(allocator);
-    self.compat_data = CompatData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.compat_data.deinit(allocator);
-    self.hangul_data = HangulData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.hangul_data.deinit(allocator);
-    self.normp_data = NormPropsData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
+pub fn setup(self: *Normalize, allocator: Allocator) !void {
+    self.canon_data = try CanonData.init(allocator);
 }
 
 pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
-    // Reasonably safe (?)
-    var mut_norm = @constCast(norm);
+    const mut_norm = @constCast(norm);
     mut_norm.canon_data.deinit(allocator);
-    mut_norm.ccc_data.deinit(allocator);
-    mut_norm.compat_data.deinit(allocator);
-    mut_norm.hangul_data.deinit(allocator);
-    mut_norm.normp_data.deinit(allocator);
 }
 
 const SBase: u21 = 0xAC00;
@@ -73,8 +31,8 @@ const TCount: u21 = 28;
 const NCount: u21 = 588; // VCount * TCount
 const SCount: u21 = 11172; // LCount * NCount
 
-fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp {
-    const kind = self.hangul_data.syllable(cp);
+fn decomposeHangul(cp: u21, buf: []u21) ?Decomp {
+    const kind = HangulData.syllable(cp);
     if (kind != .LV and kind != .LVT) return null;
 
     const SIndex: u21 = cp - SBase;
@@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
         },
 
         .nfkd => {
-            dc.cps = self.compat_data.toNfkd(cp);
+            dc.cps = CompatData.toNfkd(cp);
             if (dc.cps.len != 0) {
                 dc.form = .nfkd;
             } else {
@@ -170,13 +128,13 @@ fn decompose(
 
     // NFD / NFKD quick checks.
     switch (form) {
-        .nfd => if (self.normp_data.isNfd(cp)) return .{},
-        .nfkd => if (self.normp_data.isNfkd(cp)) return .{},
+        .nfd => if (NormPropsData.isNfd(cp)) return .{},
+        .nfkd => if (NormPropsData.isNfkd(cp)) return .{},
         else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
     }
 
     // Hangul precomposed syllable full decomposition.
-    if (self.decomposeHangul(cp, buf)) |dc| return dc;
+    if (decomposeHangul(cp, buf)) |dc| return dc;
 
     // Full decomposition.
     var dc = Decomp{ .form = form };
@@ -218,9 +176,8 @@ fn decompose(
 
 test "decompose" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
-
     var buf: [18]u21 = undefined;
 
     var dc = n.decompose('é', .nfd, &buf);
@@ -280,17 +237,17 @@ pub const Result = struct {
 };
 
 // Compares code points by Canonical Combining Class order.
-fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool {
-    return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
+fn cccLess(_: void, lhs: u21, rhs: u21) bool {
+    return CombiningData.ccc(lhs) < CombiningData.ccc(rhs);
 }
 
 // Applies the Canonical Sorting Algorithm.
-fn canonicalSort(self: Normalize, cps: []u21) void {
+fn canonicalSort(cps: []u21) void {
     var i: usize = 0;
     while (i < cps.len) : (i += 1) {
         const start: usize = i;
-        while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
-        mem.sort(u21, cps[start..i], self, cccLess);
+        while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {}
+        mem.sort(u21, cps[start..i], {}, cccLess);
     }
 }
 
@@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
         }
     }
 
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
 
     return try dcp_list.toOwnedSlice();
 }
@@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
 test "nfd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfd(allocator, "Hello World!");
@@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" {
 
 test "nfd !ASCII / alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" {
 
 test "nfkd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfkd(allocator, "Hello World!");
@@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" {
 
 test "nfkd !ASCII / alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -408,7 +365,7 @@ pub fn nfdCodePoints(
         }
     }
 
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
 
     return try dcp_list.toOwnedSlice();
 }
@@ -433,15 +390,15 @@ pub fn nfkdCodePoints(
         }
     }
 
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
 
     return try dcp_list.toOwnedSlice();
 }
 
 // Composition (NFC, NFKC)
 
-fn isHangul(self: Normalize, cp: u21) bool {
-    return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none;
+fn isHangul(cp: u21) bool {
+    return cp >= 0x1100 and HangulData.syllable(cp) != .none;
 }
 
 /// Normalizes `str` to NFC.
@@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
         block_check: while (i < dcps.len) : (i += 1) {
             const C = dcps[i];
             if (C == tombstone) continue :block_check;
-            const cc_C = self.ccc_data.ccc(C);
+            const cc_C = CombiningData.ccc(C);
             var starter_index: ?usize = null;
             var j: usize = i;
 
@@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                 if (dcps[j] == tombstone) continue;
 
                 // Check for starter.
-                if (self.ccc_data.isStarter(dcps[j])) {
+                if (CombiningData.isStarter(dcps[j])) {
                     // Check for blocking conditions.
                     for (dcps[(j + 1)..i]) |B| {
                         if (B == tombstone) continue;
-                        const cc_B = self.ccc_data.ccc(B);
-                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
+                        const cc_B = CombiningData.ccc(B);
+                        if (cc_B != 0 and isHangul(C)) continue :block_check;
                         if (cc_B >= cc_C) continue :block_check;
                     }
 
@@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
                 // If L and C are Hangul syllables, we can compose
                 // them algorithmically if possible.
-                if (self.isHangul(L) and self.isHangul(C)) {
+                if (isHangul(L) and isHangul(C)) {
                     // Get Hangul syllable types.
-                    const l_stype = self.hangul_data.syllable(L);
-                    const c_stype = self.hangul_data.syllable(C);
+                    const l_stype = HangulData.syllable(L);
+                    const c_stype = HangulData.syllable(C);
 
                     if (l_stype == .LV and c_stype == .T) {
                         // LV, T canonical composition.
@@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                         // Composition Exclusions  (FCX) list,
                         // preventing it from appearing in any
                         // composed form (NFC, NFKC).
-                        if (!self.normp_data.isFcx(P)) {
+                        if (!NormPropsData.isFcx(P)) {
                             dcps[sidx] = P;
                             dcps[i] = tombstone; // Mark for deletion.
                             deleted += 1;
@@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
 test "nfc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -588,7 +545,7 @@ test "nfc" {
 
 test "nfkc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
 
 test "eql" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -666,13 +623,13 @@ const mem = std.mem;
 const simd = std.simd;
 const testing = std.testing;
 const unicode = std.unicode;
-const Allocator = std.mem.Allocator;
+const Allocator = mem.Allocator;
 
 const ascii = @import("ascii");
 const CodePointIterator = @import("code_point").Iterator;
 
 const CanonData = @import("CanonData");
-const CccData = @import("CombiningData");
+const CombiningData = @import("CombiningData");
 const CompatData = @import("CompatData");
 const HangulData = @import("HangulData");
 const NormPropsData = @import("NormPropsData");
-- 
cgit v1.2.3