Base units do not allocate

CanonData is included in this change. I may still sort out caseless matching without allocation, but that's a stretch goal. Closes #86. Closes #85.
author: Sam Atman 2026-02-05 07:07:40 -0500
committer: Sam Atman 2026-02-05 07:07:40 -0500
commit: 95f9487f6a7bde2d7266399bdf6843b97cc1b301 (patch)
tree: 122cd20fa574861e807844974b49eb2f91285d3c
parent: Teasing out canonicalization (diff)
download: zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.gz
zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.xz
zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.zip
4 files changed, 85 insertions, 166 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig
index c972534..5c1ffa6 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,13 +1,23 @@
 //! Canonicalization Data
-s1: []const u16 = undefined,
+const Data = struct {
-s2: []const @import("canon").Canonicalization = undefined,
+    s1: []const u16 = undefined,
-nfc: std.AutoHashMapUnmanaged([2]u21, u21),
+    s2: []const @import("canon").Canonicalization = undefined,
+};
+const canon_data = canon_data: {
+    const canon_ = @import("canon");
+    break :canon_data Data{
+        .s1 = &canon_.s1,
+        .s2 = &canon_.s2,
+    };
+};
 const CanonData = @This();
 // There's a bug here, which is down to how static u21 vs. runtime are handled,
-// the "unique representation" claim is not working out.  So we do this:
+// the "unique representation" claim is not working out.  AutoHash casts to bytes,
+// and that won't fly.  So we do this:
 const Context = struct {
    pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -22,47 +32,14 @@ const Context = struct {
 const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);
-pub fn init(allocator: mem.Allocator) !CanonData {
-    var cdata = CanonData{
-        .nfc = .empty,
-    };
-    errdefer cdata.deinit(allocator);
-    const data = @import("canon");
-    cdata.s1 = &data.s1;
-    cdata.s2 = &data.s2;
-    var count: usize = 0;
-    for (data.composite) |cp| {
-        count += 1;
-        const cps = cdata.toNfd(cp);
-        std.debug.assert(cps.len == 2);
-        try cdata.nfc.put(allocator, cps[0..2].*, cp);
-    }
-    // var keys = cdata.nfc.keyIterator();
-    // while (keys.next()) |key| {
-    //     const c32: [2]u32 = .{ key[0], key[1] };
-    //     if (c_map.get(c32)) |_| {
-    //         std.debug.print("got", .{});
-    //     }
-    // }
-    return cdata;
-}
-pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
-    cdata.nfc.deinit(allocator);
-}
 /// Returns canonical decomposition for `cp`.
-pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
+pub fn toNfd(cp: u21) []const u21 {
-    const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
+    const canon = &canon_data.s2[canon_data.s1[cp >> 8] + (cp & 0xff)];
    return canon.cps[0..canon.len];
 }
 // Returns the primary composite for the codepoints in `cp`.
-pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
+pub fn toNfc(cps: [2]u21) ?u21 {
-    _ = cdata;
    if (c_map.get(cps)) |cpp| {
        return cpp.*;
    } else {
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index 88f047c..d69cddc 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -100,14 +100,13 @@ fn isCwcfException(cp: u21) bool {
 /// comprehensive comparison possible, but slower than `canonCaselessMatch`.
 pub fn compatCaselessMatch(
    allocator: Allocator,
-    normalize: Normalize,
    a: []const u8,
    b: []const u8,
 ) Allocator.Error!bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
    // Process a
-    const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);
    var need_free_cf_nfd_a = false;
@@ -118,15 +117,15 @@ pub fn compatCaselessMatch(
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
-    const nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfd_a);
+    const nfkd_cf_nfd_a = try Normalize.nfkdCodePoints(allocator, cf_nfd_a);
    defer allocator.free(nfkd_cf_nfd_a);
    const cf_nfkd_cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
    defer allocator.free(cf_nfkd_cf_nfd_a);
-    const nfkd_cf_nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
+    const nfkd_cf_nfkd_cf_nfd_a = try Normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
    // Process b
-    const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);
    var need_free_cf_nfd_b = false;
@@ -137,11 +136,11 @@ pub fn compatCaselessMatch(
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
-    const nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfd_b);
+    const nfkd_cf_nfd_b = try Normalize.nfkdCodePoints(allocator, cf_nfd_b);
    defer allocator.free(nfkd_cf_nfd_b);
    const cf_nfkd_cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
    defer allocator.free(cf_nfkd_cf_nfd_b);
-    const nfkd_cf_nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
+    const nfkd_cf_nfkd_cf_nfd_b = try Normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
    return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
@@ -176,31 +175,27 @@ test "caseFold" {
 test "compatCaselessMatch" {
    const allocator = testing.allocator;
-    var normalize = try Normalize.init(allocator);
+    try testing.expect(try compatCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
-    defer normalize.deinit(allocator);
-    try testing.expect(try compatCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(try compatCaselessMatch(allocator, normalize, a, b));
+    try testing.expect(try compatCaselessMatch(allocator, a, b));
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try compatCaselessMatch(allocator, normalize, a, c));
+    try testing.expect(try compatCaselessMatch(allocator, a, c));
 }
 /// Performs canonical caseless string matching by decomposing to NFD. This is
 /// faster than `compatCaselessMatch`, but less comprehensive.
 pub fn canonCaselessMatch(
    allocator: Allocator,
-    normalize: Normalize,
    a: []const u8,
    b: []const u8,
 ) Allocator.Error!bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
    // Process a
-    const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);
    var need_free_cf_nfd_a = false;
@@ -214,13 +209,13 @@ pub fn canonCaselessMatch(
    var need_free_nfd_cf_nfd_a = false;
    var nfd_cf_nfd_a = cf_nfd_a;
    if (!need_free_cf_nfd_a) {
-        nfd_cf_nfd_a = try normalize.nfdCodePoints(allocator, cf_nfd_a);
+        nfd_cf_nfd_a = try Normalize.nfdCodePoints(allocator, cf_nfd_a);
        need_free_nfd_cf_nfd_a = true;
    }
    defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
    // Process b
-    const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);
    var need_free_cf_nfd_b = false;
@@ -234,7 +229,7 @@ pub fn canonCaselessMatch(
    var need_free_nfd_cf_nfd_b = false;
    var nfd_cf_nfd_b = cf_nfd_b;
    if (!need_free_cf_nfd_b) {
-        nfd_cf_nfd_b = try normalize.nfdCodePoints(allocator, cf_nfd_b);
+        nfd_cf_nfd_b = try Normalize.nfdCodePoints(allocator, cf_nfd_b);
        need_free_nfd_cf_nfd_b = true;
    }
    defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);
@@ -245,17 +240,14 @@ pub fn canonCaselessMatch(
 test "canonCaselessMatch" {
    const allocator = testing.allocator;
-    var normalize = try Normalize.init(allocator);
+    try testing.expect(try canonCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
-    defer normalize.deinit(allocator);
-    try testing.expect(try canonCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(!try canonCaselessMatch(allocator, normalize, a, b));
+    try testing.expect(!try canonCaselessMatch(allocator, a, b));
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try canonCaselessMatch(allocator, normalize, a, c));
+    try testing.expect(try canonCaselessMatch(allocator, a, c));
 }
 const std = @import("std");
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 3191a8c..865318f 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -2,25 +2,8 @@
 //! Unicode Normalization. You can normalize strings into NFC,
 //! NFKC, NFD, and NFKD normalization forms.
-canon_data: CanonData = undefined,
 const Normalize = @This();
-pub fn init(allocator: Allocator) !Normalize {
-    var norm: Normalize = undefined;
-    try norm.setup(allocator);
-    return norm;
-}
-pub fn setup(self: *Normalize, allocator: Allocator) !void {
-    self.canon_data = try CanonData.init(allocator);
-}
-pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
-    const mut_norm = @constCast(norm);
-    mut_norm.canon_data.deinit(allocator);
-}
 const SBase: u21 = 0xAC00;
 const LBase: u21 = 0x1100;
 const VBase: u21 = 0x1161;
@@ -91,12 +74,12 @@ const Decomp = struct {
 };
 // `mapping` retrieves the decomposition mapping for a code point as per the UCD.
-fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
+fn mapping(cp: u21, form: Form) Decomp {
    var dc = Decomp{};
    switch (form) {
        .nfd => {
-            dc.cps = self.canon_data.toNfd(cp);
+            dc.cps = CanonData.toNfd(cp);
            if (dc.cps.len != 0) dc.form = .nfd;
        },
@@ -105,7 +88,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
            if (dc.cps.len != 0) {
                dc.form = .nfkd;
            } else {
-                dc.cps = self.canon_data.toNfd(cp);
+                dc.cps = CanonData.toNfd(cp);
                if (dc.cps.len != 0) dc.form = .nfkd;
            }
        },
@@ -117,12 +100,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
 }
 // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
-fn decompose(
+fn decompose(cp: u21, form: Form, buf: []u21) Decomp {
-    self: Normalize,
-    cp: u21,
-    form: Form,
-    buf: []u21,
-) Decomp {
    // ASCII
    if (cp < 128) return .{};
@@ -149,7 +127,7 @@ fn decompose(
        // Look at previous code point in work queue.
        work_index -= 1;
        const next = work[work_index];
-        const m = self.mapping(next, form);
+        const m = Normalize.mapping(next, form);
        // No more of decompositions for this code point.
        if (m.form == .same) {
@@ -175,44 +153,41 @@ fn decompose(
 }
 test "decompose" {
-    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
    var buf: [18]u21 = undefined;
-    var dc = n.decompose('é', .nfd, &buf);
+    var dc = Normalize.decompose('é', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
-    dc = n.decompose('\u{1e0a}', .nfd, &buf);
+    dc = Normalize.decompose('\u{1e0a}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
-    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{1e0a}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
-    dc = n.decompose('\u{3189}', .nfd, &buf);
+    dc = Normalize.decompose('\u{3189}', .nfd, &buf);
    try testing.expect(dc.form == .same);
    try testing.expect(dc.cps.len == 0);
-    dc = n.decompose('\u{3189}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{3189}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
-    dc = n.decompose('\u{ace1}', .nfd, &buf);
+    dc = Normalize.decompose('\u{ace1}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
-    dc = n.decompose('\u{ace1}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{ace1}', .nfkd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
-    dc = n.decompose('\u{3d3}', .nfd, &buf);
+    dc = Normalize.decompose('\u{3d3}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
-    dc = n.decompose('\u{3d3}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{3d3}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
 }
@@ -231,8 +206,8 @@ pub const Result = struct {
        return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) };
    }
-    pub fn deinit(self: *const Result, allocator: Allocator) void {
+    pub fn deinit(result: *const Result, allocator: Allocator) void {
-        if (self.allocated) allocator.free(self.slice);
+        if (result.allocated) allocator.free(result.slice);
    }
 };
@@ -252,16 +227,16 @@ fn canonicalSort(cps: []u21) void {
 }
 /// Normalize `str` to NFD.
-pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
+pub fn nfd(allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxd(allocator, str, .nfd);
+    return Normalize.nfxd(allocator, str, .nfd);
 }
 /// Normalize `str` to NFKD.
-pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
+pub fn nfkd(allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxd(allocator, str, .nfkd);
+    return Normalize.nfxd(allocator, str, .nfkd);
 }
-pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 {
+pub fn nfxdCodePoints(allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 {
    var dcp_list = std.array_list.Managed(u21).init(allocator);
    defer dcp_list.deinit();
@@ -269,7 +244,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
    var dc_buf: [18]u21 = undefined;
    while (cp_iter.next()) |cp| {
-        const dc = self.decompose(cp.code, form, &dc_buf);
+        const dc = Normalize.decompose(cp.code, form, &dc_buf);
        if (dc.form == .same) {
            try dcp_list.append(cp.code);
        } else {
@@ -282,11 +257,11 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
    return try dcp_list.toOwnedSlice();
 }
-fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
+fn nfxd(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
    // Quick checks.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
-    const dcps = try self.nfxdCodePoints(allocator, str, form);
+    const dcps = try Normalize.nfxdCodePoints(allocator, str, form);
    defer allocator.free(dcps);
    var dstr_list = std.array_list.Managed(u8).init(allocator);
@@ -303,10 +278,8 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 test "nfd ASCII / no-alloc" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    const result = try n.nfd(allocator, "Hello World!");
+    const result = try Normalize.nfd(allocator, "Hello World!");
    defer result.deinit(allocator);
    try testing.expectEqualStrings("Hello World!", result.slice);
@@ -314,10 +287,8 @@ test "nfd ASCII / no-alloc" {
 test "nfd !ASCII / alloc" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
+    const result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit(allocator);
    try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
@@ -325,10 +296,8 @@ test "nfd !ASCII / alloc" {
 test "nfkd ASCII / no-alloc" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    const result = try n.nfkd(allocator, "Hello World!");
+    const result = try Normalize.nfkd(allocator, "Hello World!");
    defer result.deinit(allocator);
    try testing.expectEqualStrings("Hello World!", result.slice);
@@ -336,27 +305,21 @@ test "nfkd ASCII / no-alloc" {
 test "nfkd !ASCII / alloc" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
+    const result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit(allocator);
    try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
 }
-pub fn nfdCodePoints(
+pub fn nfdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 {
-    self: Normalize,
-    allocator: Allocator,
-    cps: []const u21,
-) Allocator.Error![]u21 {
    var dcp_list = std.array_list.Managed(u21).init(allocator);
    defer dcp_list.deinit();
    var dc_buf: [18]u21 = undefined;
    for (cps) |cp| {
-        const dc = self.decompose(cp, .nfd, &dc_buf);
+        const dc = Normalize.decompose(cp, .nfd, &dc_buf);
        if (dc.form == .same) {
            try dcp_list.append(cp);
@@ -370,18 +333,14 @@ pub fn nfdCodePoints(
    return try dcp_list.toOwnedSlice();
 }
-pub fn nfkdCodePoints(
+pub fn nfkdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 {
-    self: Normalize,
-    allocator: Allocator,
-    cps: []const u21,
-) Allocator.Error![]u21 {
    var dcp_list = std.array_list.Managed(u21).init(allocator);
    defer dcp_list.deinit();
    var dc_buf: [18]u21 = undefined;
    for (cps) |cp| {
-        const dc = self.decompose(cp, .nfkd, &dc_buf);
+        const dc = Normalize.decompose(cp, .nfkd, &dc_buf);
        if (dc.form == .same) {
            try dcp_list.append(cp);
@@ -402,29 +361,29 @@ fn isHangul(cp: u21) bool {
 }
 /// Normalizes `str` to NFC.
-pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
+pub fn nfc(allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxc(allocator, str, .nfc);
+    return Normalize.nfxc(allocator, str, .nfc);
 }
 /// Normalizes `str` to NFKC.
-pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
+pub fn nfkc(allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxc(allocator, str, .nfkc);
+    return Normalize.nfxc(allocator, str, .nfkc);
 }
-fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
+fn nfxc(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
    // Quick checks.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
    // Decompose first.
    var dcps = if (form == .nfc)
-        try self.nfxdCodePoints(allocator, str, .nfd)
+        try Normalize.nfxdCodePoints(allocator, str, .nfd)
    else
-        try self.nfxdCodePoints(allocator, str, .nfkd);
+        try Normalize.nfxdCodePoints(allocator, str, .nfkd);
    defer allocator.free(dcps);
    // Compose
-    const tombstone = 0xe000; // Start of BMP Private Use Area
+    const tombstone = 0x1FFFF; // Convenient Cn noncharacter point
    // Loop over all decomposed code points.
    while (true) {
@@ -498,7 +457,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                if (!processed_hangul) {
                    // L, C are not Hangul, so check for primary composite
                    // in the Unicode Character Database.
-                    if (self.canon_data.toNfc(.{ L, C })) |P| {
+                    if (CanonData.toNfc(.{ L, C })) |P| {
                        // We have a primary composite P for L, C.
                        // We must check if P is not in the Full
                        // Composition Exclusions  (FCX) list,
@@ -534,10 +493,8 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 test "nfc" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
+    const result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer result.deinit(allocator);
    try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
@@ -545,20 +502,18 @@ test "nfc" {
 test "nfkc" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
+    const result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer result.deinit(allocator);
    try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
 }
 /// Tests for equality of `a` and `b` after normalizing to NFC.
-pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool {
+pub fn eql(allocator: Allocator, a: []const u8, b: []const u8) !bool {
-    const norm_result_a = try self.nfc(allocator, a);
+    const norm_result_a = try Normalize.nfc(allocator, a);
    defer norm_result_a.deinit(allocator);
-    const norm_result_b = try self.nfc(allocator, b);
+    const norm_result_b = try Normalize.nfc(allocator, b);
    defer norm_result_b.deinit(allocator);
    return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
@@ -566,11 +521,9 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
 test "eql" {
    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
+    try testing.expect(try Normalize.eql(allocator, "foé", "foe\u{0301}"));
-    try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
+    try testing.expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
 }
 /// Returns true if `str` only contains Latin-1 Supplement
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 50b8824..81ea90d 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -5,9 +5,6 @@ test "Unicode normalization tests" {
    defer arena.deinit();
    const allocator = arena.allocator();
-    const n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
    var reader = std.io.Reader.fixed(@embedFile("NormalizationTest.txt"));
    var cp_buf: [4]u8 = undefined;
@@ -47,7 +44,7 @@ test "Unicode normalization tests" {
                }
                const want = w_buf.items;
-                var got = try n.nfc(allocator, input);
+                var got = try Normalize.nfc(allocator, input);
                defer got.deinit(allocator);
                try testing.expectEqualStrings(want, got.slice);
@@ -64,7 +61,7 @@ test "Unicode normalization tests" {
                }
                const want = w_buf.items;
-                var got = try n.nfd(allocator, input);
+                var got = try Normalize.nfd(allocator, input);
                defer got.deinit(allocator);
                try testing.expectEqualStrings(want, got.slice);
@@ -81,7 +78,7 @@ test "Unicode normalization tests" {
                }
                const want = w_buf.items;
-                var got = try n.nfkc(allocator, input);
+                var got = try Normalize.nfkc(allocator, input);
                defer got.deinit(allocator);
                try testing.expectEqualStrings(want, got.slice);
@@ -98,7 +95,7 @@ test "Unicode normalization tests" {
                }
                const want = w_buf.items;
-                const got = try n.nfkd(allocator, input);
+                const got = try Normalize.nfkd(allocator, input);
                defer got.deinit(allocator);
                try testing.expectEqualStrings(want, got.slice);
author	Sam Atman	2026-02-05 07:07:40 -0500
committer	Sam Atman	2026-02-05 07:07:40 -0500
commit	95f9487f6a7bde2d7266399bdf6843b97cc1b301 (patch)
tree	122cd20fa574861e807844974b49eb2f91285d3c
parent	Teasing out canonicalization (diff)
download	zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.gz zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.xz zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.zip