Normalization and case folding

Both of which deserve some further attention.
author: Sam Atman 2026-02-04 18:01:36 -0500
committer: Sam Atman 2026-02-04 18:01:36 -0500
commit: ba5d9081b479e95ffa7f3baf751beedd370cec14 (patch)
tree: c12041d8aab9f9ff68b25a2e2c9042073c3d5f61
parent: Convert Words module to no-allocation (diff)
download: zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.tar.gz
zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.tar.xz
zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.zip
13 files changed, 571 insertions, 445 deletions
diff --git a/build.zig b/build.zig
index 1ae7005..fa8c490 100644
--- a/build.zig
+++ b/build.zig
@@ -103,7 +103,7 @@ pub fn build(b: *std.Build) void {
    });
    compat_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") });
    const run_compat_gen_exe = b.addRunArtifact(compat_gen_exe);
-    const compat_gen_out = run_compat_gen_exe.addOutputFileArg("compat.bin.z");
+    const compat_gen_out = run_compat_gen_exe.addOutputFileArg("compat.zig");
    const hangul_gen_exe = b.addExecutable(.{
        .name = "hangul",
@@ -115,7 +115,7 @@ pub fn build(b: *std.Build) void {
    });
    hangul_gen_exe.root_module.addAnonymousImport("HangulSyllableType.txt", .{ .root_source_file = b.path("data/unicode/HangulSyllableType.txt") });
    const run_hangul_gen_exe = b.addRunArtifact(hangul_gen_exe);
-    const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.bin.z");
+    const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.zig");
    const normp_gen_exe = b.addExecutable(.{
        .name = "normp",
@@ -127,7 +127,7 @@ pub fn build(b: *std.Build) void {
    });
    normp_gen_exe.root_module.addAnonymousImport("DerivedNormalizationProps.txt", .{ .root_source_file = b.path("data/unicode/DerivedNormalizationProps.txt") });
    const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe);
-    const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.bin.z");
+    const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.zig");
    const ccc_gen_exe = b.addExecutable(.{
        .name = "ccc",
@@ -139,7 +139,7 @@ pub fn build(b: *std.Build) void {
    });
    ccc_gen_exe.root_module.addAnonymousImport("DerivedCombiningClass.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedCombiningClass.txt") });
    const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe);
-    const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z");
+    const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.zig");
    const gencat_gen_exe = b.addExecutable(.{
        .name = "gencat",
@@ -164,7 +164,7 @@ pub fn build(b: *std.Build) void {
    fold_gen_exe.root_module.addAnonymousImport("DerivedCoreProperties.txt", .{ .root_source_file = b.path("data/unicode/DerivedCoreProperties.txt") });
    fold_gen_exe.root_module.addAnonymousImport("CaseFolding.txt", .{ .root_source_file = b.path("data/unicode/CaseFolding.txt") });
    const run_fold_gen_exe = b.addRunArtifact(fold_gen_exe);
-    const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.bin.z");
+    const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.zig");
    // Numeric types
    const num_gen_exe = b.addExecutable(.{
@@ -518,6 +518,8 @@ pub fn build(b: *std.Build) void {
    test_unicode_step.dependOn(&run_unicode_tests.step);
    test_unicode_step.dependOn(&display_width_tr.step);
    test_unicode_step.dependOn(&words_tr.step);
+    test_unicode_step.dependOn(&norm_tr.step);
+    test_unicode_step.dependOn(&case_fold_tr.step);
    const test_step = b.step("test", "Run all module tests");
    test_step.dependOn(&run_unicode_tests.step);
diff --git a/codegen/ccc.zig b/codegen/ccc.zig
index 4e470ae..e76222f 100644
--- a/codegen/ccc.zig
+++ b/codegen/ccc.zig
@@ -112,12 +112,24 @@ pub fn main() anyerror!void {
    defer out_file.close();
    var writer = out_file.writer(&write_buf);
-    const endian = builtin.cpu.arch.endian();
+    try writer.interface.print(
-    try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian);
+        \\//! This file is auto-generated. Do not edit.
-    for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian);
+        \\
+        \\pub const s1: [{}]u16 = .{{
-    try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian);
+    , .{stage1.items.len});
-    try writer.interface.writeAll(stage2.items);
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]u8 = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.writeAll(
+        \\};
+    );
    try writer.interface.flush();
 }
diff --git a/codegen/compat.zig b/codegen/compat.zig
index debb83d..a9d1f92 100644
--- a/codegen/compat.zig
+++ b/codegen/compat.zig
@@ -1,58 +1,82 @@
 const std = @import("std");
 const builtin = @import("builtin");
+const block_size = 256;
+const Block = [block_size][]const u21;
+const BlockMap = std.HashMap(
+    Block,
+    u16,
+    struct {
+        pub fn hash(_: @This(), k: Block) u64 {
+            var hasher = std.hash.Wyhash.init(0);
+            std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
+            return hasher.final();
+        }
+        pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool {
+            return for (aBlock, bBlock) |a, b| {
+                if (a.len != b.len) return false;
+                for (a, b) |a_cp, b_cp| {
+                    if (a_cp != b_cp) return false;
+                }
+            } else true;
+        }
+    },
+    std.hash_map.default_max_load_percentage,
+);
 pub fn main() anyerror!void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    // Process UnicodeData.txt
-    var write_buf: [4096]u8 = undefined;
    var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
+    var compat_map = std.AutoHashMap(u21, []u21).init(allocator);
-    defer out_file.close();
+    defer compat_map.deinit();
-    var writer = out_file.writer(&write_buf);
-    const endian = builtin.cpu.arch.endian();
+    while (in_reader.takeDelimiterInclusive('\n')) |line| {
-    lines: while (in_reader.takeDelimiterInclusive('\n')) |took| {
-        const line = std.mem.trimRight(u8, took, "\n");
        if (line.len == 0) continue;
        var field_iter = std.mem.splitScalar(u8, line, ';');
-        var cps: [19]u24 = undefined;
+        var cp: u21 = undefined;
-        var len: u8 = 1;
        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
+            if (field.len == 0) continue;
            switch (i) {
-                0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
+                0 => {
+                    cp = try std.fmt.parseInt(u21, field, 16);
+                },
                5 => {
                    // Not compatibility.
-                    if (field.len == 0 or field[0] != '<') continue :lines;
+                    if (field[0] != '<') continue;
                    var cp_iter = std.mem.tokenizeScalar(u8, field, ' ');
                    _ = cp_iter.next(); // <compat type>
+                    var cps: [18]u21 = undefined;
+                    var len: u8 = 0;
                    while (cp_iter.next()) |cp_str| : (len += 1) {
-                        cps[len] = try std.fmt.parseInt(u24, cp_str, 16);
+                        cps[len] = try std.fmt.parseInt(u21, cp_str, 16);
                    }
-                },
-                2 => if (line[0] == '<') continue :lines,
+                    const slice = try allocator.dupe(u21, cps[0..len]);
+                    try compat_map.put(cp, slice);
+                },
                else => {},
            }
        }
-        try writer.interface.writeInt(u8, @intCast(len), endian);
-        for (cps[0..len]) |cp| try writer.interface.writeInt(u24, cp, endian);
    } else |err| switch (err) {
        error.EndOfStream => {},
        else => {
@@ -60,6 +84,63 @@ pub fn main() anyerror!void {
        },
    }
-    try writer.interface.writeInt(u16, 0, endian);
+    // Build multi-tiered lookup tables for compatibility decompositions
+    var blocks_map = BlockMap.init(allocator);
+    defer blocks_map.deinit();
+    var stage1 = std.array_list.Managed(u16).init(allocator);
+    defer stage1.deinit();
+    var stage2 = std.array_list.Managed([]const u21).init(allocator);
+    defer stage2.deinit();
+    var block: Block = [_][]const u21{&[_]u21{}} ** block_size;
+    var block_len: u16 = 0;
+    for (0..0x110000) |i| {
+        const cp: u21 = @intCast(i);
+        const compat: []const u21 = compat_map.get(cp) orelse &[_]u21{};
+        block[block_len] = compat;
+        block_len += 1;
+        if (block_len < block_size and cp != 0x10ffff) continue;
+        const gop = try blocks_map.getOrPut(block);
+        if (!gop.found_existing) {
+            gop.value_ptr.* = @intCast(stage2.items.len);
+            try stage2.appendSlice(&block);
+        }
+        try stage1.append(gop.value_ptr.*);
+        block_len = 0;
+    }
+    // Write out
+    var write_buf: [4096]u8 = undefined;
+    var out_file = try std.fs.cwd().createFile(output_path, .{});
+    defer out_file.close();
+    var writer = out_file.writer(&write_buf);
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+        \\
+        \\pub const s1: [{}]u16 = .{{
+    , .{stage1.items.len});
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}][]const u21 = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| {
+        try writer.interface.print("&.{any}, ", .{entry});
+    }
+    try writer.interface.writeAll(
+        \\};
+    );
    try writer.interface.flush();
 }
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 366ed79..c5f54eb 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -228,26 +228,45 @@ pub fn main() anyerror!void {
        var out_file = try std.fs.cwd().createFile(output_path, .{});
        defer out_file.close();
        var writer = out_file.writer(&write_buf);
-        const endian = builtin.cpu.arch.endian();
        // Table metadata.
-        try writer.interface.writeInt(u24, @intCast(codepoint_cutoff), endian);
+        try writer.interface.print(
-        try writer.interface.writeInt(u24, @intCast(multiple_codepoint_start), endian);
+            \\//! This file is auto-generated. Do not edit.
-        // Stage 1
+            \\
-        try writer.interface.writeInt(u16, @intCast(meaningful_stage1.len), endian);
+            \\pub const cutoff: u21 = {};
-        try writer.interface.writeAll(meaningful_stage1);
+            \\pub const cwcf_exceptions_min: u21 = {};
-        // Stage 2
+            \\pub const cwcf_exceptions_max: u21 = {};
-        try writer.interface.writeInt(u16, @intCast(stage2.len), endian);
+            \\pub const cwcf_exceptions: [{}]u21 = .{{
-        try writer.interface.writeAll(stage2);
+        , .{ codepoint_cutoff, std.mem.min(u21, changes_when_casefolded_exceptions.items), std.mem.max(u21, changes_when_casefolded_exceptions.items), changes_when_casefolded_exceptions.items.len });
-        // Stage 3
+        for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.print("{}, ", .{cp});
-        try writer.interface.writeInt(u16, @intCast(stage3.len), endian);
-        for (stage3) |offset| try writer.interface.writeInt(i24, offset, endian);
+        try writer.interface.print(
-        // Changes when case folded
+            \\
-        // Min and max
+            \\}};
-        try writer.interface.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian);
+            \\
-        try writer.interface.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian);
+            \\pub const multiple_start: u21 = {};
-        try writer.interface.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian);
+            \\pub const stage1: [{}]u8 = .{{
-        for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.writeInt(u24, cp, endian);
+        , .{ multiple_codepoint_start, meaningful_stage1.len });
+        for (meaningful_stage1) |entry| try writer.interface.print("{}, ", .{entry});
+        try writer.interface.print(
+            \\
+            \\}};
+            \\
+            \\pub const stage2: [{}]u8 = .{{
+        , .{stage2.len});
+        for (stage2) |entry| try writer.interface.print("{}, ", .{entry});
+        try writer.interface.print(
+            \\
+            \\}};
+            \\
+            \\pub const stage3: [{}]i24 = .{{
+        , .{stage3.len});
+        for (stage3) |entry| try writer.interface.print("{}, ", .{entry});
+        try writer.interface.writeAll(
+            \\};
+        );
        try writer.interface.flush();
    }
diff --git a/codegen/hangul.zig b/codegen/hangul.zig
index 2e4c175..d7504a9 100644
--- a/codegen/hangul.zig
+++ b/codegen/hangul.zig
@@ -120,12 +120,24 @@ pub fn main() anyerror!void {
    defer out_file.close();
    var writer = out_file.writer(&write_buf);
-    const endian = builtin.cpu.arch.endian();
+    try writer.interface.print(
-    try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian);
+        \\//! This file is auto-generated. Do not edit.
-    for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian);
+        \\
+        \\pub const s1: [{}]u16 = .{{
-    try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian);
+    , .{stage1.items.len});
-    for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian);
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]u3 = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.writeAll(
+        \\};
+    );
    try writer.interface.flush();
 }
diff --git a/codegen/normp.zig b/codegen/normp.zig
index eaf6989..343f03e 100644
--- a/codegen/normp.zig
+++ b/codegen/normp.zig
@@ -121,12 +121,24 @@ pub fn main() anyerror!void {
    defer out_file.close();
    var writer = out_file.writer(&write_buf);
-    const endian = builtin.cpu.arch.endian();
+    try writer.interface.print(
-    try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian);
+        \\//! This file is auto-generated. Do not edit.
-    for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian);
+        \\
+        \\pub const s1: [{}]u16 = .{{
-    try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian);
+    , .{stage1.items.len});
-    for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian);
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]u3 = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
+    try writer.interface.writeAll(
+        \\};
+    );
    try writer.interface.flush();
 }
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index df86b92..88f047c 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -1,113 +1,53 @@
-cutoff: u21 = undefined,
-cwcf_exceptions_min: u21 = undefined,
-cwcf_exceptions_max: u21 = undefined,
-cwcf_exceptions: []u21 = undefined,
-multiple_start: u21 = undefined,
-stage1: []u8 = undefined,
-stage2: []u8 = undefined,
-stage3: []i24 = undefined,
-normalize: Normalize,
-owns_normalize: bool,
 const CaseFolding = @This();
-pub fn init(allocator: Allocator) Allocator.Error!CaseFolding {
+const Data = struct {
-    var case_fold: CaseFolding = undefined;
+    cutoff: u21 = undefined,
-    try case_fold.setup(allocator);
+    cwcf_exceptions_min: u21 = undefined,
-    return case_fold;
+    cwcf_exceptions_max: u21 = undefined,
-}
+    cwcf_exceptions: []const u21 = undefined,
+    multiple_start: u21 = undefined,
-pub fn initWithNormalize(allocator: Allocator, norm: Normalize) Allocator.Error!CaseFolding {
+    stage1: []const u8 = undefined,
-    var casefold: CaseFolding = undefined;
+    stage2: []const u8 = undefined,
-    try casefold.setupWithNormalize(allocator, norm);
+    stage3: []const i24 = undefined,
-    return casefold;
+};
-}
+const casefold = casefold: {
-pub fn setup(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void {
+    const data = @import("fold");
-    try casefold.setupImpl(allocator);
+    break :casefold Data{
-    // Handle normalize memory separately during setup:
+        .cutoff = data.cutoff,
-    casefold.owns_normalize = false;
+        .multiple_start = data.multiple_start,
-    errdefer casefold.deinit(allocator);
+        .stage1 = &data.stage1,
-    try casefold.normalize.setup(allocator);
+        .stage2 = &data.stage2,
-    casefold.owns_normalize = true;
+        .stage3 = &data.stage3,
-}
+        .cwcf_exceptions_min = data.cwcf_exceptions_min,
+        .cwcf_exceptions_max = data.cwcf_exceptions_max,
-pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void {
+        .cwcf_exceptions = &data.cwcf_exceptions,
-    try casefold.setupImpl(allocator);
-    casefold.normalize = norm;
-    casefold.owns_normalize = false;
-}
-fn setupImpl(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void {
-    casefold.setupImplInner(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
    };
-}
+};
-inline fn setupImplInner(casefold: *CaseFolding, allocator: Allocator) !void {
-    const in_bytes = @embedFile("fold");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
-    const endian = builtin.cpu.arch.endian();
-    casefold.cutoff = @intCast(try reader.readInt(u24, endian));
-    casefold.multiple_start = @intCast(try reader.readInt(u24, endian));
-    var len = try reader.readInt(u16, endian);
-    casefold.stage1 = try allocator.alloc(u8, len);
-    errdefer allocator.free(casefold.stage1);
-    for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian);
-    len = try reader.readInt(u16, endian);
-    casefold.stage2 = try allocator.alloc(u8, len);
-    errdefer allocator.free(casefold.stage2);
-    for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian);
-    len = try reader.readInt(u16, endian);
-    casefold.stage3 = try allocator.alloc(i24, len);
-    errdefer allocator.free(casefold.stage3);
-    for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian);
-    casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
-    casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
-    len = try reader.readInt(u16, endian);
-    casefold.cwcf_exceptions = try allocator.alloc(u21, len);
-    errdefer allocator.free(casefold.cwcf_exceptions);
-    for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
-}
-pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void {
-    allocator.free(fdata.stage1);
-    allocator.free(fdata.stage2);
-    allocator.free(fdata.stage3);
-    allocator.free(fdata.cwcf_exceptions);
-    if (fdata.owns_normalize) fdata.normalize.deinit(allocator);
-}
 /// Returns the case fold for `cp`.
-pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 {
+pub fn caseFold(cp: u21, buf: []u21) []const u21 {
-    if (cp >= fdata.cutoff) return &.{};
+    // Unmatched code points fold to themselves, so we default to this.
+    buf[0] = cp;
-    const stage1_val = fdata.stage1[cp >> 8];
+    if (cp >= casefold.cutoff) return buf[0..1];
-    if (stage1_val == 0) return &.{};
+    const stage1_val = casefold.stage1[cp >> 8];
+    if (stage1_val == 0) return buf[0..1];
    const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
-    const stage3_index = fdata.stage2[stage2_index];
+    const stage3_index = casefold.stage2[stage2_index];
    if (stage3_index & 0x80 != 0) {
-        const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3;
+        const real_index = @as(usize, casefold.multiple_start) + (stage3_index ^ 0x80) * 3;
-        const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0);
+        const mapping = mem.sliceTo(casefold.stage3[real_index..][0..3], 0);
        for (mapping, 0..) |c, i| buf[i] = @intCast(c);
        return buf[0..mapping.len];
    }
-    const offset = fdata.stage3[stage3_index];
+    const offset = casefold.stage3[stage3_index];
-    if (offset == 0) return &.{};
+    if (offset == 0) return buf[0..1];
    buf[0] = @intCast(@as(i32, cp) + offset);
@@ -117,7 +57,6 @@ pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 {
 /// Produces the case folded code points for `cps`. Caller must free returned
 /// slice with `allocator`.
 pub fn caseFoldAlloc(
-    casefold: *const CaseFolding,
    allocator: Allocator,
    cps: []const u21,
 ) Allocator.Error![]const u21 {
@@ -126,7 +65,7 @@ pub fn caseFoldAlloc(
    var buf: [3]u21 = undefined;
    for (cps) |cp| {
-        const cf = casefold.caseFold(cp, &buf);
+        const cf = CaseFolding.caseFold(cp, &buf);
        if (cf.len == 0) {
            try cfcps.append(cp);
@@ -139,19 +78,19 @@ pub fn caseFoldAlloc(
 }
 /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
-pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool {
+pub fn cpChangesWhenCaseFolded(cp: u21) bool {
    var buf: [3]u21 = undefined;
-    const has_mapping = casefold.caseFold(cp, &buf).len != 0;
+    const has_mapping = CaseFolding.caseFold(cp, &buf).len != 0;
-    return has_mapping and !casefold.isCwcfException(cp);
+    return has_mapping and !CaseFolding.isCwcfException(cp);
 }
-pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool {
+pub fn changesWhenCaseFolded(cps: []const u21) bool {
    return for (cps) |cp| {
-        if (casefold.cpChangesWhenCaseFolded(cp)) break true;
+        if (CaseFolding.cpChangesWhenCaseFolded(cp)) break true;
    } else false;
 }
-fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool {
+fn isCwcfException(cp: u21) bool {
    return cp >= casefold.cwcf_exceptions_min and
        cp <= casefold.cwcf_exceptions_max and
        std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null;
@@ -160,88 +99,114 @@ fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool {
 /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
 /// comprehensive comparison possible, but slower than `canonCaselessMatch`.
 pub fn compatCaselessMatch(
-    casefold: *const CaseFolding,
    allocator: Allocator,
+    normalize: Normalize,
    a: []const u8,
    b: []const u8,
 ) Allocator.Error!bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
    // Process a
-    const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);
    var need_free_cf_nfd_a = false;
    var cf_nfd_a: []const u21 = nfd_a;
-    if (casefold.changesWhenCaseFolded(nfd_a)) {
+    if (CaseFolding.changesWhenCaseFolded(nfd_a)) {
-        cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a);
+        cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a);
        need_free_cf_nfd_a = true;
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
-    const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a);
+    const nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfd_a);
    defer allocator.free(nfkd_cf_nfd_a);
-    const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
+    const cf_nfkd_cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
    defer allocator.free(cf_nfkd_cf_nfd_a);
-    const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
+    const nfkd_cf_nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
    // Process b
-    const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);
    var need_free_cf_nfd_b = false;
    var cf_nfd_b: []const u21 = nfd_b;
-    if (casefold.changesWhenCaseFolded(nfd_b)) {
+    if (CaseFolding.changesWhenCaseFolded(nfd_b)) {
-        cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b);
+        cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b);
        need_free_cf_nfd_b = true;
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
-    const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b);
+    const nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfd_b);
    defer allocator.free(nfkd_cf_nfd_b);
-    const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
+    const cf_nfkd_cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
    defer allocator.free(cf_nfkd_cf_nfd_b);
-    const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
+    const nfkd_cf_nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
    return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
 }
+test "caseFold" {
+    var buf: [3]u21 = undefined;
+    // Folds '1' to '1'
+    try testing.expectEqual(1, caseFold('1', &buf).len);
+    try testing.expectEqual('1', caseFold('1', &buf)[0]);
+    // Folds '2' to '2'
+    try testing.expectEqual(1, caseFold('2', &buf).len);
+    try testing.expectEqual('2', caseFold('2', &buf)[0]);
+    // Folds Armenian capital letter 'Zhe' (U+053A)
+    try testing.expectEqual(1, caseFold('Ժ', &buf).len);
+    // Armenian small letter 'Zhe' (U+056A)
+    try testing.expectEqual('ժ', caseFold('Ժ', &buf)[0]);
+    // Folds Greek small letter Upsilon with Dialytika and Perispomeni (U+1FE7)
+    try testing.expectEqual(3, caseFold('ῧ', &buf).len);
+    // Greek small letter Upsilon (U+03C5)
+    try testing.expectEqual('υ', caseFold('ῧ', &buf)[0]);
+    // Combining Diaeresis
+    try testing.expectEqual('\u{0308}', caseFold('ῧ', &buf)[1]);
+    // Combining Greek Perispomeni
+    try testing.expectEqual('\u{0342}', caseFold('ῧ', &buf)[2]);
+}
 test "compatCaselessMatch" {
    const allocator = testing.allocator;
-    const caser = try CaseFolding.init(allocator);
+    var normalize = try Normalize.init(allocator);
-    defer caser.deinit(allocator);
+    defer normalize.deinit(allocator);
-    try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
+    try testing.expect(try compatCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(try caser.compatCaselessMatch(allocator, a, b));
+    try testing.expect(try compatCaselessMatch(allocator, normalize, a, b));
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try caser.compatCaselessMatch(allocator, a, c));
+    try testing.expect(try compatCaselessMatch(allocator, normalize, a, c));
 }
 /// Performs canonical caseless string matching by decomposing to NFD. This is
 /// faster than `compatCaselessMatch`, but less comprehensive.
 pub fn canonCaselessMatch(
-    casefold: *const CaseFolding,
    allocator: Allocator,
+    normalize: Normalize,
    a: []const u8,
    b: []const u8,
 ) Allocator.Error!bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
    // Process a
-    const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);
    var need_free_cf_nfd_a = false;
    var cf_nfd_a: []const u21 = nfd_a;
-    if (casefold.changesWhenCaseFolded(nfd_a)) {
+    if (CaseFolding.changesWhenCaseFolded(nfd_a)) {
-        cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a);
+        cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a);
        need_free_cf_nfd_a = true;
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
@@ -249,19 +214,19 @@ pub fn canonCaselessMatch(
    var need_free_nfd_cf_nfd_a = false;
    var nfd_cf_nfd_a = cf_nfd_a;
    if (!need_free_cf_nfd_a) {
-        nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a);
+        nfd_cf_nfd_a = try normalize.nfdCodePoints(allocator, cf_nfd_a);
        need_free_nfd_cf_nfd_a = true;
    }
    defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
    // Process b
-    const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);
    var need_free_cf_nfd_b = false;
    var cf_nfd_b: []const u21 = nfd_b;
-    if (casefold.changesWhenCaseFolded(nfd_b)) {
+    if (CaseFolding.changesWhenCaseFolded(nfd_b)) {
-        cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b);
+        cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b);
        need_free_cf_nfd_b = true;
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
@@ -269,7 +234,7 @@ pub fn canonCaselessMatch(
    var need_free_nfd_cf_nfd_b = false;
    var nfd_cf_nfd_b = cf_nfd_b;
    if (!need_free_cf_nfd_b) {
-        nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b);
+        nfd_cf_nfd_b = try normalize.nfdCodePoints(allocator, cf_nfd_b);
        need_free_nfd_cf_nfd_b = true;
    }
    defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);
@@ -280,40 +245,17 @@ pub fn canonCaselessMatch(
 test "canonCaselessMatch" {
    const allocator = testing.allocator;
-    const caser = try CaseFolding.init(allocator);
+    var normalize = try Normalize.init(allocator);
-    defer caser.deinit(allocator);
+    defer normalize.deinit(allocator);
-    try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
+    try testing.expect(try canonCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(!try caser.canonCaselessMatch(allocator, a, b));
+    try testing.expect(!try canonCaselessMatch(allocator, normalize, a, b));
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try caser.canonCaselessMatch(allocator, a, c));
+    try testing.expect(try canonCaselessMatch(allocator, normalize, a, c));
-}
-fn testAllocations(allocator: Allocator) !void {
-    // With normalize provided
-    {
-        const normalize = try Normalize.init(allocator);
-        defer normalize.deinit(allocator);
-        const caser = try CaseFolding.initWithNormalize(allocator, normalize);
-        defer caser.deinit(allocator);
-    }
-    // With normalize owned
-    {
-        const caser = try CaseFolding.init(allocator);
-        defer caser.deinit(allocator);
-    }
-}
-test "Allocation Failures" {
-    try testing.checkAllAllocationFailures(
-        testing.allocator,
-        testAllocations,
-        .{},
-    );
 }
 const std = @import("std");
diff --git a/src/CombiningData.zig b/src/CombiningData.zig
index f58e0de..660dc14 100644
--- a/src/CombiningData.zig
+++ b/src/CombiningData.zig
@@ -1,45 +1,28 @@
 //! Combining Class Data
-s1: []u16 = undefined,
+const Data = struct {
-s2: []u8 = undefined,
+    s1: []const u16 = undefined,
+    s2: []const u8 = undefined,
+};
+const combining_data = combining_data: {
+    const data = @import("ccc");
+    break :combining_data Data{
+        .s1 = &data.s1,
+        .s2 = &data.s2,
+    };
+};
 const CombiningData = @This();
-pub fn init(allocator: mem.Allocator) !CombiningData {
-    const in_bytes = @embedFile("ccc");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
-    const endian = builtin.cpu.arch.endian();
-    var cbdata = CombiningData{};
-    const stage_1_len: u16 = try reader.readInt(u16, endian);
-    cbdata.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(cbdata.s1);
-    for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian);
-    const stage_2_len: u16 = try reader.readInt(u16, endian);
-    cbdata.s2 = try allocator.alloc(u8, stage_2_len);
-    errdefer allocator.free(cbdata.s2);
-    _ = try reader.readAll(cbdata.s2);
-    return cbdata;
-}
-pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void {
-    allocator.free(cbdata.s1);
-    allocator.free(cbdata.s2);
-}
 /// Returns the canonical combining class for a code point.
-pub fn ccc(cbdata: CombiningData, cp: u21) u8 {
+pub fn ccc(cp: u21) u8 {
-    return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)];
+    return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)];
 }
 /// True if `cp` is a starter code point, not a combining character.
-pub fn isStarter(cbdata: CombiningData, cp: u21) bool {
+pub fn isStarter(cp: u21) bool {
-    return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0;
+    return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)] == 0;
 }
 const std = @import("std");
diff --git a/src/CompatData.zig b/src/CompatData.zig
index 40ecd12..68d47f2 100644
--- a/src/CompatData.zig
+++ b/src/CompatData.zig
@@ -1,57 +1,19 @@
 //! Compatibility Data
-nfkd: [][]u21 = undefined,
+const Data = struct {
-cps: []u21 = undefined,
+    s1: []const u16 = undefined,
+    s2: []const []const u21 = undefined,
-const CompatData = @This();
+};
-pub fn init(allocator: mem.Allocator) !CompatData {
+const compat_data = compat_data: {
-    const in_bytes = @embedFile("compat");
+    const data = @import("compat");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
+    break :compat_data Data{
-    var reader = in_fbs.reader();
+        .s1 = &data.s1,
+        .s2 = &data.s2,
-    const endian = builtin.cpu.arch.endian();
-    var cpdata = CompatData{
-        .nfkd = try allocator.alloc([]u21, 0x110000),
    };
-    {
+};
-        errdefer allocator.free(cpdata.nfkd);
-        cpdata.cps = try allocator.alloc(u21, magic.compat_size);
-    }
-    errdefer cpdata.deinit(allocator);
-    @memset(cpdata.nfkd, &.{});
-    var total_len: usize = 0;
-    while (true) {
-        const len: u8 = try reader.readInt(u8, endian);
-        if (len == 0) break;
-        const cp = try reader.readInt(u24, endian);
-        const nk_s = cpdata.cps[total_len..][0 .. len - 1];
-        for (0..len - 1) |i| {
-            nk_s[i] = @intCast(try reader.readInt(u24, endian));
-        }
-        cpdata.nfkd[cp] = nk_s;
-        total_len += len - 1;
-    }
-    if (comptime magic.print) std.debug.print("CompatData magic number: {d}", .{total_len});
-    return cpdata;
-}
-pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void {
-    allocator.free(cpdata.cps);
-    allocator.free(cpdata.nfkd);
-}
 /// Returns compatibility decomposition for `cp`.
-pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 {
+pub fn toNfkd(cp: u21) []const u21 {
-    return cpdata.nfkd[cp];
+    return compat_data.s2[compat_data.s1[cp >> 8] + (cp & 0xff)];
 }
-const std = @import("std");
-const builtin = @import("builtin");
-const mem = std.mem;
-const magic = @import("magic");
diff --git a/src/HangulData.zig b/src/HangulData.zig
index cae8b97..9c17421 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -9,39 +9,23 @@ pub const Syllable = enum {
    T,
 };
-s1: []u16 = undefined,
+const Data = struct {
-s2: []u3 = undefined,
+    s1: []const u16 = undefined,
+    s2: []const u3 = undefined,
-const Hangul = @This();
+};
-pub fn init(allocator: mem.Allocator) !Hangul {
-    const in_bytes = @embedFile("hangul");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
-    const endian = builtin.cpu.arch.endian();
-    var hangul = Hangul{};
-    const stage_1_len: u16 = try reader.readInt(u16, endian);
-    hangul.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(hangul.s1);
-    for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian);
-    const stage_2_len: u16 = try reader.readInt(u16, endian);
-    hangul.s2 = try allocator.alloc(u3, stage_2_len);
-    errdefer allocator.free(hangul.s2);
-    for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian));
-    return hangul;
+const hangul = hangul_data: {
-}
+    const data = @import("hangul");
+    break :hangul_data Data{
+        .s1 = &data.s1,
+        .s2 = &data.s2,
+    };
+};
-pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void {
+const Hangul = @This();
-    allocator.free(hangul.s1);
-    allocator.free(hangul.s2);
-}
 /// Returns the Hangul syllable type for `cp`.
-pub fn syllable(hangul: *const Hangul, cp: u21) Syllable {
+pub fn syllable(cp: u21) Syllable {
    return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]);
 }
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig
index 7b53542..cca3556 100644
--- a/src/NormPropsData.zig
+++ b/src/NormPropsData.zig
@@ -1,48 +1,32 @@
 //! Normalization Properties Data
-s1: []u16 = undefined,
+const Data = struct {
-s2: []u4 = undefined,
+    s1: []const u16 = undefined,
+    s2: []const u3 = undefined,
+};
+const norms = norm_props_data: {
+    const data = @import("normp");
+    break :norm_props_data Data{
+        .s1 = &data.s1,
+        .s2 = &data.s2,
+    };
+};
 const NormProps = @This();
-pub fn init(allocator: mem.Allocator) !NormProps {
-    const in_bytes = @embedFile("normp");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
-    const endian = builtin.cpu.arch.endian();
-    var norms = NormProps{};
-    const stage_1_len: u16 = try reader.readInt(u16, endian);
-    norms.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(norms.s1);
-    for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian);
-    const stage_2_len: u16 = try reader.readInt(u16, endian);
-    norms.s2 = try allocator.alloc(u4, stage_2_len);
-    errdefer allocator.free(norms.s2);
-    for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian));
-    return norms;
-}
-pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void {
-    allocator.free(norms.s1);
-    allocator.free(norms.s2);
-}
 /// Returns true if `cp` is already in NFD form.
-pub fn isNfd(norms: *const NormProps, cp: u21) bool {
+pub fn isNfd(cp: u21) bool {
    return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0;
 }
 /// Returns true if `cp` is already in NFKD form.
-pub fn isNfkd(norms: *const NormProps, cp: u21) bool {
+pub fn isNfkd(cp: u21) bool {
    return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0;
 }
 /// Returns true if `cp` is not allowed in any normalized form.
-pub fn isFcx(norms: *const NormProps, cp: u21) bool {
+pub fn isFcx(cp: u21) bool {
    return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
 }
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 4a1bae8..3191a8c 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,64 +3,22 @@
 //! NFKC, NFD, and NFKD normalization forms.
 canon_data: CanonData = undefined,
-ccc_data: CccData = undefined,
-compat_data: CompatData = undefined,
-hangul_data: HangulData = undefined,
-normp_data: NormPropsData = undefined,
 const Normalize = @This();
-pub fn init(allocator: Allocator) Allocator.Error!Normalize {
+pub fn init(allocator: Allocator) !Normalize {
    var norm: Normalize = undefined;
    try norm.setup(allocator);
    return norm;
 }
-pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void {
+pub fn setup(self: *Normalize, allocator: Allocator) !void {
-    self.canon_data = CanonData.init(allocator) catch |err| {
+    self.canon_data = try CanonData.init(allocator);
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.canon_data.deinit(allocator);
-    self.ccc_data = CccData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.ccc_data.deinit(allocator);
-    self.compat_data = CompatData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.compat_data.deinit(allocator);
-    self.hangul_data = HangulData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.hangul_data.deinit(allocator);
-    self.normp_data = NormPropsData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
 }
 pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
-    // Reasonably safe (?)
+    const mut_norm = @constCast(norm);
-    var mut_norm = @constCast(norm);
    mut_norm.canon_data.deinit(allocator);
-    mut_norm.ccc_data.deinit(allocator);
-    mut_norm.compat_data.deinit(allocator);
-    mut_norm.hangul_data.deinit(allocator);
-    mut_norm.normp_data.deinit(allocator);
 }
 const SBase: u21 = 0xAC00;
@@ -73,8 +31,8 @@ const TCount: u21 = 28;
 const NCount: u21 = 588; // VCount * TCount
 const SCount: u21 = 11172; // LCount * NCount
-fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp {
+fn decomposeHangul(cp: u21, buf: []u21) ?Decomp {
-    const kind = self.hangul_data.syllable(cp);
+    const kind = HangulData.syllable(cp);
    if (kind != .LV and kind != .LVT) return null;
    const SIndex: u21 = cp - SBase;
@@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
        },
        .nfkd => {
-            dc.cps = self.compat_data.toNfkd(cp);
+            dc.cps = CompatData.toNfkd(cp);
            if (dc.cps.len != 0) {
                dc.form = .nfkd;
            } else {
@@ -170,13 +128,13 @@ fn decompose(
    // NFD / NFKD quick checks.
    switch (form) {
-        .nfd => if (self.normp_data.isNfd(cp)) return .{},
+        .nfd => if (NormPropsData.isNfd(cp)) return .{},
-        .nfkd => if (self.normp_data.isNfkd(cp)) return .{},
+        .nfkd => if (NormPropsData.isNfkd(cp)) return .{},
        else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
    }
    // Hangul precomposed syllable full decomposition.
-    if (self.decomposeHangul(cp, buf)) |dc| return dc;
+    if (decomposeHangul(cp, buf)) |dc| return dc;
    // Full decomposition.
    var dc = Decomp{ .form = form };
@@ -218,9 +176,8 @@ fn decompose(
 test "decompose" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    var buf: [18]u21 = undefined;
    var dc = n.decompose('é', .nfd, &buf);
@@ -280,17 +237,17 @@ pub const Result = struct {
 };
 // Compares code points by Canonical Combining Class order.
-fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool {
+fn cccLess(_: void, lhs: u21, rhs: u21) bool {
-    return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
+    return CombiningData.ccc(lhs) < CombiningData.ccc(rhs);
 }
 // Applies the Canonical Sorting Algorithm.
-fn canonicalSort(self: Normalize, cps: []u21) void {
+fn canonicalSort(cps: []u21) void {
    var i: usize = 0;
    while (i < cps.len) : (i += 1) {
        const start: usize = i;
-        while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
+        while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {}
-        mem.sort(u21, cps[start..i], self, cccLess);
+        mem.sort(u21, cps[start..i], {}, cccLess);
    }
 }
@@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
        }
    }
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
    return try dcp_list.toOwnedSlice();
 }
@@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 test "nfd ASCII / no-alloc" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    const result = try n.nfd(allocator, "Hello World!");
@@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" {
 test "nfd !ASCII / alloc" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" {
 test "nfkd ASCII / no-alloc" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    const result = try n.nfkd(allocator, "Hello World!");
@@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" {
 test "nfkd !ASCII / alloc" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -408,7 +365,7 @@ pub fn nfdCodePoints(
        }
    }
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
    return try dcp_list.toOwnedSlice();
 }
@@ -433,15 +390,15 @@ pub fn nfkdCodePoints(
        }
    }
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
    return try dcp_list.toOwnedSlice();
 }
 // Composition (NFC, NFKC)
-fn isHangul(self: Normalize, cp: u21) bool {
+fn isHangul(cp: u21) bool {
-    return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none;
+    return cp >= 0x1100 and HangulData.syllable(cp) != .none;
 }
 /// Normalizes `str` to NFC.
@@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
        block_check: while (i < dcps.len) : (i += 1) {
            const C = dcps[i];
            if (C == tombstone) continue :block_check;
-            const cc_C = self.ccc_data.ccc(C);
+            const cc_C = CombiningData.ccc(C);
            var starter_index: ?usize = null;
            var j: usize = i;
@@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                if (dcps[j] == tombstone) continue;
                // Check for starter.
-                if (self.ccc_data.isStarter(dcps[j])) {
+                if (CombiningData.isStarter(dcps[j])) {
                    // Check for blocking conditions.
                    for (dcps[(j + 1)..i]) |B| {
                        if (B == tombstone) continue;
-                        const cc_B = self.ccc_data.ccc(B);
+                        const cc_B = CombiningData.ccc(B);
-                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
+                        if (cc_B != 0 and isHangul(C)) continue :block_check;
                        if (cc_B >= cc_C) continue :block_check;
                    }
@@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                // If L and C are Hangul syllables, we can compose
                // them algorithmically if possible.
-                if (self.isHangul(L) and self.isHangul(C)) {
+                if (isHangul(L) and isHangul(C)) {
                    // Get Hangul syllable types.
-                    const l_stype = self.hangul_data.syllable(L);
+                    const l_stype = HangulData.syllable(L);
-                    const c_stype = self.hangul_data.syllable(C);
+                    const c_stype = HangulData.syllable(C);
                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T canonical composition.
@@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                        // Composition Exclusions  (FCX) list,
                        // preventing it from appearing in any
                        // composed form (NFC, NFKC).
-                        if (!self.normp_data.isFcx(P)) {
+                        if (!NormPropsData.isFcx(P)) {
                            dcps[sidx] = P;
                            dcps[i] = tombstone; // Mark for deletion.
                            deleted += 1;
@@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 test "nfc" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -588,7 +545,7 @@ test "nfc" {
 test "nfkc" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
 test "eql" {
    const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);
    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -666,13 +623,13 @@ const mem = std.mem;
 const simd = std.simd;
 const testing = std.testing;
 const unicode = std.unicode;
-const Allocator = std.mem.Allocator;
+const Allocator = mem.Allocator;
 const ascii = @import("ascii");
 const CodePointIterator = @import("code_point").Iterator;
 const CanonData = @import("CanonData");
-const CccData = @import("CombiningData");
+const CombiningData = @import("CombiningData");
 const CompatData = @import("CompatData");
 const HangulData = @import("HangulData");
 const NormPropsData = @import("NormPropsData");
diff --git a/src/comptime_map.zig b/src/comptime_map.zig
new file mode 100644
index 0000000..b2c4d11
--- /dev/null
+++ b/src/comptime_map.zig
@@ -0,0 +1,176 @@
+//! A copypasta of Vexu's comptime hashmap:
+//!
+//! https://github.com/Vexu/comptime_hash_map
+//!
+//! (License at base of file, thanks Vexu!)
+//!
+/// A comptime hashmap constructed with automatically selected hash and eql functions.
+pub fn AutoComptimeHashMap(comptime K: type, comptime V: type, comptime values: anytype) type {
+    return ComptimeHashMap(K, V, hash_map.AutoContext(K), values);
+}
+/// Builtin hashmap for strings as keys.
+pub fn ComptimeStringHashMap(comptime V: type, comptime values: anytype) type {
+    return ComptimeHashMap([]const u8, V, hash_map.StringContext, values);
+}
+/// A hashmap which is constructed at compile time from constant values.
+/// Intended to be used as a faster lookup table.
+pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, comptime values: anytype) type {
+    std.debug.assert(values.len != 0);
+    @setEvalBranchQuota(1000 * values.len);
+    const Entry = struct {
+        key: K = undefined,
+        val: V = undefined,
+        used: bool = false,
+    };
+    // ensure that the hash map will be at most 60% full
+    const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
+    comptime var slots = [1]Entry{.{}} ** size;
+    comptime var distance: [size]usize = .{0} ** size;
+    comptime var max_distance_from_start_index = 0;
+    slot_loop: for (values) |kv| {
+        var key: K = kv.@"0";
+        var value: V = kv.@"1";
+        const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1);
+        var roll_over = 0;
+        var distance_from_start_index = 0;
+        while (roll_over < size) : ({
+            roll_over += 1;
+            distance_from_start_index += 1;
+        }) {
+            const index = (start_index + roll_over) & (size - 1);
+            const entry = &slots[index];
+            if (entry.used and !ctx.eql(undefined, entry.key, key)) {
+                if (distance[index] < distance_from_start_index) {
+                    // robin hood to the rescue
+                    const tmp = slots[index];
+                    max_distance_from_start_index = @max(max_distance_from_start_index, distance_from_start_index);
+                    entry.* = .{
+                        .used = true,
+                        .key = key,
+                        .val = value,
+                    };
+                    const tmp_distance = distance[index];
+                    distance[index] = distance_from_start_index;
+                    key = tmp.key;
+                    value = tmp.val;
+                    distance_from_start_index = tmp_distance;
+                }
+                continue;
+            }
+            max_distance_from_start_index = @max(distance_from_start_index, max_distance_from_start_index);
+            entry.* = .{
+                .used = true,
+                .key = key,
+                .val = value,
+            };
+            distance[index] = distance_from_start_index;
+            continue :slot_loop;
+        }
+        unreachable; // put into a full map
+    }
+    return struct {
+        const entries = slots;
+        pub fn has(key: K) bool {
+            return get(key) != null;
+        }
+        pub fn get(key: K) ?*const V {
+            const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1);
+            {
+                var roll_over: usize = 0;
+                while (roll_over <= max_distance_from_start_index) : (roll_over += 1) {
+                    const index = (start_index + roll_over) & (size - 1);
+                    const entry = &entries[index];
+                    if (!entry.used) return null;
+                    if (ctx.eql(undefined, entry.key, key)) return &entry.val;
+                }
+            }
+            return null;
+        }
+    };
+}
+test "basic usage" {
+    const map = ComptimeStringHashMap(usize, .{
+        .{ "foo", 1 },
+        .{ "bar", 2 },
+        .{ "baz", 3 },
+        .{ "quux", 4 },
+        .{ "Foo", 1 },
+        .{ "Bar", 2 },
+        .{ "Baz", 3 },
+        .{ "Quux", 4 },
+    });
+    try testing.expect(map.has("foo"));
+    try testing.expect(map.has("bar"));
+    try testing.expect(map.has("Foo"));
+    try testing.expect(map.has("Bar"));
+    try testing.expect(!map.has("zig"));
+    try testing.expect(!map.has("ziguana"));
+    try testing.expect(map.get("baz").?.* == 3);
+    try testing.expect(map.get("quux").?.* == 4);
+    try testing.expect(map.get("nah") == null);
+    try testing.expect(map.get("...") == null);
+}
+test "auto comptime hash map" {
+    const map = AutoComptimeHashMap(usize, []const u8, .{
+        .{ 1, "foo" },
+        .{ 2, "bar" },
+        .{ 3, "baz" },
+        .{ 45, "quux" },
+    });
+    try testing.expect(map.has(1));
+    try testing.expect(map.has(2));
+    try testing.expect(!map.has(4));
+    try testing.expect(!map.has(1_000_000));
+    try testing.expectEqualStrings("foo", map.get(1).?.*);
+    try testing.expectEqualStrings("bar", map.get(2).?.*);
+    try testing.expect(map.get(4) == null);
+    try testing.expect(map.get(4_000_000) == null);
+}
+const std = @import("std");
+const hash_map = std.hash_map;
+const testing = std.testing;
+const math = std.math;
+// MIT License
+//
+// Copyright (c) 2020 Veikka Tuominen
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
author	Sam Atman	2026-02-04 18:01:36 -0500
committer	Sam Atman	2026-02-04 18:01:36 -0500
commit	ba5d9081b479e95ffa7f3baf751beedd370cec14 (patch)
tree	c12041d8aab9f9ff68b25a2e2c9042073c3d5f61
parent	Convert Words module to no-allocation (diff)
download	zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.tar.gz zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.tar.xz zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.zip