From 904fa4d94f30825bec490133ff402c6350f45e26 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Wed, 4 Feb 2026 21:21:14 -0500
Subject: Teasing out canonicalization

After coping with a spuriously broken autohash for a while, I got the
one remaining hash table moved into static memory, so there's no
further reason to put up with allocation of basic structures.

So that's nice.
---
 build.zig            |   2 +-
 codegen/canon.zig    | 182 ++++++++++++++++++++++++++++++++++++++++++++-------
 src/CanonData.zig    |  87 ++++++++++++------------
 src/comptime_map.zig |  12 +++-
 4 files changed, 214 insertions(+), 69 deletions(-)

diff --git a/build.zig b/build.zig
index be91f50..3cf69aa 100644
--- a/build.zig
+++ b/build.zig
@@ -91,7 +91,7 @@ pub fn build(b: *std.Build) void {
     });
     canon_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") });
     const run_canon_gen_exe = b.addRunArtifact(canon_gen_exe);
-    const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.bin.z");
+    const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.zig");
 
     const compat_gen_exe = b.addExecutable(.{
         .name = "compat",
diff --git a/codegen/canon.zig b/codegen/canon.zig
index d95a905..e92be5d 100644
--- a/codegen/canon.zig
+++ b/codegen/canon.zig
@@ -1,12 +1,38 @@
 const std = @import("std");
 const builtin = @import("builtin");
 
+const block_size = 256;
+const Block = [block_size]Canonicalization;
+
+const Canonicalization = struct {
+    len: u3 = 0,
+    cps: [2]u21 = [_]u21{0} ** 2,
+};
+
+const BlockMap = std.HashMap(
+    Block,
+    u16,
+    struct {
+        pub fn hash(_: @This(), k: Block) u64 {
+            var hasher = std.hash.Wyhash.init(0);
+            std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
+            return hasher.final();
+        }
+
+        pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool {
+            return for (aBlock, bBlock) |a, b| {
+                if (a.len != b.len or a.cps[0] != b.cps[0] or a.cps[1] != b.cps[1]) return false;
+            } else true;
+        }
+    },
+    std.hash_map.default_max_load_percentage,
+);
+
 pub fn main() anyerror!void {
     var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
     defer arena.deinit();
     const allocator = arena.allocator();
-    var write_buf: [4096]u8 = undefined;
 
     // Process UnicodeData.txt
     var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
     var args_iter = try std.process.argsWithAllocator(allocator);
@@ -14,53 +40,163 @@ pub fn main() anyerror!void {
     _ = args_iter.skip();
     const output_path = args_iter.next() orelse @panic("No output file arg!");
 
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    var file_writer = out_file.writer(&write_buf);
-    var writer = &file_writer.interface;
-    const endian = builtin.cpu.arch.endian();
+    var canon_map = std.AutoHashMap(u21, Canonicalization).init(allocator);
+    defer canon_map.deinit();
+
+    var composite_set = std.AutoArrayHashMap(u21, [2]u21).init(allocator);
 
-    lines: while (in_reader.takeDelimiterInclusive('\n')) |took| {
-        const line = std.mem.trimRight(u8, took, "\n");
+    while (in_reader.takeDelimiterInclusive('\n')) |line| {
         if (line.len == 0) continue;
         var field_iter = std.mem.splitScalar(u8, line, ';');
-        var cps: [3]u24 = undefined;
-        var len: u8 = 2;
+        var cp: u21 = undefined;
         var i: usize = 0;
 
         while (field_iter.next()) |field| : (i += 1) {
+            if (field.len == 0) continue;
+
             switch (i) {
-                0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
+                0 => cp = try std.fmt.parseInt(u21, field, 16),
                 5 => {
                     // Not canonical.
-                    if (field.len == 0 or field[0] == '<') continue :lines;
+                    if (field[0] == '<') continue;
+
                     if (std.mem.indexOfScalar(u8, field, ' ')) |space| {
                         // Canonical
-                        len = 3;
-                        cps[1] = try std.fmt.parseInt(u24, field[0..space], 16);
-                        cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16);
+                        const c0, const c1 = .{
+                            try std.fmt.parseInt(u21, field[0..space], 16),
+                            try std.fmt.parseInt(u21, field[space + 1 ..], 16),
+                        };
+                        try canon_map.put(cp, Canonicalization{
+                            .len = 2,
+                            .cps = [_]u21{ c0, c1 },
+                        });
+                        try composite_set.put(cp, [_]u21{ c0, c1 });
                     } else {
                         // Singleton
-                        cps[1] = try std.fmt.parseInt(u24, field, 16);
+                        try canon_map.put(cp, Canonicalization{
+                            .len = 1,
+                            .cps = [_]u21{
+                                try std.fmt.parseInt(u21, field, 16),
+                                0,
+                            },
+                        });
                     }
                 },
-                2 => if (line[0] == '<') continue :lines,
                 else => {},
             }
         }
-
-        try writer.writeInt(u8, @intCast(len), endian);
-        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
     } else |err| switch (err) {
         error.EndOfStream => {},
        else => {
            return err;
        },
     }
-    try writer.writeInt(u16, 0, endian);
-    try writer.flush();
+
+    // Build multi-tiered lookup tables for decompositions
+    var blocks_map = BlockMap.init(allocator);
+    defer blocks_map.deinit();
+
+    var stage1 = std.array_list.Managed(u16).init(allocator);
+    defer stage1.deinit();
+
+    var stage2 = std.array_list.Managed(Canonicalization).init(allocator);
+    defer stage2.deinit();
+
+    var block: Block = [_]Canonicalization{.{}} ** block_size;
+    var block_len: u16 = 0;
+
+    for (0..0x110000) |i| {
+        const cp: u21 = @intCast(i);
+
+        const canon: Canonicalization = canon_map.get(cp) orelse .{};
+
+        block[block_len] = canon;
+        block_len += 1;
+
+        if (block_len < block_size and cp != 0x10ffff) continue;
+
+        const gop = try blocks_map.getOrPut(block);
+        if (!gop.found_existing) {
+            gop.value_ptr.* = @intCast(stage2.items.len);
+            try stage2.appendSlice(&block);
+        }
+
+        try stage1.append(gop.value_ptr.*);
+        block_len = 0;
+    }
+
+    var write_buf: [4096]u8 = undefined;
+    var out_file = try std.fs.cwd().createFile(output_path, .{});
+    defer out_file.close();
+    var writer = out_file.writer(&write_buf);
+
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+        \\
+        \\pub const Canonicalization = struct {{
+        \\    len: u3,
+        \\    cps: [2]u21,
+        \\}};
+        \\
+        \\pub const s1: [{}]u16 = .{{
+    , .{stage1.items.len});
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]Canonicalization = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| {
+        try writer.interface.print(".{{ .len = {}, .cps = .{{ {}, {} }} }}, ", .{
+            entry.len,
+            entry.cps[0],
+            entry.cps[1],
+        });
+    }
+
+    const composite = composite_set.keys();
+    // TODO: cut
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const composite: [{}]u21 = .{{
+    , .{composite.len});
+    for (composite) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.writeAll(
+        \\};
+    );
+
+    try writer.interface.print(
+        \\
+        \\ pub const c_map: [{}]struct {{ [2]u21, u21 }} = .{{
+    , .{composite.len});
+    for (composite) |comp| {
+        const canon = canon_map.get(comp).?;
+        std.debug.assert(canon.len == 2);
+        try writer.interface.print(
+            \\ .{{ .{{{}, {}}}, {}}},
+        ,
+            .{ canon.cps[0], canon.cps[1], comp },
+        );
+    }
+    // var c_entries = composite_set.iterator();
+    // while (c_entries.next()) |entry| {
+    //     try writer.interface.print(
+    //         \\ .{{ .{{{}, {}}}, {}}},
+    //     ,
+    //         .{ entry.value_ptr[0], entry.value_ptr[1], entry.key_ptr.* },
+    //     );
+    // }
+    try writer.interface.writeAll(
+        \\};
+    );
+
+    try writer.interface.flush();
 }
diff --git a/src/CanonData.zig b/src/CanonData.zig
index cf9dc8a..c972534 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,78 +1,77 @@
 //! Canonicalization Data
 
+s1: []const u16 = undefined,
+s2: []const @import("canon").Canonicalization = undefined,
 nfc: std.AutoHashMapUnmanaged([2]u21, u21),
-nfd: [][]u21 = undefined,
-cps: []u21 = undefined,
 
 const CanonData = @This();
 
-pub fn init(allocator: mem.Allocator) !CanonData {
-    const in_bytes = @embedFile("canon");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
+// There's a bug here, which comes down to how static (comptime) u21 values are
+// hashed versus runtime ones: the "unique representation" claim is not working out. So we do this:
 
-    const endian = builtin.cpu.arch.endian();
-    var cdata = CanonData{
-        .nfc = .empty,
-        .nfd = try allocator.alloc([]u21, 0x110000),
-    };
-    {
-        errdefer allocator.free(cdata.nfd);
-        cdata.cps = try allocator.alloc(u21, magic.canon_size);
+const Context = struct {
+    pub fn hash(_: Context, cps: [2]u21) u64 {
+        const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1];
+        return std.hash.int(cp_44);
     }
-    var total_cp: u24 = undefined;
-
-    errdefer {
-        cdata.nfc.deinit(allocator);
-        allocator.free(cdata.cps);
-        allocator.free(cdata.nfd);
+    pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool {
+        return cps1[0] == cps2[0] and cps1[1] == cps2[1];
     }
+};
 
-    @memset(cdata.nfd, &.{});
+const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);
 
-    var total_len: usize = 0;
+pub fn init(allocator: mem.Allocator) !CanonData {
+    var cdata = CanonData{
+        .nfc = .empty,
+    };
+    errdefer cdata.deinit(allocator);
 
-    while (true) {
-        const len: u8 = try reader.readInt(u8, endian);
-        if (len == 0) break;
-        const cp = try reader.readInt(u24, endian);
-        total_cp = cp;
-        const nfd_cp = cdata.cps[total_len..][0 .. len - 1];
-        for (0..len - 1) |i| {
-            nfd_cp[i] = @intCast(try reader.readInt(u24, endian));
-        }
-        if (len == 3) {
-            try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp));
-        }
-        cdata.nfd[cp] = nfd_cp;
-        total_len += len - 1;
+    const data = @import("canon");
+    cdata.s1 = &data.s1;
+    cdata.s2 = &data.s2;
+    var count: usize = 0;
+    for (data.composite) |cp| {
+        count += 1;
+        const cps = cdata.toNfd(cp);
+        std.debug.assert(cps.len == 2);
+        try cdata.nfc.put(allocator, cps[0..2].*, cp);
     }
 
-    if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len});
+    // var keys = cdata.nfc.keyIterator();
+    // while (keys.next()) |key| {
+    //     const c32: [2]u32 = .{ key[0], key[1] };
+    //     if (c_map.get(c32)) |_| {
+    //         std.debug.print("got", .{});
+    //     }
+    // }
 
     return cdata;
 }
 
 pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
     cdata.nfc.deinit(allocator);
-    allocator.free(cdata.cps);
-    allocator.free(cdata.nfd);
 }
 
 /// Returns canonical decomposition for `cp`.
 pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
-    return cdata.nfd[cp];
+    const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
+    return canon.cps[0..canon.len];
 }
 
 // Returns the primary composite for the codepoints in `cp`.
 pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
-    return cdata.nfc.get(cps);
+    // nfc is still built in init; lookup now goes through the comptime c_map.
+    _ = cdata;
+    if (c_map.get(cps)) |cpp| {
+        return cpp.*;
+    } else {
+        return null;
+    }
 }
 
 const std = @import("std");
 const builtin = @import("builtin");
-const compress = std.compress;
 const mem = std.mem;
-const magic = @import("magic");
-const options = @import("options");
+const comptime_map = @import("comptime_map.zig");
diff --git a/src/comptime_map.zig b/src/comptime_map.zig
index b2c4d11..d41f6f8 100644
--- a/src/comptime_map.zig
+++ b/src/comptime_map.zig
@@ -28,7 +28,7 @@ pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, c
     };
 
     // ensure that the hash map will be at most 60% full
-    const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
+    const size = math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
 
     comptime var slots = [1]Entry{.{}} ** size;
     comptime var distance: [size]usize = .{0} ** size;
@@ -148,6 +148,15 @@ test "auto comptime hash map" {
     try testing.expect(map.get(4_000_000) == null);
 }
 
+test "array pair comptime hash map" {
+    const map = AutoComptimeHashMap([2]u32, u21, .{
+        .{ .{ 2, 3 }, 5 },
+        .{ .{ 42, 56 }, 12 },
+        .{ .{ 2, 4 }, 6 },
+    });
+    try testing.expect(map.has(.{ 2, 4 }));
+}
+
 const std = @import("std");
 const hash_map = std.hash_map;
 const testing = std.testing;
@@ -174,3 +183,4 @@ const math = std.math;
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
+
-- 
cgit v1.2.3
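
A note on how the generated tables are consumed: toNfd in src/CanonData.zig is a
standard two-stage table walk, where s1 maps a code point's block (cp >> 8) to an
offset into s2, and identical 256-entry blocks are stored only once. The sketch
below is not code from this commit: the names Canonicalization, s1, s2, and toNfd
mirror the patch, but the block size (4 instead of 256) and every table value are
invented purely for illustration.

const std = @import("std");

const Canonicalization = struct {
    len: u3 = 0,
    cps: [2]u21 = .{ 0, 0 },
};

const block_size = 4;

// One s1 entry per block of `block_size` code points; the value is an offset
// into s2. Blocks 0 and 2 alias the same all-empty block at offset 0.
const s1 = [_]u16{ 0, 4, 0 };

const s2 = [_]Canonicalization{
    .{}, .{}, .{}, .{}, // offset 0: shared empty block
    .{ .len = 2, .cps = .{ 'A', 0x300 } }, // offset 4: block 1, slot 0
    .{}, .{}, .{},
};

fn toNfd(cp: u21) []const u21 {
    const canon = &s2[s1[cp / block_size] + (cp % block_size)];
    return canon.cps[0..canon.len];
}

pub fn main() void {
    std.debug.print("{any}\n", .{toNfd(4)}); // block 1, slot 0 -> { 65, 768 }
    std.debug.print("{any}\n", .{toNfd(9)}); // block 2 aliases the empty block -> {}
}

The payoff is the same as in the commit's getOrPut loop: however many ranges of
code points have no decomposition, the empty block is stored in s2 exactly once.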
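
Likewise, the hash Context in CanonData.zig rests on packing the [2]u21 key into
a single u64 before hashing: code points top out at 0x10FFFF (21 bits), so a
22-bit shift keeps the two values from overlapping, and distinct pairs pack to
distinct integers. A quick standalone check of that assumption (arbitrary values,
runnable with zig test):

const std = @import("std");

// Same packing as the Context.hash in the patch, minus the final hash step.
fn pack(cps: [2]u21) u64 {
    return (@as(u64, cps[0]) << 22) + cps[1];
}

test "packing two u21 code points at a 22-bit offset cannot collide" {
    // The low 22 bits hold cps[1] intact; cps[0] never bleeds into them.
    try std.testing.expect(pack(.{ 0x10FFFF, 0 }) != pack(.{ 0, 0x10FFFF }));
    try std.testing.expect(pack(.{ 1, 2 }) != pack(.{ 2, 1 }));
}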