From 904fa4d94f30825bec490133ff402c6350f45e26 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 21:21:14 -0500 Subject: Teasing out canonicalization After coping with a spuriously broken autohash for a while, I got the one remaining hash table moved into memory, so there's no further reason to put up with allocation of basic structures. So that's nice. --- src/CanonData.zig | 87 +++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) (limited to 'src/CanonData.zig') diff --git a/src/CanonData.zig b/src/CanonData.zig index cf9dc8a..c972534 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -1,78 +1,77 @@ //! Canonicalization Data +s1: []const u16 = undefined, +s2: []const @import("canon").Canonicalization = undefined, nfc: std.AutoHashMapUnmanaged([2]u21, u21), -nfd: [][]u21 = undefined, -cps: []u21 = undefined, const CanonData = @This(); -pub fn init(allocator: mem.Allocator) !CanonData { - const in_bytes = @embedFile("canon"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); +// There's a bug here, which is down to how static u21 vs. runtime are handled, +// the "unique representation" claim is not working out. 
So we do this: - const endian = builtin.cpu.arch.endian(); - var cdata = CanonData{ - .nfc = .empty, - .nfd = try allocator.alloc([]u21, 0x110000), - }; - { - errdefer allocator.free(cdata.nfd); - cdata.cps = try allocator.alloc(u21, magic.canon_size); +const Context = struct { + pub fn hash(_: Context, cps: [2]u21) u64 { + const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1]; + return std.hash.int(cp_44); } - var total_cp: u24 = undefined; - - errdefer { - cdata.nfc.deinit(allocator); - allocator.free(cdata.cps); - allocator.free(cdata.nfd); + pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool { + return cps1[0] == cps2[0] and cps1[1] == cps2[1]; } +}; - @memset(cdata.nfd, &.{}); +const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map); - var total_len: usize = 0; +pub fn init(allocator: mem.Allocator) !CanonData { + var cdata = CanonData{ + .nfc = .empty, + }; + errdefer cdata.deinit(allocator); - while (true) { - const len: u8 = try reader.readInt(u8, endian); - if (len == 0) break; - const cp = try reader.readInt(u24, endian); - total_cp = cp; - const nfd_cp = cdata.cps[total_len..][0 .. 
len - 1]; - for (0..len - 1) |i| { - nfd_cp[i] = @intCast(try reader.readInt(u24, endian)); - } - if (len == 3) { - try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp)); - } - cdata.nfd[cp] = nfd_cp; - total_len += len - 1; + const data = @import("canon"); + cdata.s1 = &data.s1; + cdata.s2 = &data.s2; + var count: usize = 0; + for (data.composite) |cp| { + count += 1; + const cps = cdata.toNfd(cp); + std.debug.assert(cps.len == 2); + try cdata.nfc.put(allocator, cps[0..2].*, cp); } - if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len}); + // var keys = cdata.nfc.keyIterator(); + // while (keys.next()) |key| { + // const c32: [2]u32 = .{ key[0], key[1] }; + // if (c_map.get(c32)) |_| { + // std.debug.print("got", .{}); + // } + // } return cdata; } pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { cdata.nfc.deinit(allocator); - allocator.free(cdata.cps); - allocator.free(cdata.nfd); } /// Returns canonical decomposition for `cp`. pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { - return cdata.nfd[cp]; + const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)]; + return canon.cps[0..canon.len]; } // Returns the primary composite for the codepoints in `cp`. pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { - return cdata.nfc.get(cps); + _ = cdata; + if (c_map.get(cps)) |cpp| { + return cpp.*; + } else { + return null; + } + unreachable; } const std = @import("std"); const builtin = @import("builtin"); -const compress = std.compress; const mem = std.mem; -const magic = @import("magic"); -const options = @import("options"); +const comptime_map = @import("comptime_map.zig"); -- cgit v1.2.3