From 95f9487f6a7bde2d7266399bdf6843b97cc1b301 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 5 Feb 2026 07:07:40 -0500 Subject: Base units do not allocate CanonData included. I may still sort out caseless matching without allocation, but that's a stretch goal. Closes #86 Closes #85 --- src/CanonData.zig | 57 +++++++++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) (limited to 'src/CanonData.zig') diff --git a/src/CanonData.zig b/src/CanonData.zig index c972534..5c1ffa6 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -1,13 +1,23 @@ //! Canonicalization Data -s1: []const u16 = undefined, -s2: []const @import("canon").Canonicalization = undefined, -nfc: std.AutoHashMapUnmanaged([2]u21, u21), +const Data = struct { + s1: []const u16 = undefined, + s2: []const @import("canon").Canonicalization = undefined, +}; + +const canon_data = canon_data: { + const canon_ = @import("canon"); + break :canon_data Data{ + .s1 = &canon_.s1, + .s2 = &canon_.s2, + }; +}; const CanonData = @This(); // There's a bug here, which is down to how static u21 vs. runtime are handled, -// the "unique representation" claim is not working out. So we do this: +// the "unique representation" claim is not working out. AutoHash casts to bytes, +// and that won't fly. So we do this: const Context = struct { pub fn hash(_: Context, cps: [2]u21) u64 { @@ -22,47 +32,14 @@ const Context = struct { const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map); -pub fn init(allocator: mem.Allocator) !CanonData { - var cdata = CanonData{ - .nfc = .empty, - }; - errdefer cdata.deinit(allocator); - - const data = @import("canon"); - cdata.s1 = &data.s1; - cdata.s2 = &data.s2; - var count: usize = 0; - for (data.composite) |cp| { - count += 1; - const cps = cdata.toNfd(cp); - std.debug.assert(cps.len == 2); - try cdata.nfc.put(allocator, cps[0..2].*, cp); - } - - // var keys = cdata.nfc.keyIterator(); - // while (keys.next()) |key| { - // const c32: [2]u32 = .{ key[0], key[1] }; - // if (c_map.get(c32)) |_| { - // std.debug.print("got", .{}); - // } - // } - - return cdata; -} - -pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { - cdata.nfc.deinit(allocator); -} - /// Returns canonical decomposition for `cp`. -pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { - const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)]; +pub fn toNfd(cp: u21) []const u21 { + const canon = &canon_data.s2[canon_data.s1[cp >> 8] + (cp & 0xff)]; return canon.cps[0..canon.len]; } // Returns the primary composite for the codepoints in `cp`. -pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { - _ = cdata; +pub fn toNfc(cps: [2]u21) ?u21 { if (c_map.get(cps)) |cpp| { return cpp.*; } else { -- cgit v1.2.3