//! Canonicalization Data
//!
//! Holds the two-stage canonical decomposition table (`s1` / `s2`) imported
//! from the `canon` module, plus a runtime map from a decomposed pair of code
//! points back to its composite.

/// Stage-one table: indexed by `cp >> 8`, yields a base offset into `s2`.
s1: []const u16 = undefined,
/// Stage-two table: one decomposition entry per (offset + low byte of `cp`).
s2: []const @import("canon").Canonicalization = undefined,
/// Runtime map from a two code point decomposition to its composite,
/// populated by `init` and released by `deinit`.
nfc: std.AutoHashMapUnmanaged([2]u21, u21),

const CanonData = @This();

// The comptime map below uses an explicit hash/equality context rather than
// auto-hashing. There's a bug in how `u21` values are handled at comptime
// versus at runtime: the "unique representation" claim for `[2]u21` does not
// hold up, so both code points are widened and combined by hand instead.
const Context = struct {
    /// Hashes a pair of code points.
    pub fn hash(_: Context, cps: [2]u21) u64 {
        // A u21 is below 2^21, so shifting the first code point left by 22
        // keeps the two values in disjoint bit ranges of the u64.
        const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1];
        return std.hash.int(cp_44);
    }

    /// Two pairs are equal when both code points match position by position.
    pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool {
        return cps1[0] == cps2[0] and cps1[1] == cps2[1];
    }
};

/// Comptime map from a two code point decomposition to its composite, built
/// from the `c_map` data exported by the `canon` module.
const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);

/// Builds a `CanonData` backed by the tables in the `canon` module.
///
/// For every code point in `canon.composite`, its decomposition (asserted to
/// be exactly two code points) is inserted into `nfc` as the key, with the
/// composite as the value. Any error from inserting into `nfc` is returned,
/// in which case the partially filled map is released first.
///
/// Release the result with `deinit`, passing the same `allocator`.
pub fn init(allocator: mem.Allocator) !CanonData {
    var cdata = CanonData{
        .nfc = .empty,
    };
    errdefer cdata.deinit(allocator);

    const data = @import("canon");
    cdata.s1 = &data.s1;
    cdata.s2 = &data.s2;

    for (data.composite) |cp| {
        const cps = cdata.toNfd(cp);
        std.debug.assert(cps.len == 2);
        try cdata.nfc.put(allocator, cps[0..2].*, cp);
    }

    return cdata;
}

/// Releases the memory held by `nfc`. `allocator` is the one given to `init`.
pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
    cdata.nfc.deinit(allocator);
}

/// Returns canonical decomposition for `cp`.
///
/// The entry is found with a two-stage lookup: `s1[cp >> 8]` gives a base
/// offset into `s2`, and the low eight bits of `cp` select the entry from
/// there. The returned slice is a view of the first `len` code points of that
/// entry; nothing is allocated.
pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
    const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
    return canon.cps[0..canon.len];
}

/// Returns the primary composite for the codepoints in `cps`, or `null` when
/// the pair has no entry.
///
/// The lookup goes through the comptime `c_map`; `cdata` itself is not
/// consulted.
pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
    _ = cdata;
    return if (c_map.get(cps)) |cpp| cpp.* else null;
}

const std = @import("std");
const builtin = @import("builtin");
const mem = std.mem;
const comptime_map = @import("comptime_map.zig");