From 904fa4d94f30825bec490133ff402c6350f45e26 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Wed, 4 Feb 2026 21:21:14 -0500
Subject: Teasing out canonicalization

After coping with a spuriously broken autohash for a while, I got the
one remaining hash table moved into static memory, so there's no
further reason to put up with allocation of basic structures.

So that's nice.
---
 build.zig            |   2 +-
 codegen/canon.zig    | 182 ++++++++++++++++++++++++++++++++++++++++++++-------
 src/CanonData.zig    |  87 ++++++++++++------------
 src/comptime_map.zig |  12 +++-
 4 files changed, 214 insertions(+), 69 deletions(-)

diff --git a/build.zig b/build.zig
index be91f50..3cf69aa 100644
--- a/build.zig
+++ b/build.zig
@@ -91,7 +91,7 @@ pub fn build(b: *std.Build) void {
     });
     canon_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") });
     const run_canon_gen_exe = b.addRunArtifact(canon_gen_exe);
-    const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.bin.z");
+    const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.zig");
 
     const compat_gen_exe = b.addExecutable(.{
         .name = "compat",
diff --git a/codegen/canon.zig b/codegen/canon.zig
index d95a905..e92be5d 100644
--- a/codegen/canon.zig
+++ b/codegen/canon.zig
@@ -1,12 +1,38 @@
 const std = @import("std");
 const builtin = @import("builtin");
 
+const block_size = 256;
+const Block = [block_size]Canonicalization;
+
+const Canonicalization = struct {
+    len: u3 = 0,
+    cps: [2]u21 = [_]u21{0} ** 2,
+};
+
+const BlockMap = std.HashMap(
+    Block,
+    u16,
+    struct {
+        pub fn hash(_: @This(), k: Block) u64 {
+            var hasher = std.hash.Wyhash.init(0);
+            std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
+            return hasher.final();
+        }
+
+        pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool {
+            return for (aBlock, bBlock) |a, b| {
+                if (a.len != b.len or a.cps[0] != b.cps[0] or a.cps[1] != b.cps[1]) return false;
+            } else true;
+        }
+    },
+    std.hash_map.default_max_load_percentage,
+);
+
 pub fn main() anyerror!void {
     var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
     defer arena.deinit();
     const allocator = arena.allocator();
-    var write_buf: [4096]u8 = undefined;
 
     // Process UnicodeData.txt
     var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
     var args_iter = try std.process.argsWithAllocator(allocator);
@@ -14,53 +40,163 @@ pub fn main() anyerror!void {
     _ = args_iter.skip();
     const output_path = args_iter.next() orelse @panic("No output file arg!");
 
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    var file_writer = out_file.writer(&write_buf);
-    var writer = &file_writer.interface;
-    const endian = builtin.cpu.arch.endian();
+    var canon_map = std.AutoHashMap(u21, Canonicalization).init(allocator);
+    defer canon_map.deinit();
+
+    var composite_set = std.AutoArrayHashMap(u21, [2]u21).init(allocator);
 
-    lines: while (in_reader.takeDelimiterInclusive('\n')) |took| {
-        const line = std.mem.trimRight(u8, took, "\n");
+    while (in_reader.takeDelimiterInclusive('\n')) |line| {
         if (line.len == 0) continue;
         var field_iter = std.mem.splitScalar(u8, line, ';');
-        var cps: [3]u24 = undefined;
-        var len: u8 = 2;
+        var cp: u21 = undefined;
         var i: usize = 0;
 
         while (field_iter.next()) |field| : (i += 1) {
+            if (field.len == 0) continue;
+
             switch (i) {
-                0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
+                0 => cp = try std.fmt.parseInt(u21, field, 16),
                 5 => {
                     // Not canonical.
-                    if (field.len == 0 or field[0] == '<') continue :lines;
+                    if (field[0] == '<') continue;
+
                     if (std.mem.indexOfScalar(u8, field, ' ')) |space| {
                         // Canonical
-                        len = 3;
-                        cps[1] = try std.fmt.parseInt(u24, field[0..space], 16);
-                        cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16);
+                        const c0, const c1 = .{
+                            try std.fmt.parseInt(u21, field[0..space], 16),
+                            try std.fmt.parseInt(u21, field[space + 1 ..], 16),
+                        };
+                        try canon_map.put(cp, Canonicalization{
+                            .len = 2,
+                            .cps = [_]u21{ c0, c1 },
+                        });
+                        try composite_set.put(cp, [_]u21{ c0, c1 });
                     } else {
                         // Singleton
-                        cps[1] = try std.fmt.parseInt(u24, field, 16);
+                        try canon_map.put(cp, Canonicalization{
+                            .len = 1,
+                            .cps = [_]u21{
+                                try std.fmt.parseInt(u21, field, 16),
+                                0,
+                            },
+                        });
                     }
                 },
-                2 => if (line[0] == '<') continue :lines,
                 else => {},
             }
         }
-
-        try writer.writeInt(u8, @intCast(len), endian);
-        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
     } else |err| switch (err) {
         error.EndOfStream => {},
        else => {
            return err;
        },
     }
-    try writer.writeInt(u16, 0, endian);
-    try writer.flush();
+
+    // Build multi-tiered lookup tables for decompositions
+    var blocks_map = BlockMap.init(allocator);
+    defer blocks_map.deinit();
+
+    var stage1 = std.array_list.Managed(u16).init(allocator);
+    defer stage1.deinit();
+
+    var stage2 = std.array_list.Managed(Canonicalization).init(allocator);
+    defer stage2.deinit();
+
+    var block: Block = [_]Canonicalization{.{}} ** block_size;
+    var block_len: u16 = 0;
+
+    for (0..0x110000) |i| {
+        const cp: u21 = @intCast(i);
+
+        const canon: Canonicalization = canon_map.get(cp) orelse .{};
+
+        block[block_len] = canon;
+        block_len += 1;
+
+        if (block_len < block_size and cp != 0x10ffff) continue;
+
+        const gop = try blocks_map.getOrPut(block);
+        if (!gop.found_existing) {
+            gop.value_ptr.* = @intCast(stage2.items.len);
+            try stage2.appendSlice(&block);
+        }
+
+        try stage1.append(gop.value_ptr.*);
+        block_len = 0;
+    }
+
+    var write_buf: [4096]u8 = undefined;
+    var out_file = try std.fs.cwd().createFile(output_path, .{});
+    defer out_file.close();
+    var writer = out_file.writer(&write_buf);
+
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+        \\
+        \\pub const Canonicalization = struct {{
+        \\    len: u3,
+        \\    cps: [2]u21,
+        \\}};
+        \\
+        \\pub const s1: [{}]u16 = .{{
+    , .{stage1.items.len});
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]Canonicalization = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| {
+        try writer.interface.print(".{{ .len = {}, .cps = .{{ {}, {} }} }}, ", .{
+            entry.len,
+            entry.cps[0],
+            entry.cps[1],
+        });
+    }
+
+    const composite = composite_set.keys();
+    // TODO: cut
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const composite: [{}]u21 = .{{
+    , .{composite.len});
+    for (composite) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.writeAll(
+        \\};
+    );
+
+    try writer.interface.print(
+        \\
+        \\ pub const c_map: [{}]struct {{ [2]u21, u21 }} = .{{
+    , .{composite.len});
+    for (composite) |comp| {
+        const canon = canon_map.get(comp).?;
+        std.debug.assert(canon.len == 2);
+        try writer.interface.print(
+            \\ .{{ .{{{}, {}}}, {}}},
+        ,
+            .{ canon.cps[0], canon.cps[1], comp },
+        );
+    }
+    // var c_entries = composite_set.iterator();
+    // while (c_entries.next()) |entry| {
+    //     try writer.interface.print(
+    //         \\ .{{ .{{{}, {}}}, {}}},
+    //     ,
+    //         .{ entry.value_ptr[0], entry.value_ptr[1], entry.key_ptr.* },
+    //     );
+    // }
+    try writer.interface.writeAll(
+        \\};
+    );
+
+    try writer.interface.flush();
 }
diff --git a/src/CanonData.zig b/src/CanonData.zig
index cf9dc8a..c972534 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,78 +1,77 @@
 //! Canonicalization Data
 
+s1: []const u16 = undefined,
+s2: []const @import("canon").Canonicalization = undefined,
 nfc: std.AutoHashMapUnmanaged([2]u21, u21),
-nfd: [][]u21 = undefined,
-cps: []u21 = undefined,
 
 const CanonData = @This();
 
-pub fn init(allocator: mem.Allocator) !CanonData {
-    const in_bytes = @embedFile("canon");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
+// There's a bug here, which comes down to how static (comptime) u21 values are
+// hashed versus runtime ones: the "unique representation" claim is not working out. So we do this:
 
-    const endian = builtin.cpu.arch.endian();
-    var cdata = CanonData{
-        .nfc = .empty,
-        .nfd = try allocator.alloc([]u21, 0x110000),
-    };
-    {
-        errdefer allocator.free(cdata.nfd);
-        cdata.cps = try allocator.alloc(u21, magic.canon_size);
+const Context = struct {
+    pub fn hash(_: Context, cps: [2]u21) u64 {
+        const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1];
+        return std.hash.int(cp_44);
     }
-    var total_cp: u24 = undefined;
-
-    errdefer {
-        cdata.nfc.deinit(allocator);
-        allocator.free(cdata.cps);
-        allocator.free(cdata.nfd);
+    pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool {
+        return cps1[0] == cps2[0] and cps1[1] == cps2[1];
     }
+};
 
-    @memset(cdata.nfd, &.{});
+const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);
 
-    var total_len: usize = 0;
+pub fn init(allocator: mem.Allocator) !CanonData {
+    var cdata = CanonData{
+        .nfc = .empty,
+    };
+    errdefer cdata.deinit(allocator);
 
-    while (true) {
-        const len: u8 = try reader.readInt(u8, endian);
-        if (len == 0) break;
-        const cp = try reader.readInt(u24, endian);
-        total_cp = cp;
-        const nfd_cp = cdata.cps[total_len..][0 .. len - 1];
-        for (0..len - 1) |i| {
-            nfd_cp[i] = @intCast(try reader.readInt(u24, endian));
-        }
-        if (len == 3) {
-            try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp));
-        }
-        cdata.nfd[cp] = nfd_cp;
-        total_len += len - 1;
+    const data = @import("canon");
+    cdata.s1 = &data.s1;
+    cdata.s2 = &data.s2;
+    var count: usize = 0;
+    for (data.composite) |cp| {
+        count += 1;
+        const cps = cdata.toNfd(cp);
+        std.debug.assert(cps.len == 2);
+        try cdata.nfc.put(allocator, cps[0..2].*, cp);
     }
 
-    if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len});
+    // var keys = cdata.nfc.keyIterator();
+    // while (keys.next()) |key| {
+    //     const c32: [2]u32 = .{ key[0], key[1] };
+    //     if (c_map.get(c32)) |_| {
+    //         std.debug.print("got", .{});
+    //     }
+    // }
 
     return cdata;
 }
 
 pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
     cdata.nfc.deinit(allocator);
-    allocator.free(cdata.cps);
-    allocator.free(cdata.nfd);
 }
 
 /// Returns canonical decomposition for `cp`.
 pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
-    return cdata.nfd[cp];
+    const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
+    return canon.cps[0..canon.len];
 }
 
 // Returns the primary composite for the codepoints in `cp`.
 pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
-    return cdata.nfc.get(cps);
+    // nfc is still built in init; lookup now goes through the comptime c_map.
+    _ = cdata;
+    if (c_map.get(cps)) |cpp| {
+        return cpp.*;
+    } else {
+        return null;
+    }
 }
 
 const std = @import("std");
 const builtin = @import("builtin");
-const compress = std.compress;
 const mem = std.mem;
-const magic = @import("magic");
-const options = @import("options");
+const comptime_map = @import("comptime_map.zig");
diff --git a/src/comptime_map.zig b/src/comptime_map.zig
index b2c4d11..d41f6f8 100644
--- a/src/comptime_map.zig
+++ b/src/comptime_map.zig
@@ -28,7 +28,7 @@ pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, c
     };
 
     // ensure that the hash map will be at most 60% full
-    const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
+    const size = math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
 
     comptime var slots = [1]Entry{.{}} ** size;
     comptime var distance: [size]usize = .{0} ** size;
@@ -148,6 +148,15 @@ test "auto comptime hash map" {
     try testing.expect(map.get(4_000_000) == null);
 }
 
+test "array pair comptime hash map" {
+    const map = AutoComptimeHashMap([2]u32, u21, .{
+        .{ .{ 2, 3 }, 5 },
+        .{ .{ 42, 56 }, 12 },
+        .{ .{ 2, 4 }, 6 },
+    });
+    try testing.expect(map.has(.{ 2, 4 }));
+}
+
 const std = @import("std");
 const hash_map = std.hash_map;
 const testing = std.testing;
@@ -174,3 +183,4 @@ const math = std.math;
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
+
-- 
cgit v1.2.3
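
A note on how the generated tables are consumed: toNfd in src/CanonData.zig is a
standard two-stage table walk, where s1 maps a code point's block (cp >> 8) to an
offset into s2, and identical 256-entry blocks are stored only once. The sketch
below is not code from this commit: the names Canonicalization, s1, s2, and toNfd
mirror the patch, but the block size (4 instead of 256) and every table value are
invented purely for illustration.

const std = @import("std");

const Canonicalization = struct {
    len: u3 = 0,
    cps: [2]u21 = .{ 0, 0 },
};

const block_size = 4;

// One s1 entry per block of `block_size` code points; the value is an offset
// into s2. Blocks 0 and 2 alias the same all-empty block at offset 0.
const s1 = [_]u16{ 0, 4, 0 };

const s2 = [_]Canonicalization{
    .{}, .{}, .{}, .{}, // offset 0: shared empty block
    .{ .len = 2, .cps = .{ 'A', 0x300 } }, // offset 4: block 1, slot 0
    .{}, .{}, .{},
};

fn toNfd(cp: u21) []const u21 {
    const canon = &s2[s1[cp / block_size] + (cp % block_size)];
    return canon.cps[0..canon.len];
}

pub fn main() void {
    std.debug.print("{any}\n", .{toNfd(4)}); // block 1, slot 0 -> { 65, 768 }
    std.debug.print("{any}\n", .{toNfd(9)}); // block 2 aliases the empty block -> {}
}

The payoff is the same as in the commit's getOrPut loop: however many ranges of
code points have no decomposition, the empty block is stored in s2 exactly once.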
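
Likewise, the hash Context in CanonData.zig rests on packing the [2]u21 key into
a single u64 before hashing: code points top out at 0x10FFFF (21 bits), so a
22-bit shift keeps the two values from overlapping, and distinct pairs pack to
distinct integers. A quick standalone check of that assumption (arbitrary values,
runnable with zig test):

const std = @import("std");

// Same packing as the Context.hash in the patch, minus the final hash step.
fn pack(cps: [2]u21) u64 {
    return (@as(u64, cps[0]) << 22) + cps[1];
}

test "packing two u21 code points at a 22-bit offset cannot collide" {
    // The low 22 bits hold cps[1] intact; cps[0] never bleeds into them.
    try std.testing.expect(pack(.{ 0x10FFFF, 0 }) != pack(.{ 0, 0x10FFFF }));
    try std.testing.expect(pack(.{ 1, 2 }) != pack(.{ 2, 1 }));
}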