diff options
| author | 2026-02-04 21:21:14 -0500 | |
|---|---|---|
| committer | 2026-02-04 21:21:14 -0500 | |
| commit | 904fa4d94f30825bec490133ff402c6350f45e26 (patch) | |
| tree | cbfc44b59ca1e539d6d2a4ff8f083fb4e25d41ef | |
| parent | Rest of the 'easy' stuff (diff) | |
| download | zg-904fa4d94f30825bec490133ff402c6350f45e26.tar.gz zg-904fa4d94f30825bec490133ff402c6350f45e26.tar.xz zg-904fa4d94f30825bec490133ff402c6350f45e26.zip | |
Teasing out canonicalization
After coping with a spuriously broken autohash for a while, I got the
one remaining hash table moved into static memory, so there's no further
reason to put up with allocating basic structures at runtime.
So that's nice.
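The heart of the change is a block-compressed, two-stage lookup table: the generator carves the codepoint space into 256-entry blocks, deduplicates identical blocks, and emits `s1` (per-block offsets) and `s2` (the unique block data). A minimal sketch of the lookup, assuming the `s1`/`s2`/`Canonicalization` shapes from this diff (`decompose` is an illustrative name):

```zig
const Canonicalization = struct {
    len: u3 = 0,
    cps: [2]u21 = .{ 0, 0 },
};

// Stage 1 holds one u16 offset per 256-codepoint block; identical blocks
// appear once in stage 2, which keeps both tables small.
fn decompose(s1: []const u16, s2: []const Canonicalization, cp: u21) []const u21 {
    const canon = &s2[s1[cp >> 8] + (cp & 0xff)];
    return canon.cps[0..canon.len];
}
```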
| -rw-r--r-- | build.zig | 2 | ||||
| -rw-r--r-- | codegen/canon.zig | 182 | ||||
| -rw-r--r-- | src/CanonData.zig | 87 | ||||
| -rw-r--r-- | src/comptime_map.zig | 12 |
4 files changed, 214 insertions, 69 deletions
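diff --git a/build.zig b/build.zig --- a/build.zig +++ b/build.zig | |||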
| @@ -91,7 +91,7 @@ pub fn build(b: *std.Build) void { | |||
| 91 | }); | 91 | }); |
| 92 | canon_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); | 92 | canon_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); |
| 93 | const run_canon_gen_exe = b.addRunArtifact(canon_gen_exe); | 93 | const run_canon_gen_exe = b.addRunArtifact(canon_gen_exe); |
| 94 | const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.bin.z"); | 94 | const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.zig"); |
| 95 | 95 | ||
| 96 | const compat_gen_exe = b.addExecutable(.{ | 96 | const compat_gen_exe = b.addExecutable(.{ |
| 97 | .name = "compat", | 97 | .name = "compat", |
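With the generator now writing Zig source (`canon.zig`) rather than the compressed binary `canon.bin.z`, the output can be consumed as an importable module instead of parsed at init time. The wiring isn't shown in this hunk, but presumably looks something like this sketch elsewhere in build.zig (`lib_mod` is a stand-in name):

```zig
// Hypothetical: expose the generated tables so src/CanonData.zig can
// reach them via @import("canon").
lib_mod.addAnonymousImport("canon", .{ .root_source_file = canon_gen_out });
```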
diff --git a/codegen/canon.zig b/codegen/canon.zig index d95a905..e92be5d 100644 --- a/codegen/canon.zig +++ b/codegen/canon.zig | |||
| @@ -1,12 +1,38 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | 2 | const builtin = @import("builtin"); |
| 3 | 3 | ||
| 4 | const block_size = 256; | ||
| 5 | const Block = [block_size]Canonicalization; | ||
| 6 | |||
| 7 | const Canonicalization = struct { | ||
| 8 | len: u3 = 0, | ||
| 9 | cps: [2]u21 = [_]u21{0} ** 2, | ||
| 10 | }; | ||
| 11 | |||
| 12 | const BlockMap = std.HashMap( | ||
| 13 | Block, | ||
| 14 | u16, | ||
| 15 | struct { | ||
| 16 | pub fn hash(_: @This(), k: Block) u64 { | ||
| 17 | var hasher = std.hash.Wyhash.init(0); | ||
| 18 | std.hash.autoHashStrat(&hasher, k, .DeepRecursive); | ||
| 19 | return hasher.final(); | ||
| 20 | } | ||
| 21 | |||
| 22 | pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool { | ||
| 23 | return for (aBlock, bBlock) |a, b| { | ||
| 24 | if (a.len != b.len or a.cps[0] != b.cps[0] or a.cps[1] != b.cps[1]) return false; | ||
| 25 | } else true; | ||
| 26 | } | ||
| 27 | }, | ||
| 28 | std.hash_map.default_max_load_percentage, | ||
| 29 | ); | ||
| 30 | |||
| 4 | pub fn main() anyerror!void { | 31 | pub fn main() anyerror!void { |
| 5 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | 32 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); |
| 6 | defer arena.deinit(); | 33 | defer arena.deinit(); |
| 7 | const allocator = arena.allocator(); | 34 | const allocator = arena.allocator(); |
| 8 | 35 | ||
| 9 | var write_buf: [4096]u8 = undefined; | ||
| 10 | // Process UnicodeData.txt | 36 | // Process UnicodeData.txt |
| 11 | var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); | 37 | var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); |
| 12 | var args_iter = try std.process.argsWithAllocator(allocator); | 38 | var args_iter = try std.process.argsWithAllocator(allocator); |
| @@ -14,53 +40,163 @@ pub fn main() anyerror!void { | |||
| 14 | _ = args_iter.skip(); | 40 | _ = args_iter.skip(); |
| 15 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | 41 | const output_path = args_iter.next() orelse @panic("No output file arg!"); |
| 16 | 42 | ||
| 17 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | 43 | var canon_map = std.AutoHashMap(u21, Canonicalization).init(allocator); |
| 18 | defer out_file.close(); | 44 | defer canon_map.deinit(); |
| 19 | var file_writer = out_file.writer(&write_buf); | 45 | |
| 20 | var writer = &file_writer.interface; | 46 | var composite_set = std.AutoArrayHashMap(u21, [2]u21).init(allocator); |
| 21 | const endian = builtin.cpu.arch.endian(); | ||
| 22 | 47 | ||
| 23 | lines: while (in_reader.takeDelimiterInclusive('\n')) |took| { | 48 | while (in_reader.takeDelimiterInclusive('\n')) |line| { |
| 24 | const line = std.mem.trimRight(u8, took, "\n"); | ||
| 25 | if (line.len == 0) continue; | 49 | if (line.len == 0) continue; |
| 26 | 50 | ||
| 27 | var field_iter = std.mem.splitScalar(u8, line, ';'); | 51 | var field_iter = std.mem.splitScalar(u8, line, ';'); |
| 28 | var cps: [3]u24 = undefined; | 52 | var cp: u21 = undefined; |
| 29 | var len: u8 = 2; | ||
| 30 | 53 | ||
| 31 | var i: usize = 0; | 54 | var i: usize = 0; |
| 32 | while (field_iter.next()) |field| : (i += 1) { | 55 | while (field_iter.next()) |field| : (i += 1) { |
| 56 | if (field.len == 0) continue; | ||
| 57 | |||
| 33 | switch (i) { | 58 | switch (i) { |
| 34 | 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), | 59 | 0 => cp = try std.fmt.parseInt(u21, field, 16), |
| 35 | 60 | ||
| 36 | 5 => { | 61 | 5 => { |
| 37 | // Not canonical. | 62 | // Not canonical. |
| 38 | if (field.len == 0 or field[0] == '<') continue :lines; | 63 | if (field[0] == '<') continue; |
| 64 | |||
| 39 | if (std.mem.indexOfScalar(u8, field, ' ')) |space| { | 65 | if (std.mem.indexOfScalar(u8, field, ' ')) |space| { |
| 40 | // Canonical | 66 | // Canonical |
| 41 | len = 3; | 67 | const c0, const c1 = .{ |
| 42 | cps[1] = try std.fmt.parseInt(u24, field[0..space], 16); | 68 | try std.fmt.parseInt(u21, field[0..space], 16), |
| 43 | cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16); | 69 | try std.fmt.parseInt(u21, field[space + 1 ..], 16), |
| 70 | }; | ||
| 71 | try canon_map.put(cp, Canonicalization{ | ||
| 72 | .len = 2, | ||
| 73 | .cps = [_]u21{ c0, c1 }, | ||
| 74 | }); | ||
| 75 | try composite_set.put(cp, [_]u21{ c0, c1 }); | ||
| 44 | } else { | 76 | } else { |
| 45 | // Singleton | 77 | // Singleton |
| 46 | cps[1] = try std.fmt.parseInt(u24, field, 16); | 78 | try canon_map.put(cp, Canonicalization{ |
| 79 | .len = 1, | ||
| 80 | .cps = [_]u21{ | ||
| 81 | try std.fmt.parseInt(u21, field, 16), | ||
| 82 | 0, | ||
| 83 | }, | ||
| 84 | }); | ||
| 47 | } | 85 | } |
| 48 | }, | 86 | }, |
| 49 | 87 | ||
| 50 | 2 => if (line[0] == '<') continue :lines, | ||
| 51 | |||
| 52 | else => {}, | 88 | else => {}, |
| 53 | } | 89 | } |
| 54 | } | 90 | } |
| 55 | |||
| 56 | try writer.writeInt(u8, @intCast(len), endian); | ||
| 57 | for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian); | ||
| 58 | } else |err| switch (err) { | 91 | } else |err| switch (err) { |
| 59 | error.EndOfStream => {}, | 92 | error.EndOfStream => {}, |
| 60 | else => { | 93 | else => { |
| 61 | return err; | 94 | return err; |
| 62 | }, | 95 | }, |
| 63 | } | 96 | } |
| 64 | try writer.writeInt(u16, 0, endian); | 97 | |
| 65 | try writer.flush(); | 98 | // Build multi-tiered lookup tables for decompositions |
| 99 | var blocks_map = BlockMap.init(allocator); | ||
| 100 | defer blocks_map.deinit(); | ||
| 101 | |||
| 102 | var stage1 = std.array_list.Managed(u16).init(allocator); | ||
| 103 | defer stage1.deinit(); | ||
| 104 | |||
| 105 | var stage2 = std.array_list.Managed(Canonicalization).init(allocator); | ||
| 106 | defer stage2.deinit(); | ||
| 107 | |||
| 108 | var block: Block = [_]Canonicalization{.{}} ** block_size; | ||
| 109 | var block_len: u16 = 0; | ||
| 110 | |||
| 111 | for (0..0x110000) |i| { | ||
| 112 | const cp: u21 = @intCast(i); | ||
| 113 | |||
| 114 | const canon: Canonicalization = canon_map.get(cp) orelse .{}; | ||
| 115 | |||
| 116 | block[block_len] = canon; | ||
| 117 | block_len += 1; | ||
| 118 | |||
| 119 | if (block_len < block_size and cp != 0x10ffff) continue; | ||
| 120 | |||
| 121 | const gop = try blocks_map.getOrPut(block); | ||
| 122 | if (!gop.found_existing) { | ||
| 123 | gop.value_ptr.* = @intCast(stage2.items.len); | ||
| 124 | try stage2.appendSlice(&block); | ||
| 125 | } | ||
| 126 | |||
| 127 | try stage1.append(gop.value_ptr.*); | ||
| 128 | block_len = 0; | ||
| 129 | } | ||
| 130 | |||
| 131 | var write_buf: [4096]u8 = undefined; | ||
| 132 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | ||
| 133 | defer out_file.close(); | ||
| 134 | var writer = out_file.writer(&write_buf); | ||
| 135 | |||
| 136 | try writer.interface.print( | ||
| 137 | \\//! This file is auto-generated. Do not edit. | ||
| 138 | \\ | ||
| 139 | \\pub const Canonicalization = struct {{ | ||
| 140 | \\ len: u3, | ||
| 141 | \\ cps: [2]u21, | ||
| 142 | \\}}; | ||
| 143 | \\ | ||
| 144 | \\pub const s1: [{}]u16 = .{{ | ||
| 145 | , .{stage1.items.len}); | ||
| 146 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 147 | |||
| 148 | try writer.interface.print( | ||
| 149 | \\ | ||
| 150 | \\}}; | ||
| 151 | \\ | ||
| 152 | \\pub const s2: [{}]Canonicalization = .{{ | ||
| 153 | , .{stage2.items.len}); | ||
| 154 | for (stage2.items) |entry| { | ||
| 155 | try writer.interface.print(".{{ .len = {}, .cps = .{{ {}, {} }} }}, ", .{ | ||
| 156 | entry.len, | ||
| 157 | entry.cps[0], | ||
| 158 | entry.cps[1], | ||
| 159 | }); | ||
| 160 | } | ||
| 161 | |||
| 162 | const composite = composite_set.keys(); | ||
| 163 | // TODO: cut | ||
| 164 | try writer.interface.print( | ||
| 165 | \\ | ||
| 166 | \\}}; | ||
| 167 | \\ | ||
| 168 | \\pub const composite: [{}]u21 = .{{ | ||
| 169 | , .{composite.len}); | ||
| 170 | for (composite) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 171 | |||
| 172 | try writer.interface.writeAll( | ||
| 173 | \\}; | ||
| 174 | ); | ||
| 175 | |||
| 176 | try writer.interface.print( | ||
| 177 | \\ | ||
| 178 | \\ pub const c_map: [{}]struct {{ [2]u21, u21 }} = .{{ | ||
| 179 | , .{composite.len}); | ||
| 180 | for (composite) |comp| { | ||
| 181 | const canon = canon_map.get(comp).?; | ||
| 182 | std.debug.assert(canon.len == 2); | ||
| 183 | try writer.interface.print( | ||
| 184 | \\ .{{ .{{{}, {}}}, {}}}, | ||
| 185 | , | ||
| 186 | .{ canon.cps[0], canon.cps[1], comp }, | ||
| 187 | ); | ||
| 188 | } | ||
| 189 | // var c_entries = composite_set.iterator(); | ||
| 190 | // while (c_entries.next()) |entry| { | ||
| 191 | // try writer.interface.print( | ||
| 192 | // \\ .{{ .{{{}, {}}}, {}}}, | ||
| 193 | // , | ||
| 194 | // .{ entry.value_ptr[0], entry.value_ptr[1], entry.key_ptr.* }, | ||
| 195 | // ); | ||
| 196 | // } | ||
| 197 | try writer.interface.writeAll( | ||
| 198 | \\}; | ||
| 199 | ); | ||
| 200 | |||
| 201 | try writer.interface.flush(); | ||
| 66 | } | 202 | } |
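Note that `BlockMap` supplies a hand-written hash/eql context instead of leaning on `std.AutoHashMap`. My reading (an assumption, not stated in the diff) is that `Canonicalization`'s `u3` and `u21` fields leave padding bits in the struct's bytes, so byte-wise hashing and equality can observe garbage — the "spuriously broken autohash" from the commit message. A padding-safe, field-wise alternative looks like this sketch:

```zig
const std = @import("std");

const Canonicalization = struct {
    len: u3 = 0,
    cps: [2]u21 = .{ 0, 0 },
};

// Widen each field to a whole number of bytes before hashing, so the
// struct's padding bits never reach the hasher. (Assumption: padding is
// the culprit; the diff's context hashes with .DeepRecursive instead.)
fn hashCanon(c: Canonicalization) u64 {
    var hasher = std.hash.Wyhash.init(0);
    std.hash.autoHash(&hasher, @as(u8, c.len));
    std.hash.autoHash(&hasher, @as(u32, c.cps[0]));
    std.hash.autoHash(&hasher, @as(u32, c.cps[1]));
    return hasher.final();
}
```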
diff --git a/src/CanonData.zig b/src/CanonData.zig index cf9dc8a..c972534 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig | |||
| @@ -1,78 +1,77 @@ | |||
| 1 | //! Canonicalization Data | 1 | //! Canonicalization Data |
| 2 | 2 | ||
| 3 | s1: []const u16 = undefined, | ||
| 4 | s2: []const @import("canon").Canonicalization = undefined, | ||
| 3 | nfc: std.AutoHashMapUnmanaged([2]u21, u21), | 5 | nfc: std.AutoHashMapUnmanaged([2]u21, u21), |
| 4 | nfd: [][]u21 = undefined, | ||
| 5 | cps: []u21 = undefined, | ||
| 6 | 6 | ||
| 7 | const CanonData = @This(); | 7 | const CanonData = @This(); |
| 8 | 8 | ||
| 9 | pub fn init(allocator: mem.Allocator) !CanonData { | 9 | // There's a bug here, down to how comptime u21 values are handled vs. runtime |
| 10 | const in_bytes = @embedFile("canon"); | 10 | // ones: the "unique representation" claim doesn't hold up. So we do this: |
| 11 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 12 | var reader = in_fbs.reader(); | ||
| 13 | 11 | ||
| 14 | const endian = builtin.cpu.arch.endian(); | 12 | const Context = struct { |
| 15 | var cdata = CanonData{ | 13 | pub fn hash(_: Context, cps: [2]u21) u64 { |
| 16 | .nfc = .empty, | 14 | const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1]; |
| 17 | .nfd = try allocator.alloc([]u21, 0x110000), | 15 | return std.hash.int(cp_44); |
| 18 | }; | ||
| 19 | { | ||
| 20 | errdefer allocator.free(cdata.nfd); | ||
| 21 | cdata.cps = try allocator.alloc(u21, magic.canon_size); | ||
| 22 | } | 16 | } |
| 23 | 17 | ||
| 24 | var total_cp: u24 = undefined; | 18 | pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool { |
| 25 | 19 | return cps1[0] == cps2[0] and cps1[1] == cps2[1]; | |
| 26 | errdefer { | ||
| 27 | cdata.nfc.deinit(allocator); | ||
| 28 | allocator.free(cdata.cps); | ||
| 29 | allocator.free(cdata.nfd); | ||
| 30 | } | 20 | } |
| 21 | }; | ||
| 31 | 22 | ||
| 32 | @memset(cdata.nfd, &.{}); | 23 | const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map); |
| 33 | 24 | ||
| 34 | var total_len: usize = 0; | 25 | pub fn init(allocator: mem.Allocator) !CanonData { |
| 26 | var cdata = CanonData{ | ||
| 27 | .nfc = .empty, | ||
| 28 | }; | ||
| 29 | errdefer cdata.deinit(allocator); | ||
| 35 | 30 | ||
| 36 | while (true) { | 31 | const data = @import("canon"); |
| 37 | const len: u8 = try reader.readInt(u8, endian); | 32 | cdata.s1 = &data.s1; |
| 38 | if (len == 0) break; | 33 | cdata.s2 = &data.s2; |
| 39 | const cp = try reader.readInt(u24, endian); | 34 | var count: usize = 0; |
| 40 | total_cp = cp; | 35 | for (data.composite) |cp| { |
| 41 | const nfd_cp = cdata.cps[total_len..][0 .. len - 1]; | 36 | count += 1; |
| 42 | for (0..len - 1) |i| { | 37 | const cps = cdata.toNfd(cp); |
| 43 | nfd_cp[i] = @intCast(try reader.readInt(u24, endian)); | 38 | std.debug.assert(cps.len == 2); |
| 44 | } | 39 | try cdata.nfc.put(allocator, cps[0..2].*, cp); |
| 45 | if (len == 3) { | ||
| 46 | try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp)); | ||
| 47 | } | ||
| 48 | cdata.nfd[cp] = nfd_cp; | ||
| 49 | total_len += len - 1; | ||
| 50 | } | 40 | } |
| 51 | 41 | ||
| 52 | if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len}); | 42 | // var keys = cdata.nfc.keyIterator(); |
| 43 | // while (keys.next()) |key| { | ||
| 44 | // const c32: [2]u32 = .{ key[0], key[1] }; | ||
| 45 | // if (c_map.get(c32)) |_| { | ||
| 46 | // std.debug.print("got", .{}); | ||
| 47 | // } | ||
| 48 | // } | ||
| 53 | 49 | ||
| 54 | return cdata; | 50 | return cdata; |
| 55 | } | 51 | } |
| 56 | 52 | ||
| 57 | pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { | 53 | pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { |
| 58 | cdata.nfc.deinit(allocator); | 54 | cdata.nfc.deinit(allocator); |
| 59 | allocator.free(cdata.cps); | ||
| 60 | allocator.free(cdata.nfd); | ||
| 61 | } | 55 | } |
| 62 | 56 | ||
| 63 | /// Returns canonical decomposition for `cp`. | 57 | /// Returns canonical decomposition for `cp`. |
| 64 | pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { | 58 | pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { |
| 65 | return cdata.nfd[cp]; | 59 | const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)]; |
| 60 | return canon.cps[0..canon.len]; | ||
| 66 | } | 61 | } |
| 67 | 62 | ||
| 68 | // Returns the primary composite for the codepoints in `cp`. | 63 | // Returns the primary composite for the codepoints in `cp`. |
| 69 | pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { | 64 | pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { |
| 70 | return cdata.nfc.get(cps); | 65 | _ = cdata; |
| 66 | if (c_map.get(cps)) |cpp| { | ||
| 67 | return cpp.*; | ||
| 68 | } else { | ||
| 69 | return null; | ||
| 70 | } | ||
| 71 | unreachable; | ||
| 71 | } | 72 | } |
| 72 | 73 | ||
| 73 | const std = @import("std"); | 74 | const std = @import("std"); |
| 74 | const builtin = @import("builtin"); | 75 | const builtin = @import("builtin"); |
| 75 | const compress = std.compress; | ||
| 76 | const mem = std.mem; | 76 | const mem = std.mem; |
| 77 | const magic = @import("magic"); | 77 | const comptime_map = @import("comptime_map.zig"); |
| 78 | const options = @import("options"); | ||
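With both halves in place, decomposition walks the two-stage tables and composition hits the comptime map. A usage sketch against a standard Unicode pair (U+00C5 Å canonically decomposes to U+0041 U+030A; assumes the generated "canon" module is wired up as in build.zig):

```zig
const std = @import("std");
const CanonData = @import("CanonData.zig");

test "Å round-trips through NFD and NFC" {
    var cdata = try CanonData.init(std.testing.allocator);
    defer cdata.deinit(std.testing.allocator);

    // U+00C5 → U+0041 U+030A, per UnicodeData.txt.
    try std.testing.expectEqualSlices(
        u21,
        &[_]u21{ 0x0041, 0x030A },
        cdata.toNfd(0x00C5),
    );

    // And the pair composes back to its primary composite.
    try std.testing.expectEqual(@as(?u21, 0x00C5), cdata.toNfc(.{ 0x0041, 0x030A }));
}
```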
diff --git a/src/comptime_map.zig b/src/comptime_map.zig index b2c4d11..d41f6f8 100644 --- a/src/comptime_map.zig +++ b/src/comptime_map.zig | |||
| @@ -28,7 +28,7 @@ pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, c | |||
| 28 | }; | 28 | }; |
| 29 | 29 | ||
| 30 | // ensure that the hash map will be at most 60% full | 30 | // ensure that the hash map will be at most 60% full |
| 31 | const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable; | 31 | const size = math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable; |
| 32 | comptime var slots = [1]Entry{.{}} ** size; | 32 | comptime var slots = [1]Entry{.{}} ** size; |
| 33 | comptime var distance: [size]usize = .{0} ** size; | 33 | comptime var distance: [size]usize = .{0} ** size; |
| 34 | 34 | ||
| @@ -148,6 +148,15 @@ test "auto comptime hash map" { | |||
| 148 | try testing.expect(map.get(4_000_000) == null); | 148 | try testing.expect(map.get(4_000_000) == null); |
| 149 | } | 149 | } |
| 150 | 150 | ||
| 151 | test "array pair comptime hash map" { | ||
| 152 | const map = AutoComptimeHashMap([2]u32, u21, .{ | ||
| 153 | .{ .{ 2, 3 }, 5 }, | ||
| 154 | .{ .{ 42, 56 }, 12 }, | ||
| 155 | .{ .{ 2, 4 }, 6 }, | ||
| 156 | }); | ||
| 157 | try testing.expect(map.has(.{ 2, 4 })); | ||
| 158 | } | ||
| 159 | |||
| 151 | const std = @import("std"); | 160 | const std = @import("std"); |
| 152 | const hash_map = std.hash_map; | 161 | const hash_map = std.hash_map; |
| 153 | const testing = std.testing; | 162 | const testing = std.testing; |
| @@ -174,3 +183,4 @@ const math = std.math; | |||
| 174 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 183 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 175 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | 184 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 176 | // SOFTWARE. | 185 | // SOFTWARE. |
| 186 | |||
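Dropping the `is_test` special case means tests now exercise the same power-of-two sizing as production. A worked check of the at-most-60%-full rule (the numbers are mine, mirroring the `ceilPowerOfTwo(usize, values.len * 5 / 3)` expression above):

```zig
const std = @import("std");

test "comptime map sizing keeps load under 60%" {
    // Three entries: 3 * 5 / 3 = 5, rounded up to the next power of two.
    const size = try std.math.ceilPowerOfTwo(usize, 3 * 5 / 3);
    try std.testing.expectEqual(@as(usize, 8), size);
    // 3 of 8 slots occupied → 37.5% load, comfortably under 60%.
}
```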