author     Sam Atman   2026-02-04 21:21:14 -0500
committer  Sam Atman   2026-02-04 21:21:14 -0500
commit     904fa4d94f30825bec490133ff402c6350f45e26 (patch)
tree       cbfc44b59ca1e539d6d2a4ff8f083fb4e25d41ef
parent     Rest of the 'easy' stuff (diff)
download   zg-904fa4d94f30825bec490133ff402c6350f45e26.tar.gz
           zg-904fa4d94f30825bec490133ff402c6350f45e26.tar.xz
           zg-904fa4d94f30825bec490133ff402c6350f45e26.zip
Teasing out canonicalization
After coping with a spuriously broken autohash for a while, I got the one remaining hash table moved into static memory, so there's no further reason to put up with allocating these basic structures at runtime. So that's nice.
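
For context on that fix: the new BlockMap in codegen/canon.zig supplies its own hash/eql context instead of trusting the default one over an array of structs. A minimal standalone sketch of the same approach (Canonicalization as defined in the diff; the helper name is illustrative, not from the commit):

const std = @import("std");

const Canonicalization = struct {
    len: u3 = 0,
    cps: [2]u21 = [_]u21{0} ** 2,
};

// Hash recursively over the fields rather than over raw bytes, so two
// logically equal values always hash alike even when padding differs.
fn blockHash(k: [256]Canonicalization) u64 {
    var hasher = std.hash.Wyhash.init(0);
    std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
    return hasher.final();
}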
-rw-r--r--  build.zig            |   2
-rw-r--r--  codegen/canon.zig    | 182
-rw-r--r--  src/CanonData.zig    |  87
-rw-r--r--  src/comptime_map.zig |  12
4 files changed, 214 insertions(+), 69 deletions(-)
diff --git a/build.zig b/build.zig
index be91f50..3cf69aa 100644
--- a/build.zig
+++ b/build.zig
@@ -91,7 +91,7 @@ pub fn build(b: *std.Build) void {
     });
     canon_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") });
    const run_canon_gen_exe = b.addRunArtifact(canon_gen_exe);
-    const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.bin.z");
+    const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.zig");
 
     const compat_gen_exe = b.addExecutable(.{
         .name = "compat",
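
The generator now emits Zig source (canon.zig) instead of the old canon.bin.z binary blob. Presumably the output gets wired back in elsewhere in build.zig (outside this hunk) so that @import("canon") in src/CanonData.zig resolves to the generated file; a hypothetical sketch of that wiring, with `exe` standing in for the consuming module:

// Hypothetical wiring, not shown in this hunk: expose the generated
// file as the "canon" module.
exe.root_module.addAnonymousImport("canon", .{ .root_source_file = canon_gen_out });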
diff --git a/codegen/canon.zig b/codegen/canon.zig
index d95a905..e92be5d 100644
--- a/codegen/canon.zig
+++ b/codegen/canon.zig
@@ -1,12 +1,38 @@
 const std = @import("std");
 const builtin = @import("builtin");
 
+const block_size = 256;
+const Block = [block_size]Canonicalization;
+
+const Canonicalization = struct {
+    len: u3 = 0,
+    cps: [2]u21 = [_]u21{0} ** 2,
+};
+
+const BlockMap = std.HashMap(
+    Block,
+    u16,
+    struct {
+        pub fn hash(_: @This(), k: Block) u64 {
+            var hasher = std.hash.Wyhash.init(0);
+            std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
+            return hasher.final();
+        }
+
+        pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool {
+            return for (aBlock, bBlock) |a, b| {
+                if (a.len != b.len or a.cps[0] != b.cps[0] or a.cps[1] != b.cps[1]) return false;
+            } else true;
+        }
+    },
+    std.hash_map.default_max_load_percentage,
+);
+
 pub fn main() anyerror!void {
     var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
     defer arena.deinit();
     const allocator = arena.allocator();
 
-    var write_buf: [4096]u8 = undefined;
     // Process UnicodeData.txt
     var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
     var args_iter = try std.process.argsWithAllocator(allocator);
@@ -14,53 +40,163 @@ pub fn main() anyerror!void {
     _ = args_iter.skip();
     const output_path = args_iter.next() orelse @panic("No output file arg!");
 
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    var file_writer = out_file.writer(&write_buf);
-    var writer = &file_writer.interface;
-    const endian = builtin.cpu.arch.endian();
+    var canon_map = std.AutoHashMap(u21, Canonicalization).init(allocator);
+    defer canon_map.deinit();
+
+    var composite_set = std.AutoArrayHashMap(u21, [2]u21).init(allocator);
 
-    lines: while (in_reader.takeDelimiterInclusive('\n')) |took| {
-        const line = std.mem.trimRight(u8, took, "\n");
+    while (in_reader.takeDelimiterInclusive('\n')) |line| {
         if (line.len == 0) continue;
 
         var field_iter = std.mem.splitScalar(u8, line, ';');
-        var cps: [3]u24 = undefined;
-        var len: u8 = 2;
+        var cp: u21 = undefined;
 
         var i: usize = 0;
         while (field_iter.next()) |field| : (i += 1) {
+            if (field.len == 0) continue;
+
             switch (i) {
-                0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
+                0 => cp = try std.fmt.parseInt(u21, field, 16),
 
                 5 => {
                     // Not canonical.
-                    if (field.len == 0 or field[0] == '<') continue :lines;
+                    if (field[0] == '<') continue;
+
                     if (std.mem.indexOfScalar(u8, field, ' ')) |space| {
                         // Canonical
-                        len = 3;
-                        cps[1] = try std.fmt.parseInt(u24, field[0..space], 16);
-                        cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16);
+                        const c0, const c1 = .{
+                            try std.fmt.parseInt(u21, field[0..space], 16),
+                            try std.fmt.parseInt(u21, field[space + 1 ..], 16),
+                        };
+                        try canon_map.put(cp, Canonicalization{
+                            .len = 2,
+                            .cps = [_]u21{ c0, c1 },
+                        });
+                        try composite_set.put(cp, [_]u21{ c0, c1 });
                     } else {
                         // Singleton
-                        cps[1] = try std.fmt.parseInt(u24, field, 16);
+                        try canon_map.put(cp, Canonicalization{
+                            .len = 1,
+                            .cps = [_]u21{
+                                try std.fmt.parseInt(u21, field, 16),
+                                0,
+                            },
+                        });
                     }
                 },
 
-                2 => if (line[0] == '<') continue :lines,
-
                 else => {},
             }
         }
-
-        try writer.writeInt(u8, @intCast(len), endian);
-        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
     } else |err| switch (err) {
         error.EndOfStream => {},
         else => {
             return err;
         },
     }
-    try writer.writeInt(u16, 0, endian);
-    try writer.flush();
+
+    // Build multi-tiered lookup tables for decompositions
+    var blocks_map = BlockMap.init(allocator);
+    defer blocks_map.deinit();
+
+    var stage1 = std.array_list.Managed(u16).init(allocator);
+    defer stage1.deinit();
+
+    var stage2 = std.array_list.Managed(Canonicalization).init(allocator);
+    defer stage2.deinit();
+
+    var block: Block = [_]Canonicalization{.{}} ** block_size;
+    var block_len: u16 = 0;
+
+    for (0..0x110000) |i| {
+        const cp: u21 = @intCast(i);
+
+        const canon: Canonicalization = canon_map.get(cp) orelse .{};
+
+        block[block_len] = canon;
+        block_len += 1;
+
+        if (block_len < block_size and cp != 0x10ffff) continue;
+
+        const gop = try blocks_map.getOrPut(block);
+        if (!gop.found_existing) {
+            gop.value_ptr.* = @intCast(stage2.items.len);
+            try stage2.appendSlice(&block);
+        }
+
+        try stage1.append(gop.value_ptr.*);
+        block_len = 0;
+    }
+
+    var write_buf: [4096]u8 = undefined;
+    var out_file = try std.fs.cwd().createFile(output_path, .{});
+    defer out_file.close();
+    var writer = out_file.writer(&write_buf);
+
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+        \\
+        \\pub const Canonicalization = struct {{
+        \\    len: u3,
+        \\    cps: [2]u21,
+        \\}};
+        \\
+        \\pub const s1: [{}]u16 = .{{
+    , .{stage1.items.len});
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]Canonicalization = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| {
+        try writer.interface.print(".{{ .len = {}, .cps = .{{ {}, {} }} }}, ", .{
+            entry.len,
+            entry.cps[0],
+            entry.cps[1],
+        });
+    }
+
+    const composite = composite_set.keys();
+    // TODO: cut
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const composite: [{}]u21 = .{{
+    , .{composite.len});
+    for (composite) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.writeAll(
+        \\};
+    );
+
+    try writer.interface.print(
+        \\
+        \\ pub const c_map: [{}]struct {{ [2]u21, u21 }} = .{{
+    , .{composite.len});
+    for (composite) |comp| {
+        const canon = canon_map.get(comp).?;
+        std.debug.assert(canon.len == 2);
+        try writer.interface.print(
+            \\ .{{ .{{{}, {}}}, {}}},
+        ,
+            .{ canon.cps[0], canon.cps[1], comp },
+        );
+    }
+    // var c_entries = composite_set.iterator();
+    // while (c_entries.next()) |entry| {
+    //     try writer.interface.print(
+    //         \\ .{{ .{{{}, {}}}, {}}},
+    //     ,
+    //         .{ entry.value_ptr[0], entry.value_ptr[1], entry.key_ptr.* },
+    //     );
+    // }
+    try writer.interface.writeAll(
+        \\};
+    );
+
+    try writer.interface.flush();
 }
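
The s1/s2 pair written above is a two-stage table over the full 0x110000-codepoint space: s1 holds one u16 offset per 256-codepoint block, and identical blocks are deduplicated through blocks_map before landing in s2, which is what keeps the output small (most blocks are entirely default entries). A sketch of the read side, mirroring toNfd in src/CanonData.zig below:

// The high bits of the codepoint select a block via s1; the low
// eight bits index into that block within s2.
fn decompose(s1: []const u16, s2: []const Canonicalization, cp: u21) []const u21 {
    const canon = &s2[s1[cp >> 8] + (cp & 0xff)];
    return canon.cps[0..canon.len];
}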
diff --git a/src/CanonData.zig b/src/CanonData.zig
index cf9dc8a..c972534 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,78 +1,77 @@
 //! Canonicalization Data
 
+s1: []const u16 = undefined,
+s2: []const @import("canon").Canonicalization = undefined,
 nfc: std.AutoHashMapUnmanaged([2]u21, u21),
-nfd: [][]u21 = undefined,
-cps: []u21 = undefined,
 
 const CanonData = @This();
 
-pub fn init(allocator: mem.Allocator) !CanonData {
-    const in_bytes = @embedFile("canon");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
+// There's a bug here, which comes down to how comptime u21 values are hashed
+// vs. runtime ones: the "unique representation" claim is not working out. So we do this:
 
-    const endian = builtin.cpu.arch.endian();
-    var cdata = CanonData{
-        .nfc = .empty,
-        .nfd = try allocator.alloc([]u21, 0x110000),
-    };
-    {
-        errdefer allocator.free(cdata.nfd);
-        cdata.cps = try allocator.alloc(u21, magic.canon_size);
-    }
+const Context = struct {
+    pub fn hash(_: Context, cps: [2]u21) u64 {
+        const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1];
+        return std.hash.int(cp_44);
+    }
 
-    var total_cp: u24 = undefined;
-
-    errdefer {
-        cdata.nfc.deinit(allocator);
-        allocator.free(cdata.cps);
-        allocator.free(cdata.nfd);
-    }
+    pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool {
+        return cps1[0] == cps2[0] and cps1[1] == cps2[1];
+    }
+};
 
-    @memset(cdata.nfd, &.{});
+const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);
 
-    var total_len: usize = 0;
+pub fn init(allocator: mem.Allocator) !CanonData {
+    var cdata = CanonData{
+        .nfc = .empty,
+    };
+    errdefer cdata.deinit(allocator);
 
-    while (true) {
-        const len: u8 = try reader.readInt(u8, endian);
-        if (len == 0) break;
-        const cp = try reader.readInt(u24, endian);
-        total_cp = cp;
-        const nfd_cp = cdata.cps[total_len..][0 .. len - 1];
-        for (0..len - 1) |i| {
-            nfd_cp[i] = @intCast(try reader.readInt(u24, endian));
-        }
-        if (len == 3) {
-            try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp));
-        }
-        cdata.nfd[cp] = nfd_cp;
-        total_len += len - 1;
-    }
+    const data = @import("canon");
+    cdata.s1 = &data.s1;
+    cdata.s2 = &data.s2;
+    var count: usize = 0;
+    for (data.composite) |cp| {
+        count += 1;
+        const cps = cdata.toNfd(cp);
+        std.debug.assert(cps.len == 2);
+        try cdata.nfc.put(allocator, cps[0..2].*, cp);
+    }
 
-    if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len});
+    // var keys = cdata.nfc.keyIterator();
+    // while (keys.next()) |key| {
+    //     const c32: [2]u32 = .{ key[0], key[1] };
+    //     if (c_map.get(c32)) |_| {
+    //         std.debug.print("got", .{});
+    //     }
+    // }
 
     return cdata;
 }
 
 pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
     cdata.nfc.deinit(allocator);
-    allocator.free(cdata.cps);
-    allocator.free(cdata.nfd);
 }
 
 /// Returns canonical decomposition for `cp`.
 pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
-    return cdata.nfd[cp];
+    const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
+    return canon.cps[0..canon.len];
 }
 
 // Returns the primary composite for the codepoints in `cps`.
 pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
-    return cdata.nfc.get(cps);
+    _ = cdata;
+    if (c_map.get(cps)) |cpp| {
+        return cpp.*;
+    } else {
+        return null;
+    }
+    unreachable;
 }
 
 const std = @import("std");
 const builtin = @import("builtin");
-const compress = std.compress;
 const mem = std.mem;
-const magic = @import("magic");
-const options = @import("options");
+const comptime_map = @import("comptime_map.zig");
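
A usage sketch against this API (illustrative, not part of the commit; the codepoints come from UnicodeData.txt):

test "canonical round trip" {
    var cdata = try CanonData.init(std.testing.allocator);
    defer cdata.deinit(std.testing.allocator);
    // U+00C5 decomposes canonically to U+0041 plus U+030A (combining ring above).
    try std.testing.expectEqualSlices(u21, &.{ 0x0041, 0x030A }, cdata.toNfd(0x00C5));
    try std.testing.expectEqual(@as(?u21, 0x00C5), cdata.toNfc(.{ 0x0041, 0x030A }));
}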
diff --git a/src/comptime_map.zig b/src/comptime_map.zig
index b2c4d11..d41f6f8 100644
--- a/src/comptime_map.zig
+++ b/src/comptime_map.zig
@@ -28,7 +28,7 @@ pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, c
     };
 
     // ensure that the hash map will be at most 60% full
-    const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
+    const size = math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
     comptime var slots = [1]Entry{.{}} ** size;
     comptime var distance: [size]usize = .{0} ** size;
 
@@ -148,6 +148,15 @@ test "auto comptime hash map" {
     try testing.expect(map.get(4_000_000) == null);
 }
 
+test "array pair comptime hash map" {
+    const map = AutoComptimeHashMap([2]u32, u21, .{
+        .{ .{ 2, 3 }, 5 },
+        .{ .{ 42, 56 }, 12 },
+        .{ .{ 2, 4 }, 6 },
+    });
+    try testing.expect(map.has(.{ 2, 4 }));
+}
+
 const std = @import("std");
 const hash_map = std.hash_map;
 const testing = std.testing;
@@ -174,3 +183,4 @@ const math = std.math;
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
+
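
On the sizing line that lost its is_test special case: ceilPowerOfTwo(values.len * 5 / 3) guarantees size >= 5n/3, hence a load factor n/size <= 3/5 = 60% (modulo integer division on very small n). A standalone check of the arithmetic, not from the repo:

const std = @import("std");

test "comptime map sizing stays at or under 60% load" {
    const n: usize = 100;
    // 100 * 5 / 3 == 166; the next power of two is 256, so load is ~39%.
    const size = std.math.ceilPowerOfTwo(usize, n * 5 / 3) catch unreachable;
    try std.testing.expectEqual(@as(usize, 256), size);
    try std.testing.expect(n * 100 / size <= 60);
}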