summary | refs | log | tree | commit | diff
path: root/codegen
diff options
context:
space:
mode:
author: Sam Atman, 2026-02-04 18:36:18 -0500
committer: Sam Atman, 2026-02-04 18:36:18 -0500
commit: e476250ea9326b2550847b301c265115ff375a31 (patch)
tree: cf627ced47cecce80020b7a1f30aa51852c0c59b /codegen
parent: Normalization and case folding (diff)
download: zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz
zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz
zg-e476250ea9326b2550847b301c265115ff375a31.zip
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it.
Diffstat (limited to 'codegen')
-rw-r--r-- codegen/case.zig 145
-rw-r--r-- codegen/core_props.zig 30
-rw-r--r-- codegen/gencat.zig 37
-rw-r--r-- codegen/numeric.zig 24
-rw-r--r-- codegen/props.zig 23
-rw-r--r-- codegen/scripts.zig 31
6 files changed, 253 insertions, 37 deletions
diff --git a/codegen/case.zig b/codegen/case.zig
new file mode 100644
index 0000000..9dffc7c
--- /dev/null
+++ b/codegen/case.zig
@@ -0,0 +1,145 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
/// Number of code points covered by one block of the generated tables.
const block_size = 256;

/// One block of packed case entries, `block_size` code points wide.
const Block = [block_size]u44;

// A packed entry needs room for two `u21` code points plus two flag bits;
// refuse to build if `u44` is not exactly that wide.
comptime {
    const required_bits = 2 * @bitSizeOf(u21) + 2;
    if (@bitSizeOf(u44) != required_bits) @compileError("u44 doesn't have expected bit size.");
}
12
/// Hash-map context that hashes and compares whole `Block` values, so that
/// identical blocks map to the same entry.
const BlockContext = struct {
    /// Hashes every element of the block with Wyhash (seed 0).
    pub fn hash(_: BlockContext, key: Block) u64 {
        var state = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&state, key, .DeepRecursive);
        return state.final();
    }

    /// Two blocks are equal when all of their elements are equal.
    pub fn eql(_: BlockContext, lhs: Block, rhs: Block) bool {
        return std.mem.eql(u44, lhs[0..], rhs[0..]);
    }
};

/// Maps a block's contents to a `u16` value associated with that block.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
29
/// Generates the case-mapping lookup tables as Zig source.
///
/// Reads the embedded `UnicodeData.txt`, collects the uppercase (field 12)
/// and lowercase (field 13) mappings, packs them into one `u44` per code
/// point, and deduplicates the result in `block_size`-entry blocks: `s1`
/// holds each block's starting offset into `s2`, and `s2` holds the unique
/// blocks back to back. The output path is the first command-line argument;
/// the program panics if it is missing.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var to_lower = std.AutoHashMap(u21, u21).init(allocator);
    defer to_lower.deinit();

    var to_upper = std.AutoHashMap(u21, u21).init(allocator);
    defer to_upper.deinit();

    // Process UnicodeData.txt: one record per line, fields separated by ';'.
    var unicode_data = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
    while (unicode_data.takeDelimiterInclusive('\n')) |record| {
        if (record.len == 0) continue;

        var code_point: u21 = undefined;
        var fields = std.mem.splitScalar(u8, record, ';');
        var column: usize = 0;
        while (fields.next()) |text| : (column += 1) {
            // Empty fields carry no data; the column counter still advances.
            if (text.len == 0) continue;

            if (column == 0) {
                code_point = try std.fmt.parseInt(u21, text, 16);
            } else if (column == 12) {
                // Uppercase mapping
                const upper = try std.fmt.parseInt(u21, text, 16);
                try to_upper.put(code_point, upper);
            } else if (column == 13) {
                // Lowercase mapping
                const lower = try std.fmt.parseInt(u21, text, 16);
                try to_lower.put(code_point, lower);
            }
        }
    } else |read_err| switch (read_err) {
        // Running out of input is the normal way for the loop to finish.
        error.EndOfStream => {},
        else => |other| return other,
    }

    var seen_blocks = BlockMap.init(allocator);
    defer seen_blocks.deinit();

    var stage1 = std.array_list.Managed(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.array_list.Managed(u44).init(allocator);
    defer stage2.deinit();

    var block: Block = [_]u44{0} ** block_size;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        const slot = i % block_size;

        // Entry layout: bit 0 = has lowercase, bit 1 = has uppercase,
        // bits 2..22 = lowercase code point, bits 23..43 = uppercase code point.
        const lower_bits: u44 = if (to_lower.get(cp)) |lower| ((@as(u44, lower) << 2) | 1) else 0;
        const upper_bits: u44 = if (to_upper.get(cp)) |upper| ((@as(u44, upper) << (2 + 21)) | 2) else 0;
        block[slot] = lower_bits | upper_bits;

        // Keep filling until the block is complete or the last code point is reached.
        const block_done = slot + 1 == block_size or cp == 0x10ffff;
        if (!block_done) continue;

        // Only blocks not seen before are appended to stage 2; stage 1 always
        // records the offset at which this block's contents live.
        const found = try seen_blocks.getOrPut(block);
        if (!found.found_existing) {
            found.value_ptr.* = @intCast(stage2.items.len);
            try stage2.appendSlice(block[0..]);
        }

        try stage1.append(found.value_ptr.*);
    }

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    var write_buf: [4096]u8 = undefined;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var writer = out_file.writer(&write_buf);
    const out = &writer.interface;

    try out.print(
        \\//! This file is auto-generated. Do not edit.
        \\
        \\pub const s1: [{}]u16 = .{{
    , .{stage1.items.len});
    for (stage1.items) |offset| {
        try out.print("{}, ", .{offset});
    }

    try out.print(
        \\
        \\}};
        \\
        \\pub const s2: [{}]u44 = .{{
    , .{stage2.items.len});
    for (stage2.items) |value| {
        try out.print("{}, ", .{value});
    }

    try out.writeAll(
        \\};
    );
    try out.flush();
}
diff --git a/codegen/core_props.zig b/codegen/core_props.zig
index 6ffdf91..99a55e2 100644
--- a/codegen/core_props.zig
+++ b/codegen/core_props.zig
@@ -120,17 +120,29 @@ pub fn main() anyerror!void {
120 _ = args_iter.skip(); 120 _ = args_iter.skip();
121 const output_path = args_iter.next() orelse @panic("No output file arg!"); 121 const output_path = args_iter.next() orelse @panic("No output file arg!");
122 122
123 var out_buf: [4096]u8 = undefined; 123 var write_buf: [4096]u8 = undefined;
124 var out_file = try std.fs.cwd().createFile(output_path, .{}); 124 var out_file = try std.fs.cwd().createFile(output_path, .{});
125 defer out_file.close(); 125 defer out_file.close();
126 var writer = out_file.writer(&out_buf); 126 var writer = out_file.writer(&write_buf);
127 127
128 const endian = builtin.cpu.arch.endian(); 128 try writer.interface.print(
129 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 129 \\//! This file is auto-generated. Do not edit.
130 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 130 \\
131 131 \\pub const s1: [{}]u16 = .{{
132 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); 132 , .{stage1.items.len});
133 try writer.interface.writeAll(stage2.items); 133 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
134
135 try writer.interface.print(
136 \\
137 \\}};
138 \\
139 \\pub const s2: [{}]u8 = .{{
140 , .{stage2.items.len});
141 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
142
143 try writer.interface.writeAll(
144 \\};
145 );
134 146
135 try writer.interface.flush(); 147 try writer.interface.flush();
136} 148}
diff --git a/codegen/gencat.zig b/codegen/gencat.zig
index 9800f1d..12c8373 100644
--- a/codegen/gencat.zig
+++ b/codegen/gencat.zig
@@ -150,21 +150,38 @@ pub fn main() !void {
150 defer args_iter.deinit(); 150 defer args_iter.deinit();
151 _ = args_iter.skip(); 151 _ = args_iter.skip();
152 const output_path = args_iter.next() orelse @panic("No output file arg!"); 152 const output_path = args_iter.next() orelse @panic("No output file arg!");
153
154 var write_buf: [4096]u8 = undefined; 153 var write_buf: [4096]u8 = undefined;
155 var out_file = try std.fs.cwd().createFile(output_path, .{}); 154 var out_file = try std.fs.cwd().createFile(output_path, .{});
156 defer out_file.close(); 155 defer out_file.close();
157 var writer = out_file.writer(&write_buf); 156 var writer = out_file.writer(&write_buf);
158 157
159 const endian = builtin.cpu.arch.endian(); 158 try writer.interface.print(
160 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 159 \\//! This file is auto-generated. Do not edit.
161 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 160 \\
162 161 \\pub const s1: [{}]u16 = .{{
163 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); 162 , .{stage1.items.len});
164 for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); 163 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
165 164
166 try writer.interface.writeInt(u8, @intCast(stage3.items.len), endian); 165 try writer.interface.print(
167 for (stage3.items) |i| try writer.interface.writeInt(u8, i, endian); 166 \\
167 \\}};
168 \\
169 \\pub const stage2: [{}]u5 = .{{
170 , .{stage2.items.len});
171 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
172
173 try writer.interface.print(
174 \\
175 \\}};
176 \\
177 \\pub const stage3: [{}]5 = .{{
178 , .{stage3.items.len});
179 for (stage3.items) |entry| try writer.interface.print("{}, ", .{entry});
180
181 try writer.interface.writeAll(
182 \\};
183 );
168 184
169 try writer.interface.flush(); 185 try writer.interface.flush();
186 try writer.interface.flush();
170} 187}
diff --git a/codegen/numeric.zig b/codegen/numeric.zig
index b304349..e7b4861 100644
--- a/codegen/numeric.zig
+++ b/codegen/numeric.zig
@@ -123,12 +123,24 @@ pub fn main() anyerror!void {
123 defer out_file.close(); 123 defer out_file.close();
124 var writer = out_file.writer(&write_buf); 124 var writer = out_file.writer(&write_buf);
125 125
126 const endian = builtin.cpu.arch.endian(); 126 try writer.interface.print(
127 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 127 \\//! This file is auto-generated. Do not edit.
128 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 128 \\
129 129 \\pub const s1: [{}]u16 = .{{
130 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); 130 , .{stage1.items.len});
131 try writer.interface.writeAll(stage2.items); 131 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
132
133 try writer.interface.print(
134 \\
135 \\}};
136 \\
137 \\pub const s2: [{}]u8 = .{{
138 , .{stage2.items.len});
139 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
140
141 try writer.interface.writeAll(
142 \\};
143 );
132 144
133 try writer.interface.flush(); 145 try writer.interface.flush();
134} 146}
diff --git a/codegen/props.zig b/codegen/props.zig
index 35c7dfb..ebd5116 100644
--- a/codegen/props.zig
+++ b/codegen/props.zig
@@ -123,11 +123,24 @@ pub fn main() anyerror!void {
123 defer out_file.close(); 123 defer out_file.close();
124 var writer = out_file.writer(&write_buf); 124 var writer = out_file.writer(&write_buf);
125 125
126 const endian = builtin.cpu.arch.endian(); 126 try writer.interface.print(
127 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 127 \\//! This file is auto-generated. Do not edit.
128 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 128 \\
129 \\pub const s1: [{}]u16 = .{{
130 , .{stage1.items.len});
131 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
132
133 try writer.interface.print(
134 \\
135 \\}};
136 \\
137 \\pub const s2: [{}]u8 = .{{
138 , .{stage2.items.len});
139 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
140
141 try writer.interface.writeAll(
142 \\};
143 );
129 144
130 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian);
131 try writer.interface.writeAll(stage2.items);
132 try writer.interface.flush(); 145 try writer.interface.flush();
133} 146}
diff --git a/codegen/scripts.zig b/codegen/scripts.zig
index 0f0194c..6bd5866 100644
--- a/codegen/scripts.zig
+++ b/codegen/scripts.zig
@@ -299,15 +299,32 @@ pub fn main() anyerror!void {
299 defer out_file.close(); 299 defer out_file.close();
300 var writer = out_file.writer(&write_buf); 300 var writer = out_file.writer(&write_buf);
301 301
302 const endian = builtin.cpu.arch.endian(); 302 try writer.interface.print(
303 try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); 303 \\//! This file is auto-generated. Do not edit.
304 for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); 304 \\
305 \\pub const s1: [{}]u16 = .{{
306 , .{stage1.items.len});
307 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
305 308
306 try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); 309 try writer.interface.print(
307 for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); 310 \\
311 \\}};
312 \\
313 \\pub const s2: [{}]u8 = .{{
314 , .{stage2.items.len});
315 for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
308 316
309 try writer.interface.writeInt(u8, @intCast(stage3.items.len), endian); 317 try writer.interface.print(
310 for (stage3.items) |i| try writer.interface.writeInt(u8, i, endian); 318 \\
319 \\}};
320 \\
321 \\pub const s3: [{}]u8 = .{{
322 , .{stage3.items.len});
323 for (stage3.items) |entry| try writer.interface.print("{}, ", .{entry});
324
325 try writer.interface.writeAll(
326 \\};
327 );
311 328
312 try writer.interface.flush(); 329 try writer.interface.flush();
313} 330}