summary refs log tree commit diff
path: root/codegen/case.zig
diff options
context:
space:
mode:
author: Sam Atman, 2026-02-04 18:36:18 -0500
committer: Sam Atman, 2026-02-04 18:36:18 -0500
commit: e476250ea9326b2550847b301c265115ff375a31 (patch)
tree: cf627ced47cecce80020b7a1f30aa51852c0c59b /codegen/case.zig
parent: Normalization and case folding (diff)
download: zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz
zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz
zg-e476250ea9326b2550847b301c265115ff375a31.zip
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it.
Diffstat (limited to 'codegen/case.zig')
-rw-r--r-- codegen/case.zig 145
1 files changed, 145 insertions, 0 deletions
diff --git a/codegen/case.zig b/codegen/case.zig
new file mode 100644
index 0000000..9dffc7c
--- /dev/null
+++ b/codegen/case.zig
@@ -0,0 +1,145 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
/// Number of consecutive code points grouped into a single block.
const block_size = 256;

/// One block of packed case entries, one `u44` per code point.
const Block = [block_size]u44;

// Compile-time guard on the entry layout: an entry has to hold exactly two
// `u21` code points plus two flag bits.
comptime {
    const flag_bits = 2;
    const needed_bits = flag_bits + 2 * @bitSizeOf(u21);
    if (needed_bits != @bitSizeOf(u44)) {
        @compileError("u44 doesn't have expected bit size.");
    }
}
12
/// Hash-map context keyed on the full contents of a `Block`.
const BlockContext = struct {
    /// Hashes every element of the block with a zero-seeded Wyhash.
    pub fn hash(_: @This(), key: Block) u64 {
        var wyhash = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&wyhash, key, .DeepRecursive);
        return wyhash.final();
    }

    /// Two blocks are equal when all of their entries match.
    pub fn eql(_: @This(), lhs: Block, rhs: Block) bool {
        return std.mem.eql(u44, &lhs, &rhs);
    }
};

/// Maps a block's contents to a `u16` value associated with that block.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
29
/// Builds the two-stage case-mapping tables from the embedded
/// `UnicodeData.txt` and writes them as Zig source to the file named by the
/// first command-line argument.
///
/// Every stage-2 entry is a `u44` packed as follows:
///   bit 0        set when the code point has a lowercase mapping
///   bit 1        set when the code point has an uppercase mapping
///   bits 2..22   the lowercase mapping
///   bits 23..43  the uppercase mapping
/// Stage 1 stores, for each run of `block_size` code points, the offset in
/// stage 2 at which that run's (deduplicated) block begins.
///
/// Panics when no output path argument is supplied; parse, allocation and
/// I/O failures are returned as errors.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var lower_map = std.AutoHashMap(u21, u21).init(allocator);
    defer lower_map.deinit();

    var upper_map = std.AutoHashMap(u21, u21).init(allocator);
    defer upper_map.deinit();

    // Process UnicodeData.txt

    var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
    while (in_reader.takeDelimiterInclusive('\n')) |raw_line| {
        // The slice still ends with its line terminator, so strip it before
        // the blank-line test; without this the guard could never trigger and
        // an empty line would be handed to `parseInt`.
        const line = std.mem.trim(u8, raw_line, "\r\n");
        if (line.len == 0) continue;

        var field_iter = std.mem.splitScalar(u8, line, ';');

        // Field 0 is the code point. Parse it up front so the mappings below
        // are never stored under an uninitialised key.
        const cp = try std.fmt.parseInt(u21, field_iter.first(), 16);

        // `first()` consumed field 0, so numbering resumes at 1.
        var i: usize = 1;
        while (field_iter.next()) |field| : (i += 1) {
            if (field.len == 0) continue;

            switch (i) {
                // Uppercase mapping
                12 => try upper_map.put(cp, try std.fmt.parseInt(u21, field, 16)),

                // Lowercase mapping
                13 => try lower_map.put(cp, try std.fmt.parseInt(u21, field, 16)),

                else => {},
            }
        }
    } else |err| switch (err) {
        error.EndOfStream => {},
        else => {
            return err;
        },
    }

    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    var stage1 = std.array_list.Managed(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.array_list.Managed(u44).init(allocator);
    defer stage2.deinit();

    var block: Block = [_]u44{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        var case_prop: u44 = 0;

        if (lower_map.get(cp)) |lower| {
            // Lowercase mapping in bits 2..22, presence flag in bit 0.
            case_prop |= @as(u44, lower) << 2 | 1;
        }

        if (upper_map.get(cp)) |upper| {
            // Uppercase mapping in bits 23..43, presence flag in bit 1.
            case_prop |= @as(u44, upper) << (2 + 21) | 2;
        }

        block[block_len] = case_prop;
        block_len += 1;

        // Keep filling until the block is complete or the last code point
        // has been processed.
        if (block_len < block_size and cp != 0x10ffff) continue;

        // Identical blocks are stored once in stage 2; stage 1 records the
        // offset of the shared copy.
        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            gop.value_ptr.* = @intCast(stage2.items.len);
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    // Skip the program name; the next argument is the output path.
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    var write_buf: [4096]u8 = undefined;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var writer = out_file.writer(&write_buf);

    try writer.interface.print(
        \\//! This file is auto-generated. Do not edit.
        \\
        \\pub const s1: [{}]u16 = .{{
    , .{stage1.items.len});
    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});

    try writer.interface.print(
        \\
        \\}};
        \\
        \\pub const s2: [{}]u44 = .{{
    , .{stage2.items.len});
    for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});

    try writer.interface.writeAll(
        \\};
    );
    // The writer is buffered, so push out whatever remains before returning.
    try writer.interface.flush();
}