summaryrefslogtreecommitdiff
path: root/codegen/canon.zig
diff options
context:
space:
mode:
Diffstat (limited to 'codegen/canon.zig')
-rw-r--r--codegen/canon.zig182
1 files changed, 159 insertions, 23 deletions
diff --git a/codegen/canon.zig b/codegen/canon.zig
index d95a905..e92be5d 100644
--- a/codegen/canon.zig
+++ b/codegen/canon.zig
@@ -1,12 +1,38 @@
1const std = @import("std"); 1const std = @import("std");
2const builtin = @import("builtin"); 2const builtin = @import("builtin");
3 3
/// Number of consecutive code points covered by one lookup block.
const block_size = 256;

/// One block of the lookup table: a `Canonicalization` entry for each of
/// `block_size` consecutive code points.
const Block = [block_size]Canonicalization;

/// Decomposition record for a single code point.
///
/// `len` is the number of entries of `cps` that are in use. `main` stores 0
/// for code points without a canonical decomposition, 1 for a singleton and 2
/// for a pair; unused slots are left at 0.
const Canonicalization = struct {
    len: u3 = 0,
    cps: [2]u21 = [_]u21{0} ** 2,
};

/// Hash-map context that keys on the full contents of a `Block`.
///
/// Both `hash` and `eql` are derived from the type structure, so they stay
/// consistent with each other if `Canonicalization` ever gains a field.
const BlockContext = struct {
    /// Hashes every entry of `key` (all fields of every `Canonicalization`).
    pub fn hash(_: @This(), key: Block) u64 {
        var hasher = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&hasher, key, .DeepRecursive);
        return hasher.final();
    }

    /// Returns true when both blocks hold identical entries at every index.
    pub fn eql(_: @This(), lhs: Block, rhs: Block) bool {
        return std.meta.eql(lhs, rhs);
    }
};

/// Maps a block's contents to a `u16`, letting identical blocks be detected
/// and stored only once (`main` records the block's starting offset).
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
30
4pub fn main() anyerror!void { 31pub fn main() anyerror!void {
5 var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 32 var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
6 defer arena.deinit(); 33 defer arena.deinit();
7 const allocator = arena.allocator(); 34 const allocator = arena.allocator();
8 35
9 var write_buf: [4096]u8 = undefined;
10 // Process UnicodeData.txt 36 // Process UnicodeData.txt
11 var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); 37 var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
12 var args_iter = try std.process.argsWithAllocator(allocator); 38 var args_iter = try std.process.argsWithAllocator(allocator);
@@ -14,53 +40,163 @@ pub fn main() anyerror!void {
14 _ = args_iter.skip(); 40 _ = args_iter.skip();
15 const output_path = args_iter.next() orelse @panic("No output file arg!"); 41 const output_path = args_iter.next() orelse @panic("No output file arg!");
16 42
17 var out_file = try std.fs.cwd().createFile(output_path, .{}); 43 var canon_map = std.AutoHashMap(u21, Canonicalization).init(allocator);
18 defer out_file.close(); 44 defer canon_map.deinit();
19 var file_writer = out_file.writer(&write_buf); 45
20 var writer = &file_writer.interface; 46 var composite_set = std.AutoArrayHashMap(u21, [2]u21).init(allocator);
21 const endian = builtin.cpu.arch.endian();
22 47
23 lines: while (in_reader.takeDelimiterInclusive('\n')) |took| { 48 while (in_reader.takeDelimiterInclusive('\n')) |line| {
24 const line = std.mem.trimRight(u8, took, "\n");
25 if (line.len == 0) continue; 49 if (line.len == 0) continue;
26 50
27 var field_iter = std.mem.splitScalar(u8, line, ';'); 51 var field_iter = std.mem.splitScalar(u8, line, ';');
28 var cps: [3]u24 = undefined; 52 var cp: u21 = undefined;
29 var len: u8 = 2;
30 53
31 var i: usize = 0; 54 var i: usize = 0;
32 while (field_iter.next()) |field| : (i += 1) { 55 while (field_iter.next()) |field| : (i += 1) {
56 if (field.len == 0) continue;
57
33 switch (i) { 58 switch (i) {
34 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), 59 0 => cp = try std.fmt.parseInt(u21, field, 16),
35 60
36 5 => { 61 5 => {
37 // Not canonical. 62 // Not canonical.
38 if (field.len == 0 or field[0] == '<') continue :lines; 63 if (field[0] == '<') continue;
64
39 if (std.mem.indexOfScalar(u8, field, ' ')) |space| { 65 if (std.mem.indexOfScalar(u8, field, ' ')) |space| {
40 // Canonical 66 // Canonical
41 len = 3; 67 const c0, const c1 = .{
42 cps[1] = try std.fmt.parseInt(u24, field[0..space], 16); 68 try std.fmt.parseInt(u21, field[0..space], 16),
43 cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16); 69 try std.fmt.parseInt(u21, field[space + 1 ..], 16),
70 };
71 try canon_map.put(cp, Canonicalization{
72 .len = 2,
73 .cps = [_]u21{ c0, c1 },
74 });
75 try composite_set.put(cp, [_]u21{ c0, c1 });
44 } else { 76 } else {
45 // Singleton 77 // Singleton
46 cps[1] = try std.fmt.parseInt(u24, field, 16); 78 try canon_map.put(cp, Canonicalization{
79 .len = 1,
80 .cps = [_]u21{
81 try std.fmt.parseInt(u21, field, 16),
82 0,
83 },
84 });
47 } 85 }
48 }, 86 },
49 87
50 2 => if (line[0] == '<') continue :lines,
51
52 else => {}, 88 else => {},
53 } 89 }
54 } 90 }
55
56 try writer.writeInt(u8, @intCast(len), endian);
57 for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
58 } else |err| switch (err) { 91 } else |err| switch (err) {
59 error.EndOfStream => {}, 92 error.EndOfStream => {},
60 else => { 93 else => {
61 return err; 94 return err;
62 }, 95 },
63 } 96 }
64 try writer.writeInt(u16, 0, endian); 97
65 try writer.flush(); 98 // Build multi-tiered lookup tables for decompositions
99 var blocks_map = BlockMap.init(allocator);
100 defer blocks_map.deinit();
101
102 var stage1 = std.array_list.Managed(u16).init(allocator);
103 defer stage1.deinit();
104
105 var stage2 = std.array_list.Managed(Canonicalization).init(allocator);
106 defer stage2.deinit();
107
108 var block: Block = [_]Canonicalization{.{}} ** block_size;
109 var block_len: u16 = 0;
110
111 for (0..0x110000) |i| {
112 const cp: u21 = @intCast(i);
113
114 const canon: Canonicalization = canon_map.get(cp) orelse .{};
115
116 block[block_len] = canon;
117 block_len += 1;
118
119 if (block_len < block_size and cp != 0x10ffff) continue;
120
121 const gop = try blocks_map.getOrPut(block);
122 if (!gop.found_existing) {
123 gop.value_ptr.* = @intCast(stage2.items.len);
124 try stage2.appendSlice(&block);
125 }
126
127 try stage1.append(gop.value_ptr.*);
128 block_len = 0;
129 }
130
131 var write_buf: [4096]u8 = undefined;
132 var out_file = try std.fs.cwd().createFile(output_path, .{});
133 defer out_file.close();
134 var writer = out_file.writer(&write_buf);
135
136 try writer.interface.print(
137 \\//! This file is auto-generated. Do not edit.
138 \\
139 \\pub const Canonicalization = struct {{
140 \\ len: u3,
141 \\ cps: [2]u21,
142 \\}};
143 \\
144 \\pub const s1: [{}]u16 = .{{
145 , .{stage1.items.len});
146 for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
147
148 try writer.interface.print(
149 \\
150 \\}};
151 \\
152 \\pub const s2: [{}]Canonicalization = .{{
153 , .{stage2.items.len});
154 for (stage2.items) |entry| {
155 try writer.interface.print(".{{ .len = {}, .cps = .{{ {}, {} }} }}, ", .{
156 entry.len,
157 entry.cps[0],
158 entry.cps[1],
159 });
160 }
161
162 const composite = composite_set.keys();
163 // TODO: cut
164 try writer.interface.print(
165 \\
166 \\}};
167 \\
168 \\pub const composite: [{}]u21 = .{{
169 , .{composite.len});
170 for (composite) |entry| try writer.interface.print("{}, ", .{entry});
171
172 try writer.interface.writeAll(
173 \\};
174 );
175
176 try writer.interface.print(
177 \\
178 \\ pub const c_map: [{}]struct {{ [2]u21, u21 }} = .{{
179 , .{composite.len});
180 for (composite) |comp| {
181 const canon = canon_map.get(comp).?;
182 std.debug.assert(canon.len == 2);
183 try writer.interface.print(
184 \\ .{{ .{{{}, {}}}, {}}},
185 ,
186 .{ canon.cps[0], canon.cps[1], comp },
187 );
188 }
189 // var c_entries = composite_set.iterator();
190 // while (c_entries.next()) |entry| {
191 // try writer.interface.print(
192 // \\ .{{ .{{{}, {}}}, {}}},
193 // ,
194 // .{ entry.value_ptr[0], entry.value_ptr[1], entry.key_ptr.* },
195 // );
196 // }
197 try writer.interface.writeAll(
198 \\};
199 );
200
201 try writer.interface.flush();
66} 202}