summaryrefslogtreecommitdiff
path: root/codegen/dwp.zig
diff options
context:
space:
mode:
Diffstat (limited to 'codegen/dwp.zig')
-rw-r--r--codegen/dwp.zig243
1 files changed, 243 insertions, 0 deletions
diff --git a/codegen/dwp.zig b/codegen/dwp.zig
new file mode 100644
index 0000000..a8cef57
--- /dev/null
+++ b/codegen/dwp.zig
@@ -0,0 +1,243 @@
1const std = @import("std");
2
/// Number of consecutive code points stored in one block.
const block_size = 256;

/// A fixed-size run of `block_size` per-code-point values.
const Block = [block_size]i3;

/// Hash-map context that lets a whole `Block` (compared by content) act as a key.
const BlockContext = struct {
    /// Hashes every element of `key` with a zero-seeded Wyhash.
    pub fn hash(_: BlockContext, key: Block) u64 {
        var state = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&state, key, .DeepRecursive);
        return state.final();
    }

    /// Two blocks are equal when all of their elements are pairwise equal.
    pub fn eql(_: BlockContext, lhs: Block, rhs: Block) bool {
        for (lhs, rhs) |l, r| {
            if (l != r) return false;
        }
        return true;
    }
};

/// Maps a block's contents to a `u16` value associated with that block.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
22
/// Generates a two-stage display-width lookup table as Zig source.
///
/// Reads `unicode/extracted/DerivedEastAsianWidth.txt` and
/// `unicode/extracted/DerivedGeneralCategory.txt` relative to the current
/// working directory, computes a width for every code point in
/// 0..0x10FFFF, deduplicates the widths in blocks of `block_size`, and writes
/// `stage_1` / `stage_2` array declarations to the file named by the first
/// command-line argument.
///
/// Returns any I/O, parse, or allocation error encountered, and
/// `error.MalformedMissingLine` for an `@missing` line without a `;`.
/// Panics if no output path argument is given.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Sparse code point -> width map; code points absent from it default to 1 below.
    var flat_map = std.AutoHashMap(u21, i3).init(allocator);
    defer flat_map.deinit();

    var line_buf: [4096]u8 = undefined;

    // Process DerivedEastAsianWidth.txt
    var deaw_file = try std.fs.cwd().openFile("unicode/extracted/DerivedEastAsianWidth.txt", .{});
    defer deaw_file.close();
    var deaw_buf = std.io.bufferedReader(deaw_file.reader());
    const deaw_reader = deaw_buf.reader();

    while (try deaw_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0) continue;

        // @missing ranges: every listed range except the whole-codespace
        // default is recorded as width 2.
        const missing_prefix = "# @missing: ";
        if (std.mem.startsWith(u8, line, missing_prefix)) {
            const semi = std.mem.indexOfScalar(u8, line, ';') orelse return error.MalformedMissingLine;
            const range = try parseCodePointRange(line[missing_prefix.len..semi]);
            if (range[0] == 0 and range[1] == 0x10ffff) continue;
            try setRangeWidth(&flat_map, range, 2);
            continue;
        }

        if (line[0] == '#') continue;

        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        // Field 0 is the code point or range, field 1 the East Asian Width value.
        var fields = std.mem.tokenizeAny(u8, no_comment, "; ");
        const range = try parseCodePointRange(fields.next() orelse continue);
        const width_field = fields.next() orelse continue;

        // Wide and Fullwidth code points occupy two cells.
        if (std.mem.eql(u8, width_field, "W") or std.mem.eql(u8, width_field, "F")) {
            try setRangeWidth(&flat_map, range, 2);
        }
    }

    // Process DerivedGeneralCategory.txt
    var dgc_file = try std.fs.cwd().openFile("unicode/extracted/DerivedGeneralCategory.txt", .{});
    defer dgc_file.close();
    var dgc_buf = std.io.bufferedReader(dgc_file.reader());
    const dgc_reader = dgc_buf.reader();

    while (try dgc_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0 or line[0] == '#') continue;
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        // Field 0 is the code point or range, field 1 the general category.
        var fields = std.mem.tokenizeAny(u8, no_comment, "; ");
        const range = try parseCodePointRange(fields.next() orelse continue);
        const category = fields.next() orelse continue;

        // Nonspacing_Mark, Enclosing_Mark and Spacing_Mark.
        const is_mark = std.mem.eql(u8, category, "Mn") or
            std.mem.eql(u8, category, "Me") or
            std.mem.eql(u8, category, "Mc");

        // Format characters, except those whose trailing comment (the
        // character name) mentions ARABIC; the full line is searched on purpose.
        const is_non_arabic_format = std.mem.eql(u8, category, "Cf") and
            std.mem.indexOf(u8, line, "ARABIC") == null;

        if (is_mark or is_non_arabic_format) try setRangeWidth(&flat_map, range, 0);
    }

    // Deduplicate identical blocks: `blocks_map` remembers the stage-2 offset
    // at which each distinct block was first stored.
    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    var stage1 = std.ArrayList(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.ArrayList(i3).init(allocator);
    defer stage2.deinit();

    var block: Block = [_]i3{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        var width = flat_map.get(cp) orelse 1;

        // Specific overrides
        switch (cp) {
            // Three-em dash
            0x2e3b => width = 3,

            // C0/C1 control codes. U+0020 and U+00A0 (NO-BREAK SPACE) are
            // deliberately excluded: they are spacing characters, not controls.
            0x00...0x1f,
            0x80...0x9f,

            // Line separator
            0x2028,

            // Paragraph separator
            0x2029,

            // Hangul syllable and ignorable.
            0x1160...0x11ff,
            0xd7b0...0xd7ff,
            0x2060...0x206f,
            0xfff0...0xfff8,
            0xe0000...0xe0fff,

            // Sk with EMOJI MODIFIER comment
            0x1f3fb...0x1f3ff,
            => width = 0,

            // Two-em dash
            0x2e3a,

            // Regional indicators
            0x1f1e6...0x1f200,

            // CJK Blocks
            0x3400...0x4dbf, // CJK Unified Ideographs Extension A
            0x4e00...0x9fff, // CJK Unified Ideographs
            0xf900...0xfaff, // CJK Compatibility Ideographs
            0x20000...0x2fffd, // Plane 2
            0x30000...0x3fffd, // Plane 3
            => width = 2,

            else => {},
        }

        // The rules below are applied last, so they win over everything above.

        // ASCII
        if (0x20 <= cp and cp < 0x7f) width = 1;

        // Soft hyphen
        if (cp == 0xad) width = 1;

        // Backspace and delete
        if (cp == 0x8 or cp == 0x7f) width = -1;

        // Process block
        block[block_len] = width;
        block_len += 1;

        // Keep filling until the block is complete (or the last code point is reached).
        if (block_len < block_size and cp != 0x10ffff) continue;

        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            // First time this block is seen: append it and remember its offset.
            gop.value_ptr.* = @intCast(stage2.items.len);
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    var args_iter = std.process.args();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_buf = std.io.bufferedWriter(out_file.writer());
    const writer = out_buf.writer();

    try writer.writeAll("const std = @import(\"std\");\n");

    try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len});
    try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len});
    for (stage1.items) |v| try writer.print("{},", .{v});
    try writer.writeAll("};\n");

    try writer.print("pub const stage_2 = [{}]i3{{", .{stage2.items.len});
    for (stage2.items) |v| try writer.print("{},", .{v});
    try writer.writeAll("};\n");

    try out_buf.flush();
}

/// Parses a hexadecimal `XXXX` or `XXXX..YYYY` field into an inclusive
/// `{ first, last }` pair; a single code point yields `first == last`.
fn parseCodePointRange(field: []const u8) std.fmt.ParseIntError![2]u21 {
    if (std.mem.indexOf(u8, field, "..")) |dots| {
        const first = try std.fmt.parseInt(u21, field[0..dots], 16);
        const last = try std.fmt.parseInt(u21, field[dots + 2 ..], 16);
        return [2]u21{ first, last };
    }

    const code = try std.fmt.parseInt(u21, field, 16);
    return [2]u21{ code, code };
}

/// Records `width` for every code point in the inclusive `range`,
/// overwriting any width stored earlier.
fn setRangeWidth(flat_map: *std.AutoHashMap(u21, i3), range: [2]u21, width: i3) !void {
    for (range[0]..range[1] + 1) |cp| try flat_map.put(@intCast(cp), width);
}