summaryrefslogtreecommitdiff
path: root/codegen/scripts.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-27 21:52:02 -0400
committerGravatar Jose Colon Rodriguez2024-03-27 21:52:02 -0400
commit4ce891a8ce5336da39180964792110e131756cdd (patch)
treeb4ff0180157bb49e15d2c36f2cf0cdaab1a24535 /codegen/scripts.zig
parentFriendly general category methods (diff)
downloadzg-4ce891a8ce5336da39180964792110e131756cdd.tar.gz
zg-4ce891a8ce5336da39180964792110e131756cdd.tar.xz
zg-4ce891a8ce5336da39180964792110e131756cdd.zip
ScriptsData and made all Datas const
Diffstat (limited to 'codegen/scripts.zig')
-rw-r--r--codegen/scripts.zig309
1 files changed, 309 insertions, 0 deletions
diff --git a/codegen/scripts.zig b/codegen/scripts.zig
new file mode 100644
index 0000000..e985c1e
--- /dev/null
+++ b/codegen/scripts.zig
@@ -0,0 +1,309 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
/// Script property values recognised by this generator.
///
/// Member names are matched verbatim against the script field of the input
/// data via `std.meta.stringToEnum`, and the integer value of each member is
/// what ends up in the generated table, so neither the spelling nor the order
/// of the members may change.
///
/// The tag type is pinned to `u8` because the values are stored in `u8` table
/// cells, and `none` is pinned to `0` because `0` is used as the value for
/// code points that have no script assigned.
const Script = enum(u8) {
    none = 0,
    Adlam,
    Ahom,
    Anatolian_Hieroglyphs,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Bassa_Vah,
    Batak,
    Bengali,
    Bhaiksuki,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Buhid,
    Canadian_Aboriginal,
    Carian,
    Caucasian_Albanian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Common,
    Coptic,
    Cuneiform,
    Cypriot,
    Cypro_Minoan,
    Cyrillic,
    Deseret,
    Devanagari,
    Dives_Akuru,
    Dogra,
    Duployan,
    Egyptian_Hieroglyphs,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Grantha,
    Greek,
    Gujarati,
    Gunjala_Gondi,
    Gurmukhi,
    Han,
    Hangul,
    Hanifi_Rohingya,
    Hanunoo,
    Hatran,
    Hebrew,
    Hiragana,
    Imperial_Aramaic,
    Inherited,
    Inscriptional_Pahlavi,
    Inscriptional_Parthian,
    Javanese,
    Kaithi,
    Kannada,
    Katakana,
    Kawi,
    Kayah_Li,
    Kharoshthi,
    Khitan_Small_Script,
    Khmer,
    Khojki,
    Khudawadi,
    Lao,
    Latin,
    Lepcha,
    Limbu,
    Linear_A,
    Linear_B,
    Lisu,
    Lycian,
    Lydian,
    Mahajani,
    Makasar,
    Malayalam,
    Mandaic,
    Manichaean,
    Marchen,
    Masaram_Gondi,
    Medefaidrin,
    Meetei_Mayek,
    Mende_Kikakui,
    Meroitic_Cursive,
    Meroitic_Hieroglyphs,
    Miao,
    Modi,
    Mongolian,
    Mro,
    Multani,
    Myanmar,
    Nabataean,
    Nag_Mundari,
    Nandinagari,
    New_Tai_Lue,
    Newa,
    Nko,
    Nushu,
    Nyiakeng_Puachue_Hmong,
    Ogham,
    Ol_Chiki,
    Old_Hungarian,
    Old_Italic,
    Old_North_Arabian,
    Old_Permic,
    Old_Persian,
    Old_Sogdian,
    Old_South_Arabian,
    Old_Turkic,
    Old_Uyghur,
    Oriya,
    Osage,
    Osmanya,
    Pahawh_Hmong,
    Palmyrene,
    Pau_Cin_Hau,
    Phags_Pa,
    Phoenician,
    Psalter_Pahlavi,
    Rejang,
    Runic,
    Samaritan,
    Saurashtra,
    Sharada,
    Shavian,
    Siddham,
    SignWriting,
    Sinhala,
    Sogdian,
    Sora_Sompeng,
    Soyombo,
    Sundanese,
    Syloti_Nagri,
    Syriac,
    Tagalog,
    Tagbanwa,
    Tai_Le,
    Tai_Tham,
    Tai_Viet,
    Takri,
    Tamil,
    Tangsa,
    Tangut,
    Telugu,
    Thaana,
    Thai,
    Tibetan,
    Tifinagh,
    Tirhuta,
    Toto,
    Ugaritic,
    Vai,
    Vithkuqi,
    Wancho,
    Warang_Citi,
    Yezidi,
    Yi,
    Zanabazar_Square,
};
170
/// Number of entries held by a single block.
const block_size = 256;

/// A fixed-size run of `u8` entries, used by value as a hash-map key.
const Block = [block_size]u8;

/// Hash-map context that hashes and compares blocks by their contents.
const BlockContext = struct {
    const Self = @This();

    /// Hashes every element of `key` with a zero-seeded Wyhash.
    pub fn hash(_: Self, key: Block) u64 {
        var state = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&state, key, .DeepRecursive);
        return state.final();
    }

    /// Two blocks are equal when all of their elements are equal.
    pub fn eql(_: Self, lhs: Block, rhs: Block) bool {
        return std.mem.eql(u8, lhs[0..], rhs[0..]);
    }
};

/// Maps a block's contents to a `u16` value.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
190
/// Generates the compressed three-stage script lookup table.
///
/// Reads `data/unicode/Scripts.txt` (relative to the current working
/// directory), records the `Script` of every listed code point, folds the
/// result into a three-stage table and writes that table, deflate-compressed,
/// to the path given as the first command-line argument.
///
/// Output layout (integers use the byte order of the CPU this program was
/// compiled for):
///   - `u16` stage-1 length, then the stage-1 entries as `u16` offsets into
///     stage 2 (one entry per block of `block_size` code points);
///   - `u16` stage-2 length, then the stage-2 entries as `u8` indexes into
///     stage 3;
///   - `u8` stage-3 length, then the stage-3 entries as `u8` `Script` values.
///
/// Panics when no output path argument is supplied. Returns an error on I/O
/// or allocation failure, on a malformed code point or reversed range, on an
/// unknown script name (`error.UnknownScript`), or when a table does not fit
/// the integer width used for it in the output.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Resolve the output path first so that a missing argument is reported
    // before any parsing or table building is done.
    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    // Code point -> script value, for every code point listed in the input.
    var flat_map = std.AutoHashMap(u21, u8).init(allocator);
    defer flat_map.deinit();

    var line_buf: [4096]u8 = undefined;

    // Process Scripts.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/Scripts.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |raw_line| {
        // Tolerate CRLF line endings: without this a "blank" line is "\r" and
        // a comment-less data line ends its script name with '\r'.
        const line = std.mem.trimRight(u8, raw_line, "\r");
        if (line.len == 0 or line[0] == '#') continue;

        // Drop the trailing comment, if any.
        const data = if (std.mem.indexOfScalar(u8, line, '#')) |hash_pos| line[0..hash_pos] else line;

        var field_iter = std.mem.tokenizeAny(u8, data, "; ");

        // Field 0: a single code point or an inclusive `first..last` range.
        const range_field = field_iter.next() orelse continue;
        const range: [2]u21 = if (std.mem.indexOf(u8, range_field, "..")) |dots|
            [2]u21{
                try std.fmt.parseInt(u21, range_field[0..dots], 16),
                try std.fmt.parseInt(u21, range_field[dots + 2 ..], 16),
            }
        else single: {
            const code = try std.fmt.parseInt(u21, range_field, 16);
            break :single [2]u21{ code, code };
        };

        // Field 1: the script name. Any further fields are ignored.
        const script_field = field_iter.next() orelse continue;
        const script = std.meta.stringToEnum(Script, script_field) orelse {
            std.debug.print("Unknown script: {s}\n", .{script_field});
            return error.UnknownScript;
        };
        const script_id: u8 = @intFromEnum(script);

        // A reversed range would otherwise trap inside the `for` range below.
        if (range[0] > range[1]) return error.InvalidCodePointRange;

        const first: usize = range[0];
        const last: usize = range[1];
        for (first..last + 1) |cp| try flat_map.put(@intCast(cp), script_id);
    }

    // Block contents -> offset of that block inside stage 2.
    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    var stage1 = std.ArrayList(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.ArrayList(u8).init(allocator);
    defer stage2.deinit();

    var stage3 = std.ArrayList(u8).init(allocator);
    defer stage3.deinit();

    var block: Block = [_]u8{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        // Code points absent from the input get the `none` value.
        const script = flat_map.get(cp) orelse @intFromEnum(Script.none);

        // Stage 3 holds each distinct script value once, in first-seen order.
        const stage3_idx = std.mem.indexOfScalar(u8, stage3.items, script) orelse new: {
            try stage3.append(script);
            break :new stage3.items.len - 1;
        };

        // Process block. Stage 3 is deduplicated over `u8` values, so its
        // indexes never exceed 255 and always fit a `u8`.
        block[block_len] = @intCast(stage3_idx);
        block_len += 1;

        // Keep filling until the block is complete (or the last code point).
        if (block_len < block_size and cp != 0x10ffff) continue;

        // Identical blocks are stored once in stage 2 and shared by offset.
        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            gop.value_ptr.* = std.math.cast(u16, stage2.items.len) orelse return error.Stage2TooLarge;
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    // The length prefixes are narrower than `usize`; refuse to emit a table
    // whose length cannot be represented rather than trapping or truncating.
    const stage1_len = std.math.cast(u16, stage1.items.len) orelse return error.Stage1TooLarge;
    const stage2_len = std.math.cast(u16, stage2.items.len) orelse return error.Stage2TooLarge;
    const stage3_len = std.math.cast(u8, stage3.items.len) orelse return error.Stage3TooLarge;

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    const endian = builtin.cpu.arch.endian();
    try writer.writeInt(u16, stage1_len, endian);
    for (stage1.items) |offset| try writer.writeInt(u16, offset, endian);

    try writer.writeInt(u16, stage2_len, endian);
    for (stage2.items) |index| try writer.writeInt(u8, index, endian);

    try writer.writeInt(u8, stage3_len, endian);
    for (stage3.items) |value| try writer.writeInt(u8, value, endian);

    // NOTE(review): only `flush()` is called here; the compressor is never
    // `close()`d. Confirm the consumer does not need a finalized stream.
    try out_comp.flush();
}