diff options
Diffstat (limited to 'codegen/scripts.zig')
| -rw-r--r-- | codegen/scripts.zig | 309 |
1 files changed, 309 insertions, 0 deletions
diff --git a/codegen/scripts.zig b/codegen/scripts.zig new file mode 100644 index 0000000..e985c1e --- /dev/null +++ b/codegen/scripts.zig | |||
| @@ -0,0 +1,309 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | |||
/// Script identifiers recognised by this generator.
///
/// Member names are matched verbatim (via `std.meta.stringToEnum`) against
/// the script field of each data line in `Scripts.txt`, so the spelling must
/// stay exactly as it appears in that file.
///
/// Declaration order is significant: `main` stores `@intFromEnum(script)`
/// in the generated tables, so reordering or inserting members changes the
/// emitted values. The tag is a `u8` because the tables store one byte per
/// script value.
const Script = enum(u8) {
    /// Tag value 0; `main` uses 0 for code points absent from the input.
    none,
    Adlam,
    Ahom,
    Anatolian_Hieroglyphs,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Bassa_Vah,
    Batak,
    Bengali,
    Bhaiksuki,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Buhid,
    Canadian_Aboriginal,
    Carian,
    Caucasian_Albanian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Common,
    Coptic,
    Cuneiform,
    Cypriot,
    Cypro_Minoan,
    Cyrillic,
    Deseret,
    Devanagari,
    Dives_Akuru,
    Dogra,
    Duployan,
    Egyptian_Hieroglyphs,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Grantha,
    Greek,
    Gujarati,
    Gunjala_Gondi,
    Gurmukhi,
    Han,
    Hangul,
    Hanifi_Rohingya,
    Hanunoo,
    Hatran,
    Hebrew,
    Hiragana,
    Imperial_Aramaic,
    Inherited,
    Inscriptional_Pahlavi,
    Inscriptional_Parthian,
    Javanese,
    Kaithi,
    Kannada,
    Katakana,
    Kawi,
    Kayah_Li,
    Kharoshthi,
    Khitan_Small_Script,
    Khmer,
    Khojki,
    Khudawadi,
    Lao,
    Latin,
    Lepcha,
    Limbu,
    Linear_A,
    Linear_B,
    Lisu,
    Lycian,
    Lydian,
    Mahajani,
    Makasar,
    Malayalam,
    Mandaic,
    Manichaean,
    Marchen,
    Masaram_Gondi,
    Medefaidrin,
    Meetei_Mayek,
    Mende_Kikakui,
    Meroitic_Cursive,
    Meroitic_Hieroglyphs,
    Miao,
    Modi,
    Mongolian,
    Mro,
    Multani,
    Myanmar,
    Nabataean,
    Nag_Mundari,
    Nandinagari,
    New_Tai_Lue,
    Newa,
    Nko,
    Nushu,
    Nyiakeng_Puachue_Hmong,
    Ogham,
    Ol_Chiki,
    Old_Hungarian,
    Old_Italic,
    Old_North_Arabian,
    Old_Permic,
    Old_Persian,
    Old_Sogdian,
    Old_South_Arabian,
    Old_Turkic,
    Old_Uyghur,
    Oriya,
    Osage,
    Osmanya,
    Pahawh_Hmong,
    Palmyrene,
    Pau_Cin_Hau,
    Phags_Pa,
    Phoenician,
    Psalter_Pahlavi,
    Rejang,
    Runic,
    Samaritan,
    Saurashtra,
    Sharada,
    Shavian,
    Siddham,
    SignWriting,
    Sinhala,
    Sogdian,
    Sora_Sompeng,
    Soyombo,
    Sundanese,
    Syloti_Nagri,
    Syriac,
    Tagalog,
    Tagbanwa,
    Tai_Le,
    Tai_Tham,
    Tai_Viet,
    Takri,
    Tamil,
    Tangsa,
    Tangut,
    Telugu,
    Thaana,
    Thai,
    Tibetan,
    Tifinagh,
    Tirhuta,
    Toto,
    Ugaritic,
    Vai,
    Vithkuqi,
    Wancho,
    Warang_Citi,
    Yezidi,
    Yi,
    Zanabazar_Square,
};
| 170 | |||
/// Number of entries in one block.
const block_size = 256;

/// A fixed-size run of `block_size` bytes; used by value as a hash-map key.
const Block = [block_size]u8;

/// Hash-map context that treats a `Block` as a value: two blocks are the
/// same key exactly when all of their bytes match.
const BlockContext = struct {
    /// Hashes the full contents of `key` with Wyhash (seed 0).
    pub fn hash(_: @This(), key: Block) u64 {
        var state = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&state, key, .DeepRecursive);
        return state.final();
    }

    /// Byte-wise equality of two blocks.
    pub fn eql(_: @This(), lhs: Block, rhs: Block) bool {
        return std.mem.eql(u8, lhs[0..], rhs[0..]);
    }
};

/// Maps a block's contents to a `u16` value chosen by the caller.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
| 190 | |||
/// Builds a three-stage script lookup table from `data/unicode/Scripts.txt`
/// and writes it, deflate-compressed, to the path given as the first
/// command-line argument.
///
/// Table layout:
///   * stage3 — the distinct `Script` tag values, in first-seen order.
///   * stage2 — the unique 256-entry blocks, concatenated; each entry is an
///     index into stage3.
///   * stage1 — one `u16` per 256-code-point block of 0..0x10FFFF, holding
///     the offset in stage2 at which that block's data starts.
///
/// Stream layout (before compression): `u16` stage1 length, stage1 entries
/// as `u16`; `u16` stage2 length, stage2 bytes; `u8` stage3 length, stage3
/// bytes. Multi-byte integers use the byte order of the CPU this program
/// was compiled for.
///
/// Errors: propagates I/O, parse and allocation errors; returns
/// `error.UnknownScript` when a script name is not a member of `Script`.
/// Panics if no output path argument is supplied.
pub fn main() !void {
    // Everything is allocated from one arena that is released on exit.
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Code point -> script tag value, for every code point listed in the input.
    var flat_map = std.AutoHashMap(u21, u8).init(allocator);
    defer flat_map.deinit();

    var line_buf: [4096]u8 = undefined;

    // Process Scripts.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/Scripts.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |raw_line| {
        // Strip surrounding blanks and a trailing '\r' so that CRLF input
        // and whitespace-only lines are skipped instead of reaching parseInt.
        const line = std.mem.trim(u8, raw_line, " \t\r");
        if (line.len == 0 or line[0] == '#') continue;

        // Drop the trailing "# ..." comment, if any.
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
        // Inclusive [first, last] range; set by field 0 before field 1 reads it.
        var current_code: [2]u21 = undefined;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    // Code point(s): either "XXXX" or "XXXX..YYYY" in hex.
                    if (std.mem.indexOf(u8, field, "..")) |dots| {
                        current_code = .{
                            try std.fmt.parseInt(u21, field[0..dots], 16),
                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
                        };
                    } else {
                        const code = try std.fmt.parseInt(u21, field, 16);
                        current_code = .{ code, code };
                    }
                },
                1 => {
                    // Script
                    const script = std.meta.stringToEnum(Script, field) orelse {
                        std.debug.print("Unknown script: {s}\n", .{field});
                        return error.UnknownScript;
                    };
                    for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), @intFromEnum(script));
                },
                else => {},
            }
        }
    }

    // Block contents -> offset of that block inside stage2.
    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    var stage1 = std.ArrayList(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.ArrayList(u8).init(allocator);
    defer stage2.deinit();

    var stage3 = std.ArrayList(u8).init(allocator);
    defer stage3.deinit();

    var block: Block = [_]u8{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        // Code points not listed in the input get tag value 0.
        const script = flat_map.get(cp) orelse 0;

        // Find this script value in stage3, appending it on first sight.
        const stage3_idx = blk: {
            for (stage3.items, 0..) |script_i, j| {
                if (script == script_i) break :blk j;
            }
            try stage3.append(script);
            break :blk stage3.items.len - 1;
        };

        // Process block
        block[block_len] = @intCast(stage3_idx);
        block_len += 1;

        // Keep filling until the block is complete (or the last code point).
        if (block_len < block_size and cp != 0x10ffff) continue;

        // Store each distinct block once; identical blocks share an offset.
        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            gop.value_ptr.* = @intCast(stage2.items.len);
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip(); // program name
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    // Each stage is written as a length prefix followed by its entries.
    const endian = builtin.cpu.arch.endian();
    try writer.writeInt(u16, @intCast(stage1.items.len), endian);
    for (stage1.items) |i| try writer.writeInt(u16, i, endian);

    try writer.writeInt(u16, @intCast(stage2.items.len), endian);
    for (stage2.items) |i| try writer.writeInt(u8, i, endian);

    try writer.writeInt(u8, @intCast(stage3.items.len), endian);
    for (stage3.items) |i| try writer.writeInt(u8, i, endian);

    try out_comp.flush();
}