summaryrefslogtreecommitdiff
path: root/codegen
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-26 18:29:59 -0400
committerGravatar Jose Colon Rodriguez2024-02-26 18:29:59 -0400
commit5e9a06c217fbd09aa8cf95da139852560f3da7d0 (patch)
tree7e332579ca97656fc33521f4f00da2993c133e15 /codegen
parentUsing separate data struct model. (diff)
downloadzg-5e9a06c217fbd09aa8cf95da139852560f3da7d0.tar.gz
zg-5e9a06c217fbd09aa8cf95da139852560f3da7d0.tar.xz
zg-5e9a06c217fbd09aa8cf95da139852560f3da7d0.zip
Using NormData and CanonData in Normalizer
Diffstat (limited to 'codegen')
-rw-r--r--codegen/canon.zig68
1 files changed, 68 insertions, 0 deletions
diff --git a/codegen/canon.zig b/codegen/canon.zig
new file mode 100644
index 0000000..9d72edd
--- /dev/null
+++ b/codegen/canon.zig
@@ -0,0 +1,68 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
/// Generates the compressed canonical decomposition table.
///
/// Reads `data/unicode/UnicodeData.txt` relative to the current working
/// directory and writes a deflate-compressed stream to the path given as the
/// first command-line argument.
///
/// Each emitted record is a `u8` count (2 or 3) followed by that many `u24`
/// values: the source code point, then its one or two decomposition code
/// points. Multi-byte values use the byte order of the CPU this program is
/// compiled for. After the last record a zero `u16` is written.
/// NOTE(review): presumably the consumer stops on a zero count and reads with
/// the same byte order; confirm against the reader.
///
/// Errors: propagates file, allocation, write and hex-parse failures. Panics
/// if no output path argument is supplied.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Process UnicodeData.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    const endian = builtin.cpu.arch.endian();
    var line_buf: [4096]u8 = undefined;

    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0) continue;

        var field_iter = std.mem.splitScalar(u8, line, ';');
        var cps: [3]u24 = undefined;
        // Number of valid entries in `cps`; stays 0 until the decomposition
        // field has been parsed, so `cps` is never read while undefined.
        var len: u8 = 0;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                // Field 0: the code point this line describes.
                0 => cps[0] = try std.fmt.parseInt(u24, field, 16),

                // Field 5: the decomposition mapping.
                5 => {
                    // Not canonical: no mapping at all, or one that starts
                    // with a `<tag>`.
                    if (field.len == 0 or field[0] == '<') continue :lines;

                    if (std.mem.indexOfScalar(u8, field, ' ')) |space| {
                        // Canonical pair.
                        cps[1] = try std.fmt.parseInt(u24, field[0..space], 16);
                        cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16);
                        len = 3;
                    } else {
                        // Singleton.
                        cps[1] = try std.fmt.parseInt(u24, field, 16);
                        len = 2;
                    }

                    // Nothing after the decomposition field is used.
                    break;
                },

                else => {},
            }
        }

        // The line ended before field 5, so there is no mapping to emit.
        if (len == 0) continue;

        try writer.writeInt(u8, len, endian);
        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
    }

    // Zero marker after the final record.
    try writer.writeInt(u16, 0, endian);
    try out_comp.flush();
}