summaryrefslogtreecommitdiff
path: root/codegen
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
committerGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
commit9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch)
tree723760b45ef8ef604b235d10c3c60edfadd0bb70 /codegen
parentRemoved dupe tombstone check in Normalizer (diff)
downloadzg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip
Normalizer.eqlIgnoreCase compatibility caseless matching
Diffstat (limited to 'codegen')
-rw-r--r--codegen/fold.zig76
1 files changed, 76 insertions, 0 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig
new file mode 100644
index 0000000..7977e61
--- /dev/null
+++ b/codegen/fold.zig
@@ -0,0 +1,76 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const fmt = std.fmt;
4const mem = std.mem;
5
/// Generates the compressed case folding table consumed at runtime.
///
/// Reads `data/unicode/CaseFolding.txt` (relative to the current working
/// directory), keeps only the entries whose status is `C` (common) or `F`
/// (full), and writes them deflate-compressed to the path given as the first
/// command line argument.
///
/// Output layout, using the endianness of the machine running this tool:
///   * per entry: one `u8` count `n`, followed by `n` `u24` code points; the
///     first is the source code point, the rest are its case folding.
///   * terminator: a single `u16` zero.
///
/// Fails with `error.MalformedCaseFoldingLine` when a `C`/`F` entry has no
/// mapping, and with `error.CaseFoldingTooLong` when a full folding does not
/// fit in the fixed-size entry buffer.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Process CaseFolding.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    const endian = builtin.cpu.arch.endian();
    var line_buf: [4096]u8 = undefined;

    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |raw_line| {
        // Tolerate CRLF line endings (e.g. a checkout with autocrlf enabled).
        const line = mem.trim(u8, raw_line, "\r");
        if (line.len == 0 or line[0] == '#') continue;

        // Drop the trailing "# NAME" comment, if any.
        const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        var field_iter = mem.tokenizeSequence(u8, no_comment, "; ");
        // cps[0] is the source code point; cps[1..len] is its folding.
        var cps: [4]u24 = undefined;
        // Number of initialized elements in `cps`.
        var len: usize = 0;
        var is_full_fold = false;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                // Source code point.
                0 => {
                    cps[0] = try fmt.parseInt(u24, field, 16);
                    len = 1;
                },

                // Status: only common (C) and full (F) foldings are emitted.
                1 => {
                    is_full_fold = mem.eql(u8, field, "F");
                    if (!is_full_fold and !mem.eql(u8, field, "C")) continue :lines;
                },

                // Mapping.
                2 => {
                    if (is_full_fold) {
                        // Full case fold: space separated list of code points.
                        var cp_iter = mem.tokenizeScalar(u8, field, ' ');
                        while (cp_iter.next()) |cp_str| : (len += 1) {
                            if (len == cps.len) return error.CaseFoldingTooLong;
                            cps[len] = try fmt.parseInt(u24, cp_str, 16);
                        }
                    } else {
                        // Common case fold: a single code point.
                        cps[1] = try fmt.parseInt(u24, field, 16);
                        len = 2;
                    }
                },

                else => {},
            }
        }

        // Never emit an entry whose mapping was not actually parsed; doing so
        // would write uninitialized code points into the table.
        if (len < 2) return error.MalformedCaseFoldingLine;

        try writer.writeInt(u8, @intCast(len), endian);
        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
    }

    // End-of-table marker.
    try writer.writeInt(u16, 0, endian);
    try out_comp.flush();
}