diff options
| author | 2024-03-01 18:51:43 -0400 | |
|---|---|---|
| committer | 2024-03-01 18:51:43 -0400 | |
| commit | 9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch) | |
| tree | 723760b45ef8ef604b235d10c3c60edfadd0bb70 /codegen | |
| parent | Removed dupe tombstone check in Normalizer (diff) | |
| download | zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip | |
Normalizer.eqlIgnoreCase compatibility caseless matching
Diffstat (limited to 'codegen')
| -rw-r--r-- | codegen/fold.zig | 76 |
1 files changed, 76 insertions, 0 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig new file mode 100644 index 0000000..7977e61 --- /dev/null +++ b/codegen/fold.zig | |||
| @@ -0,0 +1,76 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const fmt = std.fmt; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
/// Converts Unicode's `CaseFolding.txt` into a compact, deflate-compressed
/// binary table.
///
/// Input is read from `data/unicode/CaseFolding.txt`, relative to the current
/// working directory. The output path is taken from the first command-line
/// argument; the program panics if it is missing.
///
/// Only entries with status `C` (common) or `F` (full) are emitted; every
/// other status is skipped. Each emitted record is:
///
///   * one `u8` holding the number of code points that follow (2 to 4),
///   * that many `u24` values: the source code point, then its folding.
///
/// The stream ends with a zero `u16`. Integers are written using the
/// endianness of the CPU this generator was compiled for.
///
/// Returns `error.InvalidCaseFolding` when a `C`/`F` row has no mapping, when
/// a `C` row maps to anything other than exactly one code point, or when a
/// mapping does not fit in the record. I/O, allocation, and integer parsing
/// errors are propagated unchanged.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Process CaseFolding.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    const endian = builtin.cpu.arch.endian();
    var line_buf: [4096]u8 = undefined;

    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |raw_line| {
        // Strip surrounding whitespace so CRLF line endings and
        // whitespace-only lines are treated like plain blank lines.
        const line = mem.trim(u8, raw_line, " \t\r");
        if (line.len == 0 or line[0] == '#') continue;

        const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        // Fields are `code; status; mapping;`. Split on ';' and trim each
        // field so the exact spacing around the separator does not matter.
        var field_iter = mem.tokenizeScalar(u8, no_comment, ';');

        // cps[0] is the source code point, cps[1..len] is its folding.
        var cps: [4]u24 = undefined;
        // Stays 0 until the mapping field has been parsed, which lets the
        // check after the loop detect rows that are missing it.
        var len: usize = 0;
        var is_full = false;

        var i: usize = 0;
        while (field_iter.next()) |raw_field| : (i += 1) {
            const field = mem.trim(u8, raw_field, " \t");

            switch (i) {
                0 => cps[0] = try fmt.parseInt(u24, field, 16),

                1 => {
                    is_full = mem.eql(u8, field, "F");
                    // Anything that is neither full nor common is not emitted.
                    if (!is_full and !mem.eql(u8, field, "C")) continue :lines;
                },

                2 => {
                    var cp_iter = mem.tokenizeScalar(u8, field, ' ');
                    len = 1;
                    while (cp_iter.next()) |cp_str| : (len += 1) {
                        // Refuse to write past the fixed-size record buffer.
                        if (len == cps.len) return error.InvalidCaseFolding;
                        cps[len] = try fmt.parseInt(u24, cp_str, 16);
                    }

                    // A common fold maps to exactly one code point.
                    if (!is_full and len != 2) return error.InvalidCaseFolding;
                },

                else => {},
            }
        }

        // Never emit a record whose mapping was absent or empty; doing so
        // would serialize uninitialized memory.
        if (len < 2) return error.InvalidCaseFolding;

        try writer.writeInt(u8, @intCast(len), endian);
        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
    }

    // End-of-table marker.
    try writer.writeInt(u16, 0, endian);
    try out_comp.flush();
}