//! codegen/fold.zig
//!
//! Generator for the compressed Unicode case-folding data table.

const std = @import("std");
const builtin = @import("builtin");
const mem = std.mem;

/// Generates the compressed case-folding lookup tables.
///
/// Reads `data/unicode/DerivedCoreProperties.txt` and
/// `data/unicode/CaseFolding.txt` (relative to the current working directory),
/// builds a three-stage lookup table from them, and writes the result as a raw
/// deflate stream to the path given as the first command-line argument.
///
/// Output layout, in order (integers use the byte order of the CPU
/// architecture this generator is compiled for):
///   - u24: codepoint cutoff (first codepoint past the last non-empty block)
///   - u24: stage 3 index at which the multi-codepoint mappings start
///   - u16 length, then the stage 1 bytes
///   - u16 length, then the stage 2 bytes
///   - u16 length, then the stage 3 entries as i24
///   - u24 min, u24 max, u16 count, then the u24 codepoints that have a case
///     fold mapping but lack the Changes_When_Casefolded property
///
/// Errors: I/O, allocation and number-parsing failures are propagated.
/// Malformed or unexpectedly large input yields
/// `error.TooManyMappingCodepoints`, `error.TooManyUniqueOffsets`,
/// `error.TooManyMultiCodepointMappings` or `error.NoCaseFoldMappings`.
/// Panics if no output path argument is supplied.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer std.debug.assert(gpa.deinit() == .ok);
    const allocator = gpa.allocator();

    // Process DerivedCoreProperties.txt
    var props_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{});
    defer props_file.close();
    var props_buf = std.io.bufferedReader(props_file.reader());
    const props_reader = props_buf.reader();

    // Set of codepoints carrying the Changes_When_Casefolded property.
    var props_map = std.AutoHashMap(u21, void).init(allocator);
    defer props_map.deinit();

    // Shared line buffer for both input files.
    var line_buf: [4096]u8 = undefined;

    props_lines: while (try props_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0 or line[0] == '#') continue;

        // Strip a trailing comment, if any.
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
        // Inclusive [first, last] codepoint range of the current line.
        var current_code: [2]u21 = undefined;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    // Code point(s): either `XXXX` or `XXXX..YYYY`.
                    if (std.mem.indexOf(u8, field, "..")) |dots| {
                        current_code = .{
                            try std.fmt.parseInt(u21, field[0..dots], 16),
                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
                        };
                    } else {
                        const code = try std.fmt.parseInt(u21, field, 16);
                        current_code = .{ code, code };
                    }
                },
                1 => {
                    // Core property: only Changes_When_Casefolded is of interest.
                    if (!mem.eql(u8, field, "Changes_When_Casefolded")) continue :props_lines;
                    for (current_code[0]..current_code[1] + 1) |cp| try props_map.put(@intCast(cp), {});
                },
                else => {},
            }
        }
    }

    // Codepoint -> up to three folded codepoints, zero-padded.
    var codepoint_mapping = std.AutoArrayHashMap(u21, [3]u21).init(allocator);
    defer codepoint_mapping.deinit();

    // Process CaseFolding.txt
    var cp_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
    defer cp_file.close();
    var cp_buf = std.io.bufferedReader(cp_file.reader());
    const cp_reader = cp_buf.reader();

    while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0 or line[0] == '#') continue;

        var field_it = std.mem.splitScalar(u8, line, ';');
        const codepoint_str = field_it.first();
        const codepoint = try std.fmt.parseUnsigned(u21, codepoint_str, 16);

        const status = std.mem.trim(u8, field_it.next() orelse continue, " ");
        // Only interested in 'common' and 'full'. An empty status field is
        // skipped as well instead of being indexed out of bounds.
        if (status.len == 0 or (status[0] != 'C' and status[0] != 'F')) continue;

        const mapping = std.mem.trim(u8, field_it.next() orelse continue, " ");
        var mapping_it = std.mem.splitScalar(u8, mapping, ' ');
        var mapping_buf = [_]u21{0} ** 3;
        var mapping_i: u8 = 0;
        while (mapping_it.next()) |mapping_c| {
            // The table format has room for at most three codepoints per mapping.
            if (mapping_i >= mapping_buf.len) return error.TooManyMappingCodepoints;
            mapping_buf[mapping_i] = try std.fmt.parseInt(u21, mapping_c, 16);
            mapping_i += 1;
        }

        try codepoint_mapping.putNoClobber(codepoint, mapping_buf);
    }

    var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator);
    defer changes_when_casefolded_exceptions.deinit();

    {
        // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property,
        // but not vice versa.
        for (codepoint_mapping.keys()) |codepoint| {
            if (!props_map.contains(codepoint)) {
                try changes_when_casefolded_exceptions.append(codepoint);
            }
        }
    }

    // Offset (folded - original) -> stage 3 index, for single-codepoint mappings.
    var offset_to_index = std.AutoHashMap(i32, u8).init(allocator);
    defer offset_to_index.deinit();
    // Offset -> number of codepoints using it; insertion-ordered so it can be sorted.
    var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator);
    defer unique_offsets.deinit();

    // First pass: count how often each single-codepoint offset occurs.
    {
        var it = codepoint_mapping.iterator();
        while (it.next()) |entry| {
            const codepoint = entry.key_ptr.*;
            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
            if (mappings.len == 1) {
                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
                const result = try unique_offsets.getOrPut(offset);
                if (!result.found_existing) result.value_ptr.* = 0;
                result.value_ptr.* += 1;
            }
        }

        // A codepoint mapping to itself (offset=0) is the most common case
        try unique_offsets.put(0, 0x10FFFF);
        // Sort by descending count, so offset 0 ends up at index 0.
        const C = struct {
            vals: []u32,

            pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool {
                return ctx.vals[a_index] > ctx.vals[b_index];
            }
        };
        unique_offsets.sort(C{ .vals = unique_offsets.values() });

        var offset_it = unique_offsets.iterator();
        var offset_index: u8 = 0;
        while (offset_it.next()) |entry| {
            // Bit 0x80 flags multi-codepoint mappings, so offset indices must stay below it.
            if (offset_index >= 0x80) return error.TooManyUniqueOffsets;
            try offset_to_index.put(entry.key_ptr.*, offset_index);
            offset_index += 1;
        }
    }

    // Multi-codepoint mapping -> flagged index (0x80 | ordinal).
    var mappings_to_index = std.AutoArrayHashMap([3]u21, u8).init(allocator);
    defer mappings_to_index.deinit();
    // Codepoint -> index byte stored in stage 2.
    var codepoint_to_index = std.AutoHashMap(u21, u8).init(allocator);
    defer codepoint_to_index.deinit();

    // Second pass: assign every mapped codepoint its index byte.
    {
        var count_multiple_codepoints: u8 = 0;

        var it = codepoint_mapping.iterator();
        while (it.next()) |entry| {
            const codepoint = entry.key_ptr.*;
            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
            if (mappings.len > 1) {
                const result = try mappings_to_index.getOrPut(entry.value_ptr.*);
                if (!result.found_existing) {
                    // Only seven bits are available below the 0x80 flag; a larger
                    // ordinal would silently alias an earlier entry.
                    if (count_multiple_codepoints >= 0x80) return error.TooManyMultiCodepointMappings;
                    result.value_ptr.* = 0x80 | count_multiple_codepoints;
                    count_multiple_codepoints += 1;
                }
                const index = result.value_ptr.*;
                try codepoint_to_index.put(codepoint, index);
            } else {
                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
                const index = offset_to_index.get(offset).?;
                try codepoint_to_index.put(codepoint, index);
            }
        }
    }

    // Build the stage1/stage2/stage3 arrays and output them
    {
        const Block = [256]u8;
        // Deduplicated 256-entry blocks; insertion order defines the stage 1 values.
        var stage2_blocks = std.AutoArrayHashMap(Block, void).init(allocator);
        defer stage2_blocks.deinit();

        // Register the all-zero block first so that it gets stage 1 value 0.
        const empty_block: Block = [_]u8{0} ** 256;
        try stage2_blocks.put(empty_block, {});
        const stage1_len = (0x10FFFF / 256) + 1;
        var stage1: [stage1_len]u8 = undefined;

        var codepoint: u21 = 0;
        var block: Block = undefined;
        while (codepoint <= 0x10FFFF) {
            const data_index = codepoint_to_index.get(codepoint) orelse 0;
            block[codepoint % 256] = data_index;

            codepoint += 1;
            // A block is complete once the incremented codepoint is a multiple of 256.
            if (codepoint % 256 == 0) {
                const result = try stage2_blocks.getOrPut(block);
                const index = result.index;
                stage1[(codepoint >> 8) - 1] = @intCast(index);
            }
        }

        // Trailing stage 1 entries that point at the empty block are dropped.
        const last_meaningful_block = std.mem.lastIndexOfNone(u8, &stage1, "\x00") orelse
            return error.NoCaseFoldMappings;
        const meaningful_stage1 = stage1[0 .. last_meaningful_block + 1];
        const codepoint_cutoff = (last_meaningful_block + 1) << 8;
        const multiple_codepoint_start: usize = unique_offsets.count();

        // Stage 3: all single-codepoint offsets, then three entries per multi-codepoint mapping.
        var index: usize = 0;
        const stage3_elems = unique_offsets.count() + mappings_to_index.count() * 3;
        const stage3 = try allocator.alloc(i24, stage3_elems);
        defer allocator.free(stage3);
        for (unique_offsets.keys()) |key| {
            stage3[index] = @intCast(key);
            index += 1;
        }
        for (mappings_to_index.keys()) |key| {
            stage3[index] = @intCast(key[0]);
            stage3[index + 1] = @intCast(key[1]);
            stage3[index + 2] = @intCast(key[2]);
            index += 3;
        }

        // Stage 2: the unique blocks laid out back to back.
        const stage2_elems = stage2_blocks.count() * 256;
        const stage2 = try allocator.alloc(u8, stage2_elems);
        defer allocator.free(stage2);
        for (stage2_blocks.keys(), 0..) |key, i| {
            @memcpy(stage2[i * 256 ..][0..256], &key);
        }

        // Write out compressed binary data file.
        var args_iter = try std.process.argsWithAllocator(allocator);
        defer args_iter.deinit();
        _ = args_iter.skip();
        const output_path = args_iter.next() orelse @panic("No output file arg!");

        const compressor = std.compress.flate.deflate.compressor;
        var out_file = try std.fs.cwd().createFile(output_path, .{});
        defer out_file.close();
        var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
        const writer = out_comp.writer();

        const endian = builtin.cpu.arch.endian();
        // Table metadata.
        try writer.writeInt(u24, @intCast(codepoint_cutoff), endian);
        try writer.writeInt(u24, @intCast(multiple_codepoint_start), endian);
        // Stage 1
        try writer.writeInt(u16, @intCast(meaningful_stage1.len), endian);
        try writer.writeAll(meaningful_stage1);
        // Stage 2
        try writer.writeInt(u16, @intCast(stage2.len), endian);
        try writer.writeAll(stage2);
        // Stage 3
        try writer.writeInt(u16, @intCast(stage3.len), endian);
        for (stage3) |offset| try writer.writeInt(i24, offset, endian);
        // Changes when case folded
        // Min and max. std.mem.min/max require a non-empty slice, so an empty
        // exception list is written as the range 0..0 with zero entries.
        const exceptions = changes_when_casefolded_exceptions.items;
        const exceptions_min: u21 = if (exceptions.len == 0) 0 else std.mem.min(u21, exceptions);
        const exceptions_max: u21 = if (exceptions.len == 0) 0 else std.mem.max(u21, exceptions);
        try writer.writeInt(u24, exceptions_min, endian);
        try writer.writeInt(u24, exceptions_max, endian);
        try writer.writeInt(u16, @intCast(exceptions.len), endian);
        for (exceptions) |cp| try writer.writeInt(u24, cp, endian);

        // finish() emits the pending data as the final deflate block, so the
        // stream is properly terminated; flush() alone never sets the final bit.
        try out_comp.finish();
    }
}