From 8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0 Mon Sep 17 00:00:00 2001
From: Jose Colon Rodriguez
Date: Wed, 26 Jun 2024 12:08:08 -0400
Subject: Implemented sqeek502s case fold

---
 codegen/fold.zig | 289 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 192 insertions(+), 97 deletions(-)

(limited to 'codegen/fold.zig')

diff --git a/codegen/fold.zig b/codegen/fold.zig
index 6dc51ac..ec024c5 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -1,123 +1,218 @@
 const std = @import("std");
 const builtin = @import("builtin");
-const fmt = std.fmt;
-const mem = std.mem;
+
+// From https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt
+// const case_folding_txt = @embedFile("CaseFolding.txt");
 
 pub fn main() !void {
-    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
-    defer arena.deinit();
-    const allocator = arena.allocator();
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer std.debug.assert(gpa.deinit() == .ok);
+    const allocator = gpa.allocator();
+
+    // const unbuf_stdout = std.io.getStdOut().writer();
+    // var buf_stdout = std.io.bufferedWriter(unbuf_stdout);
+    // const writer = buf_stdout.writer();
 
-    // Process DerivedCoreProperties.txt
-    var cp_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{});
+    var codepoint_mapping = std.AutoArrayHashMap(u21, [3]u21).init(allocator);
+    defer codepoint_mapping.deinit();
+
+    // Process
+    var cp_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
     defer cp_file.close();
     var cp_buf = std.io.bufferedReader(cp_file.reader());
     const cp_reader = cp_buf.reader();
 
-    var cp_map = std.AutoHashMap(u21, void).init(allocator);
-    defer cp_map.deinit();
-
+    // var line_it = std.mem.tokenizeAny(u8, case_folding_txt, "\r\n");
     var line_buf: [4096]u8 = undefined;
 
-    cp_lines: while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
+    while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
+        // while (line_it.next()) |line| {
         if (line.len == 0 or line[0] == '#') continue;
 
-        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
-
-        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
-        var current_code: [2]u21 = undefined;
-
-        var i: usize = 0;
-        while (field_iter.next()) |field| : (i += 1) {
-            switch (i) {
-                0 => {
-                    // Code point(s)
-                    if (std.mem.indexOf(u8, field, "..")) |dots| {
-                        current_code = .{
-                            try std.fmt.parseInt(u21, field[0..dots], 16),
-                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
-                        };
-                    } else {
-                        const code = try std.fmt.parseInt(u21, field, 16);
-                        current_code = .{ code, code };
-                    }
-                },
-                1 => {
-                    // Core property
-                    if (!mem.eql(u8, field, "Changes_When_Casefolded")) continue :cp_lines;
-                    for (current_code[0]..current_code[1] + 1) |cp| try cp_map.put(@intCast(cp), {});
-                },
-                else => {},
-            }
+        var field_it = std.mem.splitScalar(u8, line, ';');
+        const codepoint_str = field_it.first();
+        const codepoint = try std.fmt.parseUnsigned(u21, codepoint_str, 16);
+
+        const status = std.mem.trim(u8, field_it.next() orelse continue, " ");
+        // Only interested in 'common' and 'full'
+        if (status[0] != 'C' and status[0] != 'F') continue;
+
+        const mapping = std.mem.trim(u8, field_it.next() orelse continue, " ");
+        var mapping_it = std.mem.splitScalar(u8, mapping, ' ');
+        var mapping_buf = [_]u21{0} ** 3;
+        var mapping_i: u8 = 0;
+        while (mapping_it.next()) |mapping_c| {
+            mapping_buf[mapping_i] = try std.fmt.parseInt(u21, mapping_c, 16);
+            mapping_i += 1;
         }
-    }
 
-    // Process CaseFolding.txt
-    var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
-    defer in_file.close();
-    var in_buf = std.io.bufferedReader(in_file.reader());
-    const in_reader = in_buf.reader();
+        try codepoint_mapping.putNoClobber(codepoint, mapping_buf);
+    }
 
-    var args_iter = try std.process.argsWithAllocator(allocator);
-    defer args_iter.deinit();
-    _ = args_iter.skip();
-    const output_path = args_iter.next() orelse @panic("No output file arg!");
+    var offset_to_index = std.AutoHashMap(i32, u8).init(allocator);
+    defer offset_to_index.deinit();
+    var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator);
+    defer unique_offsets.deinit();
+
+    // First pass
+    {
+        var it = codepoint_mapping.iterator();
+        while (it.next()) |entry| {
+            const codepoint = entry.key_ptr.*;
+            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
+            if (mappings.len == 1) {
+                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
+                const result = try unique_offsets.getOrPut(offset);
+                if (!result.found_existing) result.value_ptr.* = 0;
+                result.value_ptr.* += 1;
+            }
+        }
 
-    const compressor = std.compress.flate.deflate.compressor;
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
-    const writer = out_comp.writer();
+        // A codepoint mapping to itself (offset=0) is the most common case
+        try unique_offsets.put(0, 0x10FFFF);
+        const C = struct {
+            vals: []u32,
 
-    const endian = builtin.cpu.arch.endian();
+            pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool {
+                return ctx.vals[a_index] > ctx.vals[b_index];
+            }
+        };
+        unique_offsets.sort(C{ .vals = unique_offsets.values() });
+
+        var offset_it = unique_offsets.iterator();
+        var offset_index: u7 = 0;
+        while (offset_it.next()) |entry| {
+            try offset_to_index.put(entry.key_ptr.*, offset_index);
+            offset_index += 1;
+        }
+    }
 
-    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
-        if (line.len == 0 or line[0] == '#') continue;
+    var mappings_to_index = std.AutoArrayHashMap([3]u21, u8).init(allocator);
+    defer mappings_to_index.deinit();
+    var codepoint_to_index = std.AutoHashMap(u21, u8).init(allocator);
+    defer codepoint_to_index.deinit();
+
+    // Second pass
+    {
+        var count_multiple_codepoints: u8 = 0;
+
+        var it = codepoint_mapping.iterator();
+        while (it.next()) |entry| {
+            const codepoint = entry.key_ptr.*;
+            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
+            if (mappings.len > 1) {
+                const result = try mappings_to_index.getOrPut(entry.value_ptr.*);
+                if (!result.found_existing) {
+                    result.value_ptr.* = 0x80 | count_multiple_codepoints;
+                    count_multiple_codepoints += 1;
+                }
+                const index = result.value_ptr.*;
+                try codepoint_to_index.put(codepoint, index);
+            } else {
+                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
+                const index = offset_to_index.get(offset).?;
+                try codepoint_to_index.put(codepoint, index);
+            }
+        }
+    }
 
-        const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
-
-        var field_iter = mem.tokenizeSequence(u8, no_comment, "; ");
-        var cps: [4]u24 = undefined;
-        var len: usize = 2;
-
-        var i: usize = 0;
-        while (field_iter.next()) |field| : (i += 1) {
-            switch (i) {
-                0 => {
-                    var cp = try fmt.parseInt(u21, field, 16);
-                    cp <<= 1;
-                    if (cp_map.contains(cp)) cp |= 1;
-                    cps[0] = cp;
-                },
-
-                1 => {
-                    if (!mem.eql(u8, field, "C") and !mem.eql(u8, field, "F")) continue :lines;
-                    if (mem.eql(u8, field, "F")) len = 3;
-                },
-
-                2 => {
-                    if (len == 3) {
-                        // Full case fold
-                        // std.debug.print("-->{s} {s}\n", .{ line, field });
-                        var cp_iter = mem.tokenizeScalar(u8, field, ' ');
-                        len = 1;
-                        while (cp_iter.next()) |cp_str| : (len += 1) {
-                            cps[len] = try fmt.parseInt(u24, cp_str, 16);
-                        }
-                    } else {
-                        // Common case fold
-                        cps[1] = try fmt.parseInt(u24, field, 16);
-                    }
-                },
-
-                else => {},
+    // Build the stage1/stage2/stage3 arrays and output them
+    {
+        const Block = [256]u8;
+        var stage2_blocks = std.AutoArrayHashMap(Block, void).init(allocator);
+        defer stage2_blocks.deinit();
+
+        const empty_block: Block = [_]u8{0} ** 256;
+        try stage2_blocks.put(empty_block, {});
+        const stage1_len = (0x10FFFF / 256) + 1;
+        var stage1: [stage1_len]u8 = undefined;
+
+        var codepoint: u21 = 0;
+        var block: Block = undefined;
+        while (codepoint <= 0x10FFFF) {
+            const data_index = codepoint_to_index.get(codepoint) orelse 0;
+            block[codepoint % 256] = data_index;
+
+            codepoint += 1;
+            if (codepoint % 256 == 0) {
+                const result = try stage2_blocks.getOrPut(block);
+                const index = result.index;
+                stage1[(codepoint >> 8) - 1] = @intCast(index);
             }
         }
 
-        try writer.writeInt(u8, @intCast(len), endian);
-        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
+        const last_meaningful_block = std.mem.lastIndexOfNone(u8, &stage1, "\x00").?;
+        const meaningful_stage1 = stage1[0 .. last_meaningful_block + 1];
+        const codepoint_cutoff = (last_meaningful_block + 1) << 8;
+        const multiple_codepoint_start: usize = unique_offsets.count();
+
+        var index: usize = 0;
+        const stage3_elems = unique_offsets.count() + mappings_to_index.count() * 3;
+        var stage3 = try allocator.alloc(i24, stage3_elems);
+        defer allocator.free(stage3);
+        for (unique_offsets.keys()) |key| {
+            stage3[index] = @intCast(key);
+            index += 1;
+        }
+        for (mappings_to_index.keys()) |key| {
+            stage3[index] = @intCast(key[0]);
+            stage3[index + 1] = @intCast(key[1]);
+            stage3[index + 2] = @intCast(key[2]);
+            index += 3;
+        }
+
+        const stage2_elems = stage2_blocks.count() * 256;
+        var stage2 = try allocator.alloc(u8, stage2_elems);
+        defer allocator.free(stage2);
+        for (stage2_blocks.keys(), 0..) |key, i| {
+            @memcpy(stage2[i * 256 ..][0..256], &key);
+        }
+
+        // try writer.print("const cutoff = 0x{X};\n", .{codepoint_cutoff});
+        // try writeArray(writer, u8, "stage1", meaningful_stage1);
+        // try writeArray(writer, u8, "stage2", stage2);
+        // try writer.print("const multiple_start = {};\n", .{multiple_codepoint_start});
+        // try writeArray(writer, i24, "stage3", stage3);
+
+        var args_iter = try std.process.argsWithAllocator(allocator);
+        defer args_iter.deinit();
+        _ = args_iter.skip();
+        const output_path = args_iter.next() orelse @panic("No output file arg!");
+
+        const compressor = std.compress.flate.deflate.compressor;
+        var out_file = try std.fs.cwd().createFile(output_path, .{});
+        defer out_file.close();
+        var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
+        const writer = out_comp.writer();
+
+        const endian = builtin.cpu.arch.endian();
+
+        try writer.writeInt(u24, @intCast(codepoint_cutoff), endian);
+        try writer.writeInt(u24, @intCast(multiple_codepoint_start), endian);
+
+        try writer.writeInt(u16, @intCast(meaningful_stage1.len), endian);
+        try writer.writeAll(meaningful_stage1);
+
+        try writer.writeInt(u16, @intCast(stage2.len), endian);
+        try writer.writeAll(stage2);
+
+        try writer.writeInt(u16, @intCast(stage3.len), endian);
+        for (stage3) |offset| try writer.writeInt(i24, offset, endian);
+
+        try out_comp.flush();
     }
 
-    try writer.writeInt(u16, 0, endian);
-    try out_comp.flush();
+    // try buf_stdout.flush();
 }
+
+// fn writeArray(writer: anytype, comptime T: type, name: []const u8, data: []const T) !void {
+//     try writer.print("const {s} = [{}]{s}{{", .{ name, data.len, @typeName(T) });
+//
+//     for (data, 0..) |v, i| {
+//         if (i % 32 == 0) try writer.writeAll("\n    ");
+//         try writer.print("{},", .{v});
+//         if (i != data.len - 1) try writer.writeByte(' ');
+//     }
+//
+//     try writer.writeAll("\n};\n");
+// }
-- 
cgit v1.2.3