FoldData: Minimize Changes_When_Casefolded data

Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. So, FoldData can just store a list of those particular codepoints and then re-use the encoded CaseFolding.txt data alongside it in order to implement changesWhenCaseFolded.

This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB).

This also seems to have introduced a very slight performance regression in zg_caseless.

Before:
    zg CaseFold.compatCaselessMatch: result: 626, took: 258ns
    zg CaseFold.canonCaselessMatch: result: 626, took: 129ns

After:
    zg CaseFold.compatCaselessMatch: result: 626, took: 263ns
    zg CaseFold.canonCaselessMatch: result: 626, took: 131ns
author: Ryan Liptak 2024-06-27 02:33:06 -0700
committer: Ryan Liptak 2024-06-27 02:33:51 -0700
commit: bd7c0cf2998b626879e147e4cec2b30f71015631 (patch)
tree: 9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 /codegen
parent: Implements new case fold data encoding by @sqeek502 #8 (diff)
download: zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz
zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz
zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip
1 files changed, 18 insertions, 3 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 53ed3c4..cb73cca 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -83,6 +83,19 @@ pub fn main() !void {
        try codepoint_mapping.putNoClobber(codepoint, mapping_buf);
    }
+    var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator);
+    defer changes_when_casefolded_exceptions.deinit();
+    {
+        // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property,
+        // but not vice versa.
+        for (codepoint_mapping.keys()) |codepoint| {
+            if (props_map.get(codepoint) == null) {
+                try changes_when_casefolded_exceptions.append(codepoint);
+            }
+        }
+    }
    var offset_to_index = std.AutoHashMap(i32, u8).init(allocator);
    defer offset_to_index.deinit();
    var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator);
@@ -228,9 +241,11 @@ pub fn main() !void {
        try writer.writeInt(u16, @intCast(stage3.len), endian);
        for (stage3) |offset| try writer.writeInt(i24, offset, endian);
        // Changes when case folded
-        try writer.writeInt(u16, @intCast(props_map.count()), endian);
+        // Min and max
-        var iter = props_map.keyIterator();
+        try writer.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian);
-        while (iter.next()) |key_ptr| try writer.writeInt(u24, key_ptr.*, endian);
+        try writer.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian);
+        try writer.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian);
+        for (changes_when_casefolded_exceptions.items) |cp| try writer.writeInt(u24, cp, endian);
        try out_comp.flush();
    }
author	Ryan Liptak	2024-06-27 02:33:06 -0700
committer	Ryan Liptak	2024-06-27 02:33:51 -0700
commit	bd7c0cf2998b626879e147e4cec2b30f71015631 (patch)
tree	9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 /codegen
parent	Implements new case fold data encoding by @sqeek502 #8 (diff)
download	zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip