From bd7c0cf2998b626879e147e4cec2b30f71015631 Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Thu, 27 Jun 2024 02:33:06 -0700 Subject: FoldData: Minimize Changes_When_Casefolded data Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. So, FoldData can just store a list of those particular codepoints and then re-use the encoded CaseFolding.txt data alongside it in order to implement changesWhenCaseFolded. This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB). This also seemingly introduced a very slight performance regression in zg_caseless. Before: zg CaseFold.compatCaselessMatch: result: 626, took: 258ns zg CaseFold.canonCaselessMatch: result: 626, took: 129ns After: zg CaseFold.compatCaselessMatch: result: 626, took: 263ns zg CaseFold.canonCaselessMatch: result: 626, took: 131ns --- codegen/fold.zig | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'codegen') diff --git a/codegen/fold.zig b/codegen/fold.zig index 53ed3c4..cb73cca 100644 --- a/codegen/fold.zig +++ b/codegen/fold.zig @@ -83,6 +83,19 @@ pub fn main() !void { try codepoint_mapping.putNoClobber(codepoint, mapping_buf); } + var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator); + defer changes_when_casefolded_exceptions.deinit(); + + { + // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property, + // but not vice versa. + for (codepoint_mapping.keys()) |codepoint| { + if (props_map.get(codepoint) == null) { + try changes_when_casefolded_exceptions.append(codepoint); + } + } + } + var offset_to_index = std.AutoHashMap(i32, u8).init(allocator); defer offset_to_index.deinit(); var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator); @@ -228,9 +241,11 @@ pub fn main() !void { try writer.writeInt(u16, @intCast(stage3.len), endian); for (stage3) |offset| try writer.writeInt(i24, offset, endian); // Changes when case folded - try writer.writeInt(u16, @intCast(props_map.count()), endian); - var iter = props_map.keyIterator(); - while (iter.next()) |key_ptr| try writer.writeInt(u24, key_ptr.*, endian); + // Min and max + try writer.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian); + try writer.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian); + try writer.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian); + for (changes_when_casefolded_exceptions.items) |cp| try writer.writeInt(u24, cp, endian); try out_comp.flush(); } -- cgit v1.2.3