diff options
| author | 2024-06-27 02:33:06 -0700 | |
|---|---|---|
| committer | 2024-06-27 02:33:51 -0700 | |
| commit | bd7c0cf2998b626879e147e4cec2b30f71015631 (patch) | |
| tree | 9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 /codegen | |
| parent | Implements new case fold data encoding by @sqeek502 #8 (diff) | |
| download | zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip | |
FoldData: Minimize Changes_When_Casefolded data
Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. FoldData can therefore store just the list of those exception codepoints and reuse the encoded CaseFolding.txt data alongside it to implement changesWhenCaseFolded.
This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB).
This also seemingly introduced a very slight performance regression in zg_caseless.
Before:
zg CaseFold.compatCaselessMatch: result: 626, took: 258ns
zg CaseFold.canonCaselessMatch: result: 626, took: 129ns
After:
zg CaseFold.compatCaselessMatch: result: 626, took: 263ns
zg CaseFold.canonCaselessMatch: result: 626, took: 131ns
Diffstat (limited to 'codegen')
| -rw-r--r-- | codegen/fold.zig | 21 |
1 file changed, 18 insertions, 3 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 53ed3c4..cb73cca 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
| @@ -83,6 +83,19 @@ pub fn main() !void { | |||
| 83 | try codepoint_mapping.putNoClobber(codepoint, mapping_buf); | 83 | try codepoint_mapping.putNoClobber(codepoint, mapping_buf); |
| 84 | } | 84 | } |
| 85 | 85 | ||
| 86 | var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator); | ||
| 87 | defer changes_when_casefolded_exceptions.deinit(); | ||
| 88 | |||
| 89 | { | ||
| 90 | // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property, | ||
| 91 | // but not vice versa. | ||
| 92 | for (codepoint_mapping.keys()) |codepoint| { | ||
| 93 | if (props_map.get(codepoint) == null) { | ||
| 94 | try changes_when_casefolded_exceptions.append(codepoint); | ||
| 95 | } | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
| 86 | var offset_to_index = std.AutoHashMap(i32, u8).init(allocator); | 99 | var offset_to_index = std.AutoHashMap(i32, u8).init(allocator); |
| 87 | defer offset_to_index.deinit(); | 100 | defer offset_to_index.deinit(); |
| 88 | var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator); | 101 | var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator); |
| @@ -228,9 +241,11 @@ pub fn main() !void { | |||
| 228 | try writer.writeInt(u16, @intCast(stage3.len), endian); | 241 | try writer.writeInt(u16, @intCast(stage3.len), endian); |
| 229 | for (stage3) |offset| try writer.writeInt(i24, offset, endian); | 242 | for (stage3) |offset| try writer.writeInt(i24, offset, endian); |
| 230 | // Changes when case folded | 243 | // Changes when case folded |
| 231 | try writer.writeInt(u16, @intCast(props_map.count()), endian); | 244 | // Min and max |
| 232 | var iter = props_map.keyIterator(); | 245 | try writer.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian); |
| 233 | while (iter.next()) |key_ptr| try writer.writeInt(u24, key_ptr.*, endian); | 246 | try writer.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian); |
| 247 | try writer.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian); | ||
| 248 | for (changes_when_casefolded_exceptions.items) |cp| try writer.writeInt(u24, cp, endian); | ||
| 234 | 249 | ||
| 235 | try out_comp.flush(); | 250 | try out_comp.flush(); |
| 236 | } | 251 | } |