diff options
| author | 2024-06-27 02:33:06 -0700 | |
|---|---|---|
| committer | 2024-06-27 02:33:51 -0700 | |
| commit | bd7c0cf2998b626879e147e4cec2b30f71015631 (patch) | |
| tree | 9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 | |
| parent | Implements new case fold data encoding by @sqeek502 #8 (diff) | |
| download | zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip | |
FoldData: Minimize Changes_When_Casefolded data
Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. So, FoldData can just store a list of those particular codepoints and then re-use the encoded CaseFolding.txt data alongside it in order to implement changesWhenCaseFolded.
This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB).
This also seemingly introduced a very slight performance regression in zg_caseless.
Before:
zg CaseFold.compatCaselessMatch: result: 626, took: 258ns
zg CaseFold.canonCaselessMatch: result: 626, took: 129ns
After:
zg CaseFold.compatCaselessMatch: result: 626, took: 263ns
zg CaseFold.canonCaselessMatch: result: 626, took: 131ns
| -rw-r--r-- | codegen/fold.zig | 21 | ||||
| -rw-r--r-- | src/FoldData.zig | 21 |
2 files changed, 34 insertions, 8 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig index 53ed3c4..cb73cca 100644 --- a/codegen/fold.zig +++ b/codegen/fold.zig | |||
| @@ -83,6 +83,19 @@ pub fn main() !void { | |||
| 83 | try codepoint_mapping.putNoClobber(codepoint, mapping_buf); | 83 | try codepoint_mapping.putNoClobber(codepoint, mapping_buf); |
| 84 | } | 84 | } |
| 85 | 85 | ||
| 86 | var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator); | ||
| 87 | defer changes_when_casefolded_exceptions.deinit(); | ||
| 88 | |||
| 89 | { | ||
| 90 | // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property, | ||
| 91 | // but not vice versa. | ||
| 92 | for (codepoint_mapping.keys()) |codepoint| { | ||
| 93 | if (props_map.get(codepoint) == null) { | ||
| 94 | try changes_when_casefolded_exceptions.append(codepoint); | ||
| 95 | } | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
| 86 | var offset_to_index = std.AutoHashMap(i32, u8).init(allocator); | 99 | var offset_to_index = std.AutoHashMap(i32, u8).init(allocator); |
| 87 | defer offset_to_index.deinit(); | 100 | defer offset_to_index.deinit(); |
| 88 | var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator); | 101 | var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator); |
| @@ -228,9 +241,11 @@ pub fn main() !void { | |||
| 228 | try writer.writeInt(u16, @intCast(stage3.len), endian); | 241 | try writer.writeInt(u16, @intCast(stage3.len), endian); |
| 229 | for (stage3) |offset| try writer.writeInt(i24, offset, endian); | 242 | for (stage3) |offset| try writer.writeInt(i24, offset, endian); |
| 230 | // Changes when case folded | 243 | // Changes when case folded |
| 231 | try writer.writeInt(u16, @intCast(props_map.count()), endian); | 244 | // Min and max |
| 232 | var iter = props_map.keyIterator(); | 245 | try writer.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian); |
| 233 | while (iter.next()) |key_ptr| try writer.writeInt(u24, key_ptr.*, endian); | 246 | try writer.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian); |
| 247 | try writer.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian); | ||
| 248 | for (changes_when_casefolded_exceptions.items) |cp| try writer.writeInt(u24, cp, endian); | ||
| 234 | 249 | ||
| 235 | try out_comp.flush(); | 250 | try out_comp.flush(); |
| 236 | } | 251 | } |
diff --git a/src/FoldData.zig b/src/FoldData.zig index b7bbbd1..d425178 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig | |||
| @@ -3,11 +3,11 @@ const builtin = @import("builtin"); | |||
| 3 | const compress = std.compress; | 3 | const compress = std.compress; |
| 4 | const mem = std.mem; | 4 | const mem = std.mem; |
| 5 | 5 | ||
| 6 | const cwcf_max = 0x1e950; | ||
| 7 | |||
| 8 | allocator: mem.Allocator, | 6 | allocator: mem.Allocator, |
| 9 | cutoff: u21 = undefined, | 7 | cutoff: u21 = undefined, |
| 10 | cwcf: [cwcf_max]bool = [_]bool{false} ** cwcf_max, | 8 | cwcf_exceptions_min: u21 = undefined, |
| 9 | cwcf_exceptions_max: u21 = undefined, | ||
| 10 | cwcf_exceptions: []u21 = undefined, | ||
| 11 | multiple_start: u21 = undefined, | 11 | multiple_start: u21 = undefined, |
| 12 | stage1: []u8 = undefined, | 12 | stage1: []u8 = undefined, |
| 13 | stage2: []u8 = undefined, | 13 | stage2: []u8 = undefined, |
| @@ -43,8 +43,11 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 43 | errdefer allocator.free(self.stage3); | 43 | errdefer allocator.free(self.stage3); |
| 44 | for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); | 44 | for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); |
| 45 | 45 | ||
| 46 | self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | ||
| 47 | self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | ||
| 46 | len = try reader.readInt(u16, endian); | 48 | len = try reader.readInt(u16, endian); |
| 47 | for (0..len) |_| self.cwcf[try reader.readInt(u24, endian)] = true; | 49 | self.cwcf_exceptions = try allocator.alloc(u21, len); |
| 50 | for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 48 | 51 | ||
| 49 | return self; | 52 | return self; |
| 50 | } | 53 | } |
| @@ -83,5 +86,13 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { | |||
| 83 | 86 | ||
| 84 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | 87 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). |
| 85 | pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { | 88 | pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { |
| 86 | return cp < cwcf_max and self.cwcf[cp]; | 89 | var buf: [3]u21 = undefined; |
| 90 | const has_mapping = self.caseFold(cp, &buf).len != 0; | ||
| 91 | return has_mapping and !self.isCwcfException(cp); | ||
| 92 | } | ||
| 93 | |||
| 94 | fn isCwcfException(self: Self, cp: u21) bool { | ||
| 95 | return cp >= self.cwcf_exceptions_min and | ||
| 96 | cp <= self.cwcf_exceptions_max and | ||
| 97 | std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null; | ||
| 87 | } | 98 | } |