diff options
| author | 2024-06-27 02:33:06 -0700 | |
|---|---|---|
| committer | 2024-06-27 02:33:51 -0700 | |
| commit | bd7c0cf2998b626879e147e4cec2b30f71015631 (patch) | |
| tree | 9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 /src | |
| parent | Implements new case fold data encoding by @sqeek502 #8 (diff) | |
| download | zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip | |
FoldData: Minimize Changes_When_Casefolded data
Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. So, FoldData can just store a list of those particular codepoints and then re-use the encoded CaseFolding.txt data alongside it in order to implement changesWhenCaseFolded.
This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB).
This also seemingly introduced a very slight performance regression in zg_caseless.
Before:
zg CaseFold.compatCaselessMatch: result: 626, took: 258ns
zg CaseFold.canonCaselessMatch: result: 626, took: 129ns
After:
zg CaseFold.compatCaselessMatch: result: 626, took: 263ns
zg CaseFold.canonCaselessMatch: result: 626, took: 131ns
Diffstat (limited to 'src')
| -rw-r--r-- | src/FoldData.zig | 21 |
1 files changed, 16 insertions, 5 deletions
diff --git a/src/FoldData.zig b/src/FoldData.zig index b7bbbd1..d425178 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig | |||
| @@ -3,11 +3,11 @@ const builtin = @import("builtin"); | |||
| 3 | const compress = std.compress; | 3 | const compress = std.compress; |
| 4 | const mem = std.mem; | 4 | const mem = std.mem; |
| 5 | 5 | ||
| 6 | const cwcf_max = 0x1e950; | ||
| 7 | |||
| 8 | allocator: mem.Allocator, | 6 | allocator: mem.Allocator, |
| 9 | cutoff: u21 = undefined, | 7 | cutoff: u21 = undefined, |
| 10 | cwcf: [cwcf_max]bool = [_]bool{false} ** cwcf_max, | 8 | cwcf_exceptions_min: u21 = undefined, |
| 9 | cwcf_exceptions_max: u21 = undefined, | ||
| 10 | cwcf_exceptions: []u21 = undefined, | ||
| 11 | multiple_start: u21 = undefined, | 11 | multiple_start: u21 = undefined, |
| 12 | stage1: []u8 = undefined, | 12 | stage1: []u8 = undefined, |
| 13 | stage2: []u8 = undefined, | 13 | stage2: []u8 = undefined, |
| @@ -43,8 +43,11 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 43 | errdefer allocator.free(self.stage3); | 43 | errdefer allocator.free(self.stage3); |
| 44 | for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); | 44 | for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); |
| 45 | 45 | ||
| 46 | self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | ||
| 47 | self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | ||
| 46 | len = try reader.readInt(u16, endian); | 48 | len = try reader.readInt(u16, endian); |
| 47 | for (0..len) |_| self.cwcf[try reader.readInt(u24, endian)] = true; | 49 | self.cwcf_exceptions = try allocator.alloc(u21, len); |
| 50 | for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 48 | 51 | ||
| 49 | return self; | 52 | return self; |
| 50 | } | 53 | } |
| @@ -83,5 +86,13 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { | |||
| 83 | 86 | ||
| 84 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | 87 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). |
| 85 | pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { | 88 | pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { |
| 86 | return cp < cwcf_max and self.cwcf[cp]; | 89 | var buf: [3]u21 = undefined; |
| 90 | const has_mapping = self.caseFold(cp, &buf).len != 0; | ||
| 91 | return has_mapping and !self.isCwcfException(cp); | ||
| 92 | } | ||
| 93 | |||
| 94 | fn isCwcfException(self: Self, cp: u21) bool { | ||
| 95 | return cp >= self.cwcf_exceptions_min and | ||
| 96 | cp <= self.cwcf_exceptions_max and | ||
| 97 | std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null; | ||
| 87 | } | 98 | } |