FoldData: Minimize Changes_When_Casefolded data

Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. So, FoldData can just store a list of those particular codepoints and then re-use the encoded CaseFolding.txt data alongside it in order to implement changesWhenCaseFolded.

This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB).

This also seemingly introduces a very slight performance regression in zg_caseless.

Before:
    zg CaseFold.compatCaselessMatch: result: 626, took: 258ns
    zg CaseFold.canonCaselessMatch: result: 626, took: 129ns

After:
    zg CaseFold.compatCaselessMatch: result: 626, took: 263ns
    zg CaseFold.canonCaselessMatch: result: 626, took: 131ns
author: Ryan Liptak 2024-06-27 02:33:06 -0700
committer: Ryan Liptak 2024-06-27 02:33:51 -0700
commit: bd7c0cf2998b626879e147e4cec2b30f71015631 (patch)
tree: 9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 /src/FoldData.zig
parent: Implements new case fold data encoding by @sqeek502 #8 (diff)
download: zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz
zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz
zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip
1 files changed, 16 insertions, 5 deletions
diff --git a/src/FoldData.zig b/src/FoldData.zig
index b7bbbd1..d425178 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -3,11 +3,11 @@ const builtin = @import("builtin");
 const compress = std.compress;
 const mem = std.mem;
-const cwcf_max = 0x1e950;
 allocator: mem.Allocator,
 cutoff: u21 = undefined,
-cwcf: [cwcf_max]bool = [_]bool{false} ** cwcf_max,
+cwcf_exceptions_min: u21 = undefined,
+cwcf_exceptions_max: u21 = undefined,
+cwcf_exceptions: []u21 = undefined,
 multiple_start: u21 = undefined,
 stage1: []u8 = undefined,
 stage2: []u8 = undefined,
@@ -43,8 +43,11 @@ pub fn init(allocator: mem.Allocator) !Self {
    errdefer allocator.free(self.stage3);
    for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);
+    self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
+    self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
    len = try reader.readInt(u16, endian);
-    for (0..len) |_| self.cwcf[try reader.readInt(u24, endian)] = true;
+    self.cwcf_exceptions = try allocator.alloc(u21, len);
+    for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
    return self;
 }
@@ -83,5 +86,13 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
 /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
 pub fn changesWhenCaseFolded(self: Self, cp: u21) bool {
-    return cp < cwcf_max and self.cwcf[cp];
+    var buf: [3]u21 = undefined;
+    const has_mapping = self.caseFold(cp, &buf).len != 0;
+    return has_mapping and !self.isCwcfException(cp);
+}
+fn isCwcfException(self: Self, cp: u21) bool {
+    return cp >= self.cwcf_exceptions_min and
+        cp <= self.cwcf_exceptions_max and
+        std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null;
 }
author	Ryan Liptak	2024-06-27 02:33:06 -0700
committer	Ryan Liptak	2024-06-27 02:33:51 -0700
commit	bd7c0cf2998b626879e147e4cec2b30f71015631 (patch)
tree	9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74 /src/FoldData.zig
parent	Implements new case fold data encoding by @sqeek502 #8 (diff)
download	zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip