summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Ryan Liptak2024-06-27 02:33:06 -0700
committerGravatar Ryan Liptak2024-06-27 02:33:51 -0700
commitbd7c0cf2998b626879e147e4cec2b30f71015631 (patch)
tree9c6a3a96ebd31d960fd4e5312f11ab0fb5470a74
parentImplements new case fold data encoding by @squeek502 #8 (diff)
downloadzg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.gz
zg-bd7c0cf2998b626879e147e4cec2b30f71015631.tar.xz
zg-bd7c0cf2998b626879e147e4cec2b30f71015631.zip
FoldData: Minimize Changes_When_Casefolded data
Only a few codepoints have a mapping in CaseFolding.txt but do not have the Changes_When_Casefolded property set. So, FoldData can just store a list of those particular codepoints and then re-use the encoded CaseFolding.txt data alongside it in order to implement changesWhenCaseFolded. This reduces the size of fold.bin.z from 4,387 bytes (4.28KiB) to 1,165 bytes (1.13KiB). This also seemingly introduced a very slight performance regression in zg_caseless. Before: zg CaseFold.compatCaselessMatch: result: 626, took: 258ns zg CaseFold.canonCaselessMatch: result: 626, took: 129ns After: zg CaseFold.compatCaselessMatch: result: 626, took: 263ns zg CaseFold.canonCaselessMatch: result: 626, took: 131ns
-rw-r--r--codegen/fold.zig21
-rw-r--r--src/FoldData.zig21
2 files changed, 34 insertions, 8 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 53ed3c4..cb73cca 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -83,6 +83,19 @@ pub fn main() !void {
83 try codepoint_mapping.putNoClobber(codepoint, mapping_buf); 83 try codepoint_mapping.putNoClobber(codepoint, mapping_buf);
84 } 84 }
85 85
86 var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator);
87 defer changes_when_casefolded_exceptions.deinit();
88
89 {
90 // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property,
91 // but not vice versa.
92 for (codepoint_mapping.keys()) |codepoint| {
93 if (props_map.get(codepoint) == null) {
94 try changes_when_casefolded_exceptions.append(codepoint);
95 }
96 }
97 }
98
86 var offset_to_index = std.AutoHashMap(i32, u8).init(allocator); 99 var offset_to_index = std.AutoHashMap(i32, u8).init(allocator);
87 defer offset_to_index.deinit(); 100 defer offset_to_index.deinit();
88 var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator); 101 var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator);
@@ -228,9 +241,11 @@ pub fn main() !void {
228 try writer.writeInt(u16, @intCast(stage3.len), endian); 241 try writer.writeInt(u16, @intCast(stage3.len), endian);
229 for (stage3) |offset| try writer.writeInt(i24, offset, endian); 242 for (stage3) |offset| try writer.writeInt(i24, offset, endian);
230 // Changes when case folded 243 // Changes when case folded
231 try writer.writeInt(u16, @intCast(props_map.count()), endian); 244 // Min and max
232 var iter = props_map.keyIterator(); 245 try writer.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian);
233 while (iter.next()) |key_ptr| try writer.writeInt(u24, key_ptr.*, endian); 246 try writer.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian);
247 try writer.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian);
248 for (changes_when_casefolded_exceptions.items) |cp| try writer.writeInt(u24, cp, endian);
234 249
235 try out_comp.flush(); 250 try out_comp.flush();
236 } 251 }
diff --git a/src/FoldData.zig b/src/FoldData.zig
index b7bbbd1..d425178 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -3,11 +3,11 @@ const builtin = @import("builtin");
3const compress = std.compress; 3const compress = std.compress;
4const mem = std.mem; 4const mem = std.mem;
5 5
6const cwcf_max = 0x1e950;
7
8allocator: mem.Allocator, 6allocator: mem.Allocator,
9cutoff: u21 = undefined, 7cutoff: u21 = undefined,
10cwcf: [cwcf_max]bool = [_]bool{false} ** cwcf_max, 8cwcf_exceptions_min: u21 = undefined,
9cwcf_exceptions_max: u21 = undefined,
10cwcf_exceptions: []u21 = undefined,
11multiple_start: u21 = undefined, 11multiple_start: u21 = undefined,
12stage1: []u8 = undefined, 12stage1: []u8 = undefined,
13stage2: []u8 = undefined, 13stage2: []u8 = undefined,
@@ -43,8 +43,11 @@ pub fn init(allocator: mem.Allocator) !Self {
43 errdefer allocator.free(self.stage3); 43 errdefer allocator.free(self.stage3);
44 for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); 44 for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);
45 45
46 self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
47 self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
46 len = try reader.readInt(u16, endian); 48 len = try reader.readInt(u16, endian);
47 for (0..len) |_| self.cwcf[try reader.readInt(u24, endian)] = true; 49 self.cwcf_exceptions = try allocator.alloc(u21, len);
50 for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
48 51
49 return self; 52 return self;
50} 53}
@@ -83,5 +86,13 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
83 86
84/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). 87/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
85pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { 88pub fn changesWhenCaseFolded(self: Self, cp: u21) bool {
86 return cp < cwcf_max and self.cwcf[cp]; 89 var buf: [3]u21 = undefined;
90 const has_mapping = self.caseFold(cp, &buf).len != 0;
91 return has_mapping and !self.isCwcfException(cp);
92}
93
94fn isCwcfException(self: Self, cp: u21) bool {
95 return cp >= self.cwcf_exceptions_min and
96 cp <= self.cwcf_exceptions_max and
97 std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null;
87} 98}