summaryrefslogtreecommitdiff
path: root/src/FoldData.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-06-26 12:08:08 -0400
committerGravatar Jose Colon Rodriguez2024-06-26 12:08:08 -0400
commit8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0 (patch)
tree4c73852d462aea2a800964ab2345b9d2f3d95607 /src/FoldData.zig
parentMerge pull request 'Normalize: Mark utf8Encode errors as unreachable, use exp... (diff)
downloadzg-8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0.tar.gz
zg-8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0.tar.xz
zg-8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0.zip
Implemented sqeek502s case fold
Diffstat (limited to 'src/FoldData.zig')
-rw-r--r--src/FoldData.zig86
1 files changed, 51 insertions, 35 deletions
diff --git a/src/FoldData.zig b/src/FoldData.zig
index d4312b0..93613fe 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -4,8 +4,11 @@ const compress = std.compress;
4const mem = std.mem; 4const mem = std.mem;
5 5
6allocator: mem.Allocator, 6allocator: mem.Allocator,
7fold: [][]u21 = undefined, 7cutoff: u21 = undefined,
8cwcf: []bool = undefined, 8multiple_start: u21 = undefined,
9stage1: []u8 = undefined,
10stage2: []u8 = undefined,
11stage3: []i24 = undefined,
9 12
10const Self = @This(); 13const Self = @This();
11 14
@@ -17,49 +20,62 @@ pub fn init(allocator: mem.Allocator) !Self {
17 var reader = in_decomp.reader(); 20 var reader = in_decomp.reader();
18 21
19 const endian = builtin.cpu.arch.endian(); 22 const endian = builtin.cpu.arch.endian();
20 var self = Self{
21 .allocator = allocator,
22 .fold = try allocator.alloc([]u21, 0x110000),
23 .cwcf = try allocator.alloc(bool, 0x110000),
24 };
25
26 var slices: usize = 0;
27 errdefer {
28 for (self.fold[0..slices]) |slice| self.allocator.free(slice);
29 self.allocator.free(self.fold);
30 self.allocator.free(self.cwcf);
31 }
32 23
33 @memset(self.fold, &.{}); 24 var self = Self{ .allocator = allocator };
34 @memset(self.cwcf, false); 25 self.cutoff = @intCast(try reader.readInt(u24, endian));
35 26 self.multiple_start = @intCast(try reader.readInt(u24, endian));
36 while (true) { 27
37 const len: u8 = try reader.readInt(u8, endian); 28 var len = try reader.readInt(u16, endian);
38 if (len == 0) break; 29 self.stage1 = try allocator.alloc(u8, len);
39 const cp = try reader.readInt(u24, endian); 30 errdefer allocator.free(self.stage1);
40 self.fold[cp >> 1] = try allocator.alloc(u21, len - 1); 31 for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian);
41 slices += 1; 32
42 for (0..len - 1) |i| { 33 len = try reader.readInt(u16, endian);
43 self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian)); 34 self.stage2 = try allocator.alloc(u8, len);
44 } 35 errdefer allocator.free(self.stage2);
45 self.cwcf[cp >> 1] = cp & 1 == 1; 36 for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian);
46 } 37
38 len = try reader.readInt(u16, endian);
39 self.stage3 = try allocator.alloc(i24, len);
40 errdefer allocator.free(self.stage3);
41 for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);
47 42
48 return self; 43 return self;
49} 44}
50 45
51pub fn deinit(self: *const Self) void { 46pub fn deinit(self: *const Self) void {
52 for (self.fold) |slice| self.allocator.free(slice); 47 self.allocator.free(self.stage1);
53 self.allocator.free(self.fold); 48 self.allocator.free(self.stage2);
54 self.allocator.free(self.cwcf); 49 self.allocator.free(self.stage3);
55} 50}
56 51
57/// Returns the case fold for `cp`. 52/// Returns the case fold for `cp`.
58pub inline fn caseFold(self: Self, cp: u21) []const u21 { 53pub inline fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
59 return self.fold[cp]; 54 if (cp >= self.cutoff) return &.{};
55
56 const stage1_val = self.stage1[cp >> 8];
57 if (stage1_val == 0) return &.{};
58
59 const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
60 const stage3_index = self.stage2[stage2_index];
61
62 if (stage3_index & 0x80 != 0) {
63 const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3;
64 const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0);
65 for (mapping, 0..) |c, i| buf[i] = @intCast(c);
66
67 return buf[0..mapping.len];
68 }
69
70 const offset = self.stage3[stage3_index];
71 if (offset == 0) return &.{};
72
73 buf[0] = @intCast(@as(i32, cp) + offset);
74
75 return buf[0..1];
60} 76}
61 77
62/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). 78/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
63pub inline fn changesWhenCaseFolded(self: Self, cp: u21) bool { 79pub inline fn changesWhenCaseFolded(_: Self, _: u21) bool {
64 return self.cwcf[cp]; 80 return true;
65} 81}