diff options
| author | 2024-06-26 12:08:08 -0400 | |
|---|---|---|
| committer | 2024-06-26 12:08:08 -0400 | |
| commit | 8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0 (patch) | |
| tree | 4c73852d462aea2a800964ab2345b9d2f3d95607 /src/FoldData.zig | |
| parent | Merge pull request 'Normalize: Mark utf8Encode errors as unreachable, use exp... (diff) | |
| download | zg-8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0.tar.gz zg-8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0.tar.xz zg-8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0.zip | |
Implemented sqeek502's case fold
Diffstat (limited to 'src/FoldData.zig')
| -rw-r--r-- | src/FoldData.zig | 86 |
1 file changed, 51 insertions, 35 deletions
diff --git a/src/FoldData.zig b/src/FoldData.zig index d4312b0..93613fe 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig | |||
| @@ -4,8 +4,11 @@ const compress = std.compress; | |||
| 4 | const mem = std.mem; | 4 | const mem = std.mem; |
| 5 | 5 | ||
| 6 | allocator: mem.Allocator, | 6 | allocator: mem.Allocator, |
| 7 | fold: [][]u21 = undefined, | 7 | cutoff: u21 = undefined, |
| 8 | cwcf: []bool = undefined, | 8 | multiple_start: u21 = undefined, |
| 9 | stage1: []u8 = undefined, | ||
| 10 | stage2: []u8 = undefined, | ||
| 11 | stage3: []i24 = undefined, | ||
| 9 | 12 | ||
| 10 | const Self = @This(); | 13 | const Self = @This(); |
| 11 | 14 | ||
| @@ -17,49 +20,62 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 17 | var reader = in_decomp.reader(); | 20 | var reader = in_decomp.reader(); |
| 18 | 21 | ||
| 19 | const endian = builtin.cpu.arch.endian(); | 22 | const endian = builtin.cpu.arch.endian(); |
| 20 | var self = Self{ | ||
| 21 | .allocator = allocator, | ||
| 22 | .fold = try allocator.alloc([]u21, 0x110000), | ||
| 23 | .cwcf = try allocator.alloc(bool, 0x110000), | ||
| 24 | }; | ||
| 25 | |||
| 26 | var slices: usize = 0; | ||
| 27 | errdefer { | ||
| 28 | for (self.fold[0..slices]) |slice| self.allocator.free(slice); | ||
| 29 | self.allocator.free(self.fold); | ||
| 30 | self.allocator.free(self.cwcf); | ||
| 31 | } | ||
| 32 | 23 | ||
| 33 | @memset(self.fold, &.{}); | 24 | var self = Self{ .allocator = allocator }; |
| 34 | @memset(self.cwcf, false); | 25 | self.cutoff = @intCast(try reader.readInt(u24, endian)); |
| 35 | 26 | self.multiple_start = @intCast(try reader.readInt(u24, endian)); | |
| 36 | while (true) { | 27 | |
| 37 | const len: u8 = try reader.readInt(u8, endian); | 28 | var len = try reader.readInt(u16, endian); |
| 38 | if (len == 0) break; | 29 | self.stage1 = try allocator.alloc(u8, len); |
| 39 | const cp = try reader.readInt(u24, endian); | 30 | errdefer allocator.free(self.stage1); |
| 40 | self.fold[cp >> 1] = try allocator.alloc(u21, len - 1); | 31 | for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian); |
| 41 | slices += 1; | 32 | |
| 42 | for (0..len - 1) |i| { | 33 | len = try reader.readInt(u16, endian); |
| 43 | self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian)); | 34 | self.stage2 = try allocator.alloc(u8, len); |
| 44 | } | 35 | errdefer allocator.free(self.stage2); |
| 45 | self.cwcf[cp >> 1] = cp & 1 == 1; | 36 | for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian); |
| 46 | } | 37 | |
| 38 | len = try reader.readInt(u16, endian); | ||
| 39 | self.stage3 = try allocator.alloc(i24, len); | ||
| 40 | errdefer allocator.free(self.stage3); | ||
| 41 | for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); | ||
| 47 | 42 | ||
| 48 | return self; | 43 | return self; |
| 49 | } | 44 | } |
| 50 | 45 | ||
| 51 | pub fn deinit(self: *const Self) void { | 46 | pub fn deinit(self: *const Self) void { |
| 52 | for (self.fold) |slice| self.allocator.free(slice); | 47 | self.allocator.free(self.stage1); |
| 53 | self.allocator.free(self.fold); | 48 | self.allocator.free(self.stage2); |
| 54 | self.allocator.free(self.cwcf); | 49 | self.allocator.free(self.stage3); |
| 55 | } | 50 | } |
| 56 | 51 | ||
| 57 | /// Returns the case fold for `cp`. | 52 | /// Returns the case fold for `cp`. |
| 58 | pub inline fn caseFold(self: Self, cp: u21) []const u21 { | 53 | pub inline fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { |
| 59 | return self.fold[cp]; | 54 | if (cp >= self.cutoff) return &.{}; |
| 55 | |||
| 56 | const stage1_val = self.stage1[cp >> 8]; | ||
| 57 | if (stage1_val == 0) return &.{}; | ||
| 58 | |||
| 59 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); | ||
| 60 | const stage3_index = self.stage2[stage2_index]; | ||
| 61 | |||
| 62 | if (stage3_index & 0x80 != 0) { | ||
| 63 | const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3; | ||
| 64 | const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0); | ||
| 65 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); | ||
| 66 | |||
| 67 | return buf[0..mapping.len]; | ||
| 68 | } | ||
| 69 | |||
| 70 | const offset = self.stage3[stage3_index]; | ||
| 71 | if (offset == 0) return &.{}; | ||
| 72 | |||
| 73 | buf[0] = @intCast(@as(i32, cp) + offset); | ||
| 74 | |||
| 75 | return buf[0..1]; | ||
| 60 | } | 76 | } |
| 61 | 77 | ||
| 62 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | 78 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). |
| 63 | pub inline fn changesWhenCaseFolded(self: Self, cp: u21) bool { | 79 | pub inline fn changesWhenCaseFolded(_: Self, _: u21) bool { |
| 64 | return self.cwcf[cp]; | 80 | return true; |
| 65 | } | 81 | } |