From 8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Wed, 26 Jun 2024 12:08:08 -0400 Subject: Implemented sqeek502s case fold --- src/CaseFold.zig | 3 +- src/FoldData.zig | 86 +++++++++++++++++++++++++++++++++----------------------- 2 files changed, 53 insertions(+), 36 deletions(-) (limited to 'src') diff --git a/src/CaseFold.zig b/src/CaseFold.zig index 3e7535e..19c9da8 100644 --- a/src/CaseFold.zig +++ b/src/CaseFold.zig @@ -19,9 +19,10 @@ pub fn caseFold( ) ![]const u21 { var cfcps = std.ArrayList(u21).init(allocator); defer cfcps.deinit(); + var buf: [3]u21 = undefined; for (cps) |cp| { - const cf = self.fold_data.caseFold(cp); + const cf = self.fold_data.caseFold(cp, &buf); if (cf.len == 0) { try cfcps.append(cp); diff --git a/src/FoldData.zig b/src/FoldData.zig index d4312b0..93613fe 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig @@ -4,8 +4,11 @@ const compress = std.compress; const mem = std.mem; allocator: mem.Allocator, -fold: [][]u21 = undefined, -cwcf: []bool = undefined, +cutoff: u21 = undefined, +multiple_start: u21 = undefined, +stage1: []u8 = undefined, +stage2: []u8 = undefined, +stage3: []i24 = undefined, const Self = @This(); @@ -17,49 +20,62 @@ pub fn init(allocator: mem.Allocator) !Self { var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); - var self = Self{ - .allocator = allocator, - .fold = try allocator.alloc([]u21, 0x110000), - .cwcf = try allocator.alloc(bool, 0x110000), - }; - - var slices: usize = 0; - errdefer { - for (self.fold[0..slices]) |slice| self.allocator.free(slice); - self.allocator.free(self.fold); - self.allocator.free(self.cwcf); - } - @memset(self.fold, &.{}); - @memset(self.cwcf, false); - - while (true) { - const len: u8 = try reader.readInt(u8, endian); - if (len == 0) break; - const cp = try reader.readInt(u24, endian); - self.fold[cp >> 1] = try allocator.alloc(u21, len - 1); - slices += 1; - for (0..len - 1) |i| { - self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian)); - } - self.cwcf[cp >> 1] = cp & 1 == 1; - } + var self = Self{ .allocator = allocator }; + self.cutoff = @intCast(try reader.readInt(u24, endian)); + self.multiple_start = @intCast(try reader.readInt(u24, endian)); + + var len = try reader.readInt(u16, endian); + self.stage1 = try allocator.alloc(u8, len); + errdefer allocator.free(self.stage1); + for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian); + + len = try reader.readInt(u16, endian); + self.stage2 = try allocator.alloc(u8, len); + errdefer allocator.free(self.stage2); + for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian); + + len = try reader.readInt(u16, endian); + self.stage3 = try allocator.alloc(i24, len); + errdefer allocator.free(self.stage3); + for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); return self; } pub fn deinit(self: *const Self) void { - for (self.fold) |slice| self.allocator.free(slice); - self.allocator.free(self.fold); - self.allocator.free(self.cwcf); + self.allocator.free(self.stage1); + self.allocator.free(self.stage2); + self.allocator.free(self.stage3); } /// Returns the case fold for `cp`. -pub inline fn caseFold(self: Self, cp: u21) []const u21 { - return self.fold[cp]; +pub inline fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { + if (cp >= self.cutoff) return &.{}; + + const stage1_val = self.stage1[cp >> 8]; + if (stage1_val == 0) return &.{}; + + const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); + const stage3_index = self.stage2[stage2_index]; + + if (stage3_index & 0x80 != 0) { + const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3; + const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0); + for (mapping, 0..) |c, i| buf[i] = @intCast(c); + + return buf[0..mapping.len]; + } + + const offset = self.stage3[stage3_index]; + if (offset == 0) return &.{}; + + buf[0] = @intCast(@as(i32, cp) + offset); + + return buf[0..1]; } /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). -pub inline fn changesWhenCaseFolded(self: Self, cp: u21) bool { - return self.cwcf[cp]; +pub inline fn changesWhenCaseFolded(_: Self, _: u21) bool { + return true; } -- cgit v1.2.3