diff options
| author | 2025-04-30 15:23:14 -0400 | |
|---|---|---|
| committer | 2025-04-30 15:23:14 -0400 | |
| commit | 312ca415bb01212a320acacda743896ed59a7b82 (patch) | |
| tree | 8568a7af7084e5df6805388afc4f4008b8205c6c /src | |
| parent | Merge NormData with Normalize (diff) | |
| download | zg-312ca415bb01212a320acacda743896ed59a7b82.tar.gz zg-312ca415bb01212a320acacda743896ed59a7b82.tar.xz zg-312ca415bb01212a320acacda743896ed59a7b82.zip | |
Remove FoldData, make CaseFolding
CaseFolding now holds the data that used to live in FoldData, and can
optionally be initialized with an existing copy of Normalize.
Diffstat (limited to 'src')
| -rw-r--r-- | src/CanonData.zig | 14 | ||||
| -rw-r--r-- | src/CaseFold.zig | 256 | ||||
| -rw-r--r-- | src/FoldData.zig | 99 | ||||
| -rw-r--r-- | src/Normalize.zig | 16 |
4 files changed, 218 insertions, 167 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig index c67d1d6..d95a5be 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig | |||
| @@ -17,11 +17,11 @@ pub fn init(allocator: mem.Allocator) !CanonData { | |||
| 17 | .nfc = .empty, | 17 | .nfc = .empty, |
| 18 | .nfd = try allocator.alloc([]u21, 0x110000), | 18 | .nfd = try allocator.alloc([]u21, 0x110000), |
| 19 | }; | 19 | }; |
| 20 | var _cp: u24 = undefined; | ||
| 20 | 21 | ||
| 21 | var slices: usize = 0; | ||
| 22 | errdefer { | 22 | errdefer { |
| 23 | cdata.nfc.deinit(allocator); | 23 | cdata.nfc.deinit(allocator); |
| 24 | for (cdata.nfd[0..slices]) |slice| allocator.free(slice); | 24 | for (cdata.nfd[0.._cp]) |slice| allocator.free(slice); |
| 25 | allocator.free(cdata.nfd); | 25 | allocator.free(cdata.nfd); |
| 26 | } | 26 | } |
| 27 | 27 | ||
| @@ -31,14 +31,16 @@ pub fn init(allocator: mem.Allocator) !CanonData { | |||
| 31 | const len: u8 = try reader.readInt(u8, endian); | 31 | const len: u8 = try reader.readInt(u8, endian); |
| 32 | if (len == 0) break; | 32 | if (len == 0) break; |
| 33 | const cp = try reader.readInt(u24, endian); | 33 | const cp = try reader.readInt(u24, endian); |
| 34 | cdata.nfd[cp] = try allocator.alloc(u21, len - 1); | 34 | _cp = cp; |
| 35 | slices += 1; | 35 | const nfd_cp = try allocator.alloc(u21, len - 1); |
| 36 | errdefer allocator.free(nfd_cp); | ||
| 36 | for (0..len - 1) |i| { | 37 | for (0..len - 1) |i| { |
| 37 | cdata.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); | 38 | nfd_cp[i] = @intCast(try reader.readInt(u24, endian)); |
| 38 | } | 39 | } |
| 39 | if (len == 3) { | 40 | if (len == 3) { |
| 40 | try cdata.nfc.put(allocator, cdata.nfd[cp][0..2].*, @intCast(cp)); | 41 | try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp)); |
| 41 | } | 42 | } |
| 43 | cdata.nfd[cp] = nfd_cp; | ||
| 42 | } | 44 | } |
| 43 | 45 | ||
| 44 | return cdata; | 46 | return cdata; |
diff --git a/src/CaseFold.zig b/src/CaseFold.zig index 6490aea..162e82f 100644 --- a/src/CaseFold.zig +++ b/src/CaseFold.zig | |||
| @@ -1,28 +1,124 @@ | |||
| 1 | const std = @import("std"); | 1 | cutoff: u21 = undefined, |
| 2 | const mem = std.mem; | 2 | cwcf_exceptions_min: u21 = undefined, |
| 3 | const testing = std.testing; | 3 | cwcf_exceptions_max: u21 = undefined, |
| 4 | cwcf_exceptions: []u21 = undefined, | ||
| 5 | multiple_start: u21 = undefined, | ||
| 6 | stage1: []u8 = undefined, | ||
| 7 | stage2: []u8 = undefined, | ||
| 8 | stage3: []i24 = undefined, | ||
| 9 | normalize: Normalize, | ||
| 10 | owns_normalize: bool, | ||
| 11 | |||
| 12 | const CaseFolding = @This(); | ||
| 13 | |||
| 14 | pub fn init(allocator: Allocator) !CaseFolding { | ||
| 15 | var case_fold: CaseFolding = undefined; | ||
| 16 | try case_fold.setup(allocator); | ||
| 17 | return case_fold; | ||
| 18 | } | ||
| 4 | 19 | ||
| 5 | const ascii = @import("ascii"); | 20 | pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding { |
| 6 | pub const FoldData = @import("FoldData"); | 21 | var casefold: CaseFolding = undefined; |
| 7 | const Normalize = @import("Normalize"); | 22 | try casefold.setupWithNormalize(allocator, norm); |
| 23 | return casefold; | ||
| 24 | } | ||
| 25 | |||
| 26 | pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void { | ||
| 27 | try casefold.setupImpl(allocator); | ||
| 28 | casefold.owns_normalize = false; | ||
| 29 | errdefer casefold.deinit(allocator); | ||
| 30 | try casefold.normalize.setup(allocator); | ||
| 31 | casefold.owns_normalize = true; | ||
| 32 | } | ||
| 33 | |||
| 34 | pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { | ||
| 35 | try casefold.setupImpl(allocator); | ||
| 36 | casefold.normalize = norm; | ||
| 37 | casefold.owns_normalize = false; | ||
| 38 | } | ||
| 8 | 39 | ||
| 9 | fold_data: *const FoldData, | 40 | fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void { |
| 41 | const decompressor = compress.flate.inflate.decompressor; | ||
| 42 | const in_bytes = @embedFile("fold"); | ||
| 43 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 44 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 45 | var reader = in_decomp.reader(); | ||
| 46 | |||
| 47 | const endian = builtin.cpu.arch.endian(); | ||
| 48 | |||
| 49 | casefold.cutoff = @intCast(try reader.readInt(u24, endian)); | ||
| 50 | casefold.multiple_start = @intCast(try reader.readInt(u24, endian)); | ||
| 51 | |||
| 52 | var len = try reader.readInt(u16, endian); | ||
| 53 | casefold.stage1 = try allocator.alloc(u8, len); | ||
| 54 | errdefer allocator.free(casefold.stage1); | ||
| 55 | for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian); | ||
| 56 | |||
| 57 | len = try reader.readInt(u16, endian); | ||
| 58 | casefold.stage2 = try allocator.alloc(u8, len); | ||
| 59 | errdefer allocator.free(casefold.stage2); | ||
| 60 | for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian); | ||
| 61 | |||
| 62 | len = try reader.readInt(u16, endian); | ||
| 63 | casefold.stage3 = try allocator.alloc(i24, len); | ||
| 64 | errdefer allocator.free(casefold.stage3); | ||
| 65 | for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian); | ||
| 66 | |||
| 67 | casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | ||
| 68 | casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | ||
| 69 | len = try reader.readInt(u16, endian); | ||
| 70 | casefold.cwcf_exceptions = try allocator.alloc(u21, len); | ||
| 71 | errdefer allocator.free(casefold.cwcf_exceptions); | ||
| 72 | for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 73 | } | ||
| 74 | |||
| 75 | pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void { | ||
| 76 | allocator.free(fdata.stage1); | ||
| 77 | allocator.free(fdata.stage2); | ||
| 78 | allocator.free(fdata.stage3); | ||
| 79 | allocator.free(fdata.cwcf_exceptions); | ||
| 80 | if (fdata.owns_normalize) fdata.normalize.deinit(allocator); | ||
| 81 | } | ||
| 82 | |||
| 83 | /// Returns the case fold for `cp`. | ||
| 84 | pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { | ||
| 85 | if (cp >= fdata.cutoff) return &.{}; | ||
| 86 | |||
| 87 | const stage1_val = fdata.stage1[cp >> 8]; | ||
| 88 | if (stage1_val == 0) return &.{}; | ||
| 89 | |||
| 90 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); | ||
| 91 | const stage3_index = fdata.stage2[stage2_index]; | ||
| 10 | 92 | ||
| 11 | const Self = @This(); | 93 | if (stage3_index & 0x80 != 0) { |
| 94 | const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; | ||
| 95 | const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); | ||
| 96 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); | ||
| 97 | |||
| 98 | return buf[0..mapping.len]; | ||
| 99 | } | ||
| 100 | |||
| 101 | const offset = fdata.stage3[stage3_index]; | ||
| 102 | if (offset == 0) return &.{}; | ||
| 103 | |||
| 104 | buf[0] = @intCast(@as(i32, cp) + offset); | ||
| 105 | |||
| 106 | return buf[0..1]; | ||
| 107 | } | ||
| 12 | 108 | ||
| 13 | /// Produces the case folded code points for `cps`. Caller must free returned | 109 | /// Produces the case folded code points for `cps`. Caller must free returned |
| 14 | /// slice with `allocator`. | 110 | /// slice with `allocator`. |
| 15 | pub fn caseFold( | 111 | pub fn caseFoldAlloc( |
| 16 | self: Self, | 112 | casefold: *const CaseFolding, |
| 17 | allocator: mem.Allocator, | 113 | allocator: Allocator, |
| 18 | cps: []const u21, | 114 | cps: []const u21, |
| 19 | ) ![]const u21 { | 115 | ) Allocator.Error![]const u21 { |
| 20 | var cfcps = std.ArrayList(u21).init(allocator); | 116 | var cfcps = std.ArrayList(u21).init(allocator); |
| 21 | defer cfcps.deinit(); | 117 | defer cfcps.deinit(); |
| 22 | var buf: [3]u21 = undefined; | 118 | var buf: [3]u21 = undefined; |
| 23 | 119 | ||
| 24 | for (cps) |cp| { | 120 | for (cps) |cp| { |
| 25 | const cf = self.fold_data.caseFold(cp, &buf); | 121 | const cf = casefold.caseFold(cp, &buf); |
| 26 | 122 | ||
| 27 | if (cf.len == 0) { | 123 | if (cf.len == 0) { |
| 28 | try cfcps.append(cp); | 124 | try cfcps.append(cp); |
| @@ -34,59 +130,71 @@ pub fn caseFold( | |||
| 34 | return try cfcps.toOwnedSlice(); | 130 | return try cfcps.toOwnedSlice(); |
| 35 | } | 131 | } |
| 36 | 132 | ||
| 37 | fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { | 133 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). |
| 134 | pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { | ||
| 135 | var buf: [3]u21 = undefined; | ||
| 136 | const has_mapping = casefold.caseFold(cp, &buf).len != 0; | ||
| 137 | return has_mapping and !casefold.isCwcfException(cp); | ||
| 138 | } | ||
| 139 | |||
| 140 | pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { | ||
| 38 | return for (cps) |cp| { | 141 | return for (cps) |cp| { |
| 39 | if (self.fold_data.changesWhenCaseFolded(cp)) break true; | 142 | if (casefold.cpChangesWhenCaseFolded(cp)) break true; |
| 40 | } else false; | 143 | } else false; |
| 41 | } | 144 | } |
| 42 | 145 | ||
| 146 | fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { | ||
| 147 | return cp >= casefold.cwcf_exceptions_min and | ||
| 148 | cp <= casefold.cwcf_exceptions_max and | ||
| 149 | std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; | ||
| 150 | } | ||
| 151 | |||
| 43 | /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most | 152 | /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most |
| 44 | /// comprehensive comparison possible, but slower than `canonCaselessMatch`. | 153 | /// comprehensive comparison possible, but slower than `canonCaselessMatch`. |
| 45 | pub fn compatCaselessMatch( | 154 | pub fn compatCaselessMatch( |
| 46 | self: Self, | 155 | casefold: *const CaseFolding, |
| 47 | allocator: mem.Allocator, | 156 | allocator: Allocator, |
| 48 | normalizer: *const Normalize, | ||
| 49 | a: []const u8, | 157 | a: []const u8, |
| 50 | b: []const u8, | 158 | b: []const u8, |
| 51 | ) !bool { | 159 | ) Allocator.Error!bool { |
| 52 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 160 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); |
| 53 | 161 | ||
| 54 | // Process a | 162 | // Process a |
| 55 | const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); | 163 | const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); |
| 56 | defer allocator.free(nfd_a); | 164 | defer allocator.free(nfd_a); |
| 57 | 165 | ||
| 58 | var need_free_cf_nfd_a = false; | 166 | var need_free_cf_nfd_a = false; |
| 59 | var cf_nfd_a: []const u21 = nfd_a; | 167 | var cf_nfd_a: []const u21 = nfd_a; |
| 60 | if (self.changesWhenCaseFolded(nfd_a)) { | 168 | if (casefold.changesWhenCaseFolded(nfd_a)) { |
| 61 | cf_nfd_a = try self.caseFold(allocator, nfd_a); | 169 | cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); |
| 62 | need_free_cf_nfd_a = true; | 170 | need_free_cf_nfd_a = true; |
| 63 | } | 171 | } |
| 64 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); | 172 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); |
| 65 | 173 | ||
| 66 | const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a); | 174 | const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); |
| 67 | defer allocator.free(nfkd_cf_nfd_a); | 175 | defer allocator.free(nfkd_cf_nfd_a); |
| 68 | const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); | 176 | const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); |
| 69 | defer allocator.free(cf_nfkd_cf_nfd_a); | 177 | defer allocator.free(cf_nfkd_cf_nfd_a); |
| 70 | const nfkd_cf_nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); | 178 | const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); |
| 71 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); | 179 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); |
| 72 | 180 | ||
| 73 | // Process b | 181 | // Process b |
| 74 | const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); | 182 | const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); |
| 75 | defer allocator.free(nfd_b); | 183 | defer allocator.free(nfd_b); |
| 76 | 184 | ||
| 77 | var need_free_cf_nfd_b = false; | 185 | var need_free_cf_nfd_b = false; |
| 78 | var cf_nfd_b: []const u21 = nfd_b; | 186 | var cf_nfd_b: []const u21 = nfd_b; |
| 79 | if (self.changesWhenCaseFolded(nfd_b)) { | 187 | if (casefold.changesWhenCaseFolded(nfd_b)) { |
| 80 | cf_nfd_b = try self.caseFold(allocator, nfd_b); | 188 | cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); |
| 81 | need_free_cf_nfd_b = true; | 189 | need_free_cf_nfd_b = true; |
| 82 | } | 190 | } |
| 83 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); | 191 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); |
| 84 | 192 | ||
| 85 | const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b); | 193 | const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); |
| 86 | defer allocator.free(nfkd_cf_nfd_b); | 194 | defer allocator.free(nfkd_cf_nfd_b); |
| 87 | const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); | 195 | const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); |
| 88 | defer allocator.free(cf_nfkd_cf_nfd_b); | 196 | defer allocator.free(cf_nfkd_cf_nfd_b); |
| 89 | const nfkd_cf_nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); | 197 | const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); |
| 90 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); | 198 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); |
| 91 | 199 | ||
| 92 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); | 200 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); |
| @@ -95,42 +203,37 @@ pub fn compatCaselessMatch( | |||
| 95 | test "compatCaselessMatch" { | 203 | test "compatCaselessMatch" { |
| 96 | const allocator = testing.allocator; | 204 | const allocator = testing.allocator; |
| 97 | 205 | ||
| 98 | const n = try Normalize.init(allocator); | 206 | const caser = try CaseFolding.init(allocator); |
| 99 | defer n.deinit(allocator); | 207 | defer caser.deinit(allocator); |
| 100 | 208 | ||
| 101 | const fold_data = try FoldData.init(allocator); | 209 | try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); |
| 102 | defer fold_data.deinit(allocator); | ||
| 103 | const caser = Self{ .fold_data = &fold_data }; | ||
| 104 | |||
| 105 | try testing.expect(try caser.compatCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!")); | ||
| 106 | 210 | ||
| 107 | const a = "Héllo World! \u{3d3}"; | 211 | const a = "Héllo World! \u{3d3}"; |
| 108 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | 212 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; |
| 109 | try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, b)); | 213 | try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); |
| 110 | 214 | ||
| 111 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | 215 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; |
| 112 | try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); | 216 | try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); |
| 113 | } | 217 | } |
| 114 | 218 | ||
| 115 | /// Performs canonical caseless string matching by decomposing to NFD. This is | 219 | /// Performs canonical caseless string matching by decomposing to NFD. This is |
| 116 | /// faster than `compatCaselessMatch`, but less comprehensive. | 220 | /// faster than `compatCaselessMatch`, but less comprehensive. |
| 117 | pub fn canonCaselessMatch( | 221 | pub fn canonCaselessMatch( |
| 118 | self: Self, | 222 | casefold: *const CaseFolding, |
| 119 | allocator: mem.Allocator, | 223 | allocator: Allocator, |
| 120 | normalizer: *const Normalize, | ||
| 121 | a: []const u8, | 224 | a: []const u8, |
| 122 | b: []const u8, | 225 | b: []const u8, |
| 123 | ) !bool { | 226 | ) Allocator.Error!bool { |
| 124 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 227 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); |
| 125 | 228 | ||
| 126 | // Process a | 229 | // Process a |
| 127 | const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); | 230 | const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); |
| 128 | defer allocator.free(nfd_a); | 231 | defer allocator.free(nfd_a); |
| 129 | 232 | ||
| 130 | var need_free_cf_nfd_a = false; | 233 | var need_free_cf_nfd_a = false; |
| 131 | var cf_nfd_a: []const u21 = nfd_a; | 234 | var cf_nfd_a: []const u21 = nfd_a; |
| 132 | if (self.changesWhenCaseFolded(nfd_a)) { | 235 | if (casefold.changesWhenCaseFolded(nfd_a)) { |
| 133 | cf_nfd_a = try self.caseFold(allocator, nfd_a); | 236 | cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); |
| 134 | need_free_cf_nfd_a = true; | 237 | need_free_cf_nfd_a = true; |
| 135 | } | 238 | } |
| 136 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); | 239 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); |
| @@ -138,19 +241,19 @@ pub fn canonCaselessMatch( | |||
| 138 | var need_free_nfd_cf_nfd_a = false; | 241 | var need_free_nfd_cf_nfd_a = false; |
| 139 | var nfd_cf_nfd_a = cf_nfd_a; | 242 | var nfd_cf_nfd_a = cf_nfd_a; |
| 140 | if (!need_free_cf_nfd_a) { | 243 | if (!need_free_cf_nfd_a) { |
| 141 | nfd_cf_nfd_a = try normalizer.nfdCodePoints(allocator, cf_nfd_a); | 244 | nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); |
| 142 | need_free_nfd_cf_nfd_a = true; | 245 | need_free_nfd_cf_nfd_a = true; |
| 143 | } | 246 | } |
| 144 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); | 247 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); |
| 145 | 248 | ||
| 146 | // Process b | 249 | // Process b |
| 147 | const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); | 250 | const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); |
| 148 | defer allocator.free(nfd_b); | 251 | defer allocator.free(nfd_b); |
| 149 | 252 | ||
| 150 | var need_free_cf_nfd_b = false; | 253 | var need_free_cf_nfd_b = false; |
| 151 | var cf_nfd_b: []const u21 = nfd_b; | 254 | var cf_nfd_b: []const u21 = nfd_b; |
| 152 | if (self.changesWhenCaseFolded(nfd_b)) { | 255 | if (casefold.changesWhenCaseFolded(nfd_b)) { |
| 153 | cf_nfd_b = try self.caseFold(allocator, nfd_b); | 256 | cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); |
| 154 | need_free_cf_nfd_b = true; | 257 | need_free_cf_nfd_b = true; |
| 155 | } | 258 | } |
| 156 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); | 259 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); |
| @@ -158,7 +261,7 @@ pub fn canonCaselessMatch( | |||
| 158 | var need_free_nfd_cf_nfd_b = false; | 261 | var need_free_nfd_cf_nfd_b = false; |
| 159 | var nfd_cf_nfd_b = cf_nfd_b; | 262 | var nfd_cf_nfd_b = cf_nfd_b; |
| 160 | if (!need_free_cf_nfd_b) { | 263 | if (!need_free_cf_nfd_b) { |
| 161 | nfd_cf_nfd_b = try normalizer.nfdCodePoints(allocator, cf_nfd_b); | 264 | nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); |
| 162 | need_free_nfd_cf_nfd_b = true; | 265 | need_free_nfd_cf_nfd_b = true; |
| 163 | } | 266 | } |
| 164 | defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); | 267 | defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); |
| @@ -169,19 +272,50 @@ pub fn canonCaselessMatch( | |||
| 169 | test "canonCaselessMatch" { | 272 | test "canonCaselessMatch" { |
| 170 | const allocator = testing.allocator; | 273 | const allocator = testing.allocator; |
| 171 | 274 | ||
| 172 | const n = try Normalize.init(allocator); | 275 | const caser = try CaseFolding.init(allocator); |
| 173 | defer n.deinit(allocator); | 276 | defer caser.deinit(allocator); |
| 174 | |||
| 175 | const fold_data = try FoldData.init(allocator); | ||
| 176 | defer fold_data.deinit(allocator); | ||
| 177 | const caser = Self{ .fold_data = &fold_data }; | ||
| 178 | 277 | ||
| 179 | try testing.expect(try caser.canonCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!")); | 278 | try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); |
| 180 | 279 | ||
| 181 | const a = "Héllo World! \u{3d3}"; | 280 | const a = "Héllo World! \u{3d3}"; |
| 182 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | 281 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; |
| 183 | try testing.expect(!try caser.canonCaselessMatch(allocator, &n, a, b)); | 282 | try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); |
| 184 | 283 | ||
| 185 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | 284 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; |
| 186 | try testing.expect(try caser.canonCaselessMatch(allocator, &n, a, c)); | 285 | try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); |
| 187 | } | 286 | } |
| 287 | |||
| 288 | fn testAllocations(allocator: Allocator) !void { | ||
| 289 | // With normalize provided | ||
| 290 | { | ||
| 291 | const normalize = try Normalize.init(allocator); | ||
| 292 | defer normalize.deinit(allocator); | ||
| 293 | const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); | ||
| 294 | defer caser1.deinit(allocator); | ||
| 295 | } | ||
| 296 | // With normalize owned | ||
| 297 | { | ||
| 298 | const caser2 = try CaseFolding.init(allocator); | ||
| 299 | defer caser2.deinit(allocator); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | // test "Allocation Failures" { | ||
| 304 | // if (true) return error.SkipZigTest; // XXX: remove | ||
| 305 | // try testing.checkAllAllocationFailures( | ||
| 306 | // testing.allocator, | ||
| 307 | // testAllocations, | ||
| 308 | // .{}, | ||
| 309 | // ); | ||
| 310 | // } | ||
| 311 | |||
| 312 | const std = @import("std"); | ||
| 313 | const builtin = @import("builtin"); | ||
| 314 | const mem = std.mem; | ||
| 315 | const testing = std.testing; | ||
| 316 | const Allocator = mem.Allocator; | ||
| 317 | |||
| 318 | const ascii = @import("ascii"); | ||
| 319 | const Normalize = @import("Normalize"); | ||
| 320 | |||
| 321 | const compress = std.compress; | ||
diff --git a/src/FoldData.zig b/src/FoldData.zig deleted file mode 100644 index b7fdceb..0000000 --- a/src/FoldData.zig +++ /dev/null | |||
| @@ -1,99 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
| 6 | cutoff: u21 = undefined, | ||
| 7 | cwcf_exceptions_min: u21 = undefined, | ||
| 8 | cwcf_exceptions_max: u21 = undefined, | ||
| 9 | cwcf_exceptions: []u21 = undefined, | ||
| 10 | multiple_start: u21 = undefined, | ||
| 11 | stage1: []u8 = undefined, | ||
| 12 | stage2: []u8 = undefined, | ||
| 13 | stage3: []i24 = undefined, | ||
| 14 | |||
| 15 | const FoldData = @This(); | ||
| 16 | |||
| 17 | pub fn init(allocator: mem.Allocator) !FoldData { | ||
| 18 | const decompressor = compress.flate.inflate.decompressor; | ||
| 19 | const in_bytes = @embedFile("fold"); | ||
| 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 21 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 22 | var reader = in_decomp.reader(); | ||
| 23 | |||
| 24 | const endian = builtin.cpu.arch.endian(); | ||
| 25 | |||
| 26 | var fdata = FoldData{}; | ||
| 27 | fdata.cutoff = @intCast(try reader.readInt(u24, endian)); | ||
| 28 | fdata.multiple_start = @intCast(try reader.readInt(u24, endian)); | ||
| 29 | |||
| 30 | var len = try reader.readInt(u16, endian); | ||
| 31 | fdata.stage1 = try allocator.alloc(u8, len); | ||
| 32 | errdefer allocator.free(fdata.stage1); | ||
| 33 | for (0..len) |i| fdata.stage1[i] = try reader.readInt(u8, endian); | ||
| 34 | |||
| 35 | len = try reader.readInt(u16, endian); | ||
| 36 | fdata.stage2 = try allocator.alloc(u8, len); | ||
| 37 | errdefer allocator.free(fdata.stage2); | ||
| 38 | for (0..len) |i| fdata.stage2[i] = try reader.readInt(u8, endian); | ||
| 39 | |||
| 40 | len = try reader.readInt(u16, endian); | ||
| 41 | fdata.stage3 = try allocator.alloc(i24, len); | ||
| 42 | errdefer allocator.free(fdata.stage3); | ||
| 43 | for (0..len) |i| fdata.stage3[i] = try reader.readInt(i24, endian); | ||
| 44 | |||
| 45 | fdata.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | ||
| 46 | fdata.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | ||
| 47 | len = try reader.readInt(u16, endian); | ||
| 48 | fdata.cwcf_exceptions = try allocator.alloc(u21, len); | ||
| 49 | errdefer allocator.free(fdata.cwcf_exceptions); | ||
| 50 | for (0..len) |i| fdata.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 51 | |||
| 52 | return fdata; | ||
| 53 | } | ||
| 54 | |||
| 55 | pub fn deinit(fdata: *const FoldData, allocator: mem.Allocator) void { | ||
| 56 | allocator.free(fdata.stage1); | ||
| 57 | allocator.free(fdata.stage2); | ||
| 58 | allocator.free(fdata.stage3); | ||
| 59 | allocator.free(fdata.cwcf_exceptions); | ||
| 60 | } | ||
| 61 | |||
| 62 | /// Returns the case fold for `cp`. | ||
| 63 | pub fn caseFold(fdata: *const FoldData, cp: u21, buf: []u21) []const u21 { | ||
| 64 | if (cp >= fdata.cutoff) return &.{}; | ||
| 65 | |||
| 66 | const stage1_val = fdata.stage1[cp >> 8]; | ||
| 67 | if (stage1_val == 0) return &.{}; | ||
| 68 | |||
| 69 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); | ||
| 70 | const stage3_index = fdata.stage2[stage2_index]; | ||
| 71 | |||
| 72 | if (stage3_index & 0x80 != 0) { | ||
| 73 | const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; | ||
| 74 | const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); | ||
| 75 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); | ||
| 76 | |||
| 77 | return buf[0..mapping.len]; | ||
| 78 | } | ||
| 79 | |||
| 80 | const offset = fdata.stage3[stage3_index]; | ||
| 81 | if (offset == 0) return &.{}; | ||
| 82 | |||
| 83 | buf[0] = @intCast(@as(i32, cp) + offset); | ||
| 84 | |||
| 85 | return buf[0..1]; | ||
| 86 | } | ||
| 87 | |||
| 88 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | ||
| 89 | pub fn changesWhenCaseFolded(fdata: *const FoldData, cp: u21) bool { | ||
| 90 | var buf: [3]u21 = undefined; | ||
| 91 | const has_mapping = fdata.caseFold(cp, &buf).len != 0; | ||
| 92 | return has_mapping and !fdata.isCwcfException(cp); | ||
| 93 | } | ||
| 94 | |||
| 95 | fn isCwcfException(fdata: *const FoldData, cp: u21) bool { | ||
| 96 | return cp >= fdata.cwcf_exceptions_min and | ||
| 97 | cp <= fdata.cwcf_exceptions_max and | ||
| 98 | std.mem.indexOfScalar(u21, fdata.cwcf_exceptions, cp) != null; | ||
| 99 | } | ||
diff --git a/src/Normalize.zig b/src/Normalize.zig index 4f014cf..d8c867d 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -632,6 +632,21 @@ test "isLatin1Only" { | |||
| 632 | try testing.expect(!isLatin1Only(not_latin1_only)); | 632 | try testing.expect(!isLatin1Only(not_latin1_only)); |
| 633 | } | 633 | } |
| 634 | 634 | ||
| 635 | // NOTE: These tests take way waaaaay too long to run, because | ||
| 636 | // the amount of allocations in a couple of the inflators is | ||
| 637 | // completely excessive and is also costing memory for metadata. | ||
| 638 | // I'm leaving this here for when I fix that. | ||
| 639 | // | ||
| 640 | // fn testAllocations(allocator: Allocator) !void { | ||
| 641 | // const norm = try Normalize.init(allocator); | ||
| 642 | // norm.deinit(allocator); | ||
| 643 | // } | ||
| 644 | // | ||
| 645 | // test "allocation failures" { | ||
| 646 | // if (true) return error.SkipZigTest; | ||
| 647 | // try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); | ||
| 648 | // } | ||
| 649 | |||
| 635 | const std = @import("std"); | 650 | const std = @import("std"); |
| 636 | const debug = std.debug; | 651 | const debug = std.debug; |
| 637 | const assert = debug.assert; | 652 | const assert = debug.assert; |
| @@ -649,6 +664,5 @@ const CodePointIterator = @import("code_point").Iterator; | |||
| 649 | const CanonData = @import("CanonData"); | 664 | const CanonData = @import("CanonData"); |
| 650 | const CccData = @import("CombiningData"); | 665 | const CccData = @import("CombiningData"); |
| 651 | const CompatData = @import("CompatData"); | 666 | const CompatData = @import("CompatData"); |
| 652 | const FoldData = @import("FoldData"); | ||
| 653 | const HangulData = @import("HangulData"); | 667 | const HangulData = @import("HangulData"); |
| 654 | const NormPropsData = @import("NormPropsData"); | 668 | const NormPropsData = @import("NormPropsData"); |