diff options
| author | 2025-04-30 15:32:34 -0400 | |
|---|---|---|
| committer | 2025-04-30 15:32:34 -0400 | |
| commit | 958c13ba442e7077a50d7163fdeb9bba378f95c2 (patch) | |
| tree | 0727fd03ea2344ebbad842daa05b55ea0a143a6c /src/CaseFold.zig | |
| parent | Remove FoldData, make CaseFolding (diff) | |
| download | zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.tar.gz zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.tar.xz zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.zip | |
Rest of the Renamings
These get different names, but don't otherwise change.
Diffstat (limited to 'src/CaseFold.zig')
| -rw-r--r-- | src/CaseFold.zig | 321 |
1 files changed, 0 insertions, 321 deletions
diff --git a/src/CaseFold.zig b/src/CaseFold.zig deleted file mode 100644 index 162e82f..0000000 --- a/src/CaseFold.zig +++ /dev/null | |||
| @@ -1,321 +0,0 @@ | |||
| 1 | cutoff: u21 = undefined, | ||
| 2 | cwcf_exceptions_min: u21 = undefined, | ||
| 3 | cwcf_exceptions_max: u21 = undefined, | ||
| 4 | cwcf_exceptions: []u21 = undefined, | ||
| 5 | multiple_start: u21 = undefined, | ||
| 6 | stage1: []u8 = undefined, | ||
| 7 | stage2: []u8 = undefined, | ||
| 8 | stage3: []i24 = undefined, | ||
| 9 | normalize: Normalize, | ||
| 10 | owns_normalize: bool, | ||
| 11 | |||
| 12 | const CaseFolding = @This(); | ||
| 13 | |||
| 14 | pub fn init(allocator: Allocator) !CaseFolding { | ||
| 15 | var case_fold: CaseFolding = undefined; | ||
| 16 | try case_fold.setup(allocator); | ||
| 17 | return case_fold; | ||
| 18 | } | ||
| 19 | |||
| 20 | pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding { | ||
| 21 | var casefold: CaseFolding = undefined; | ||
| 22 | try casefold.setupWithNormalize(allocator, norm); | ||
| 23 | return casefold; | ||
| 24 | } | ||
| 25 | |||
| 26 | pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void { | ||
| 27 | try casefold.setupImpl(allocator); | ||
| 28 | casefold.owns_normalize = false; | ||
| 29 | errdefer casefold.deinit(allocator); | ||
| 30 | try casefold.normalize.setup(allocator); | ||
| 31 | casefold.owns_normalize = true; | ||
| 32 | } | ||
| 33 | |||
| 34 | pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { | ||
| 35 | try casefold.setupImpl(allocator); | ||
| 36 | casefold.normalize = norm; | ||
| 37 | casefold.owns_normalize = false; | ||
| 38 | } | ||
| 39 | |||
| 40 | fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void { | ||
| 41 | const decompressor = compress.flate.inflate.decompressor; | ||
| 42 | const in_bytes = @embedFile("fold"); | ||
| 43 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 44 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 45 | var reader = in_decomp.reader(); | ||
| 46 | |||
| 47 | const endian = builtin.cpu.arch.endian(); | ||
| 48 | |||
| 49 | casefold.cutoff = @intCast(try reader.readInt(u24, endian)); | ||
| 50 | casefold.multiple_start = @intCast(try reader.readInt(u24, endian)); | ||
| 51 | |||
| 52 | var len = try reader.readInt(u16, endian); | ||
| 53 | casefold.stage1 = try allocator.alloc(u8, len); | ||
| 54 | errdefer allocator.free(casefold.stage1); | ||
| 55 | for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian); | ||
| 56 | |||
| 57 | len = try reader.readInt(u16, endian); | ||
| 58 | casefold.stage2 = try allocator.alloc(u8, len); | ||
| 59 | errdefer allocator.free(casefold.stage2); | ||
| 60 | for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian); | ||
| 61 | |||
| 62 | len = try reader.readInt(u16, endian); | ||
| 63 | casefold.stage3 = try allocator.alloc(i24, len); | ||
| 64 | errdefer allocator.free(casefold.stage3); | ||
| 65 | for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian); | ||
| 66 | |||
| 67 | casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | ||
| 68 | casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | ||
| 69 | len = try reader.readInt(u16, endian); | ||
| 70 | casefold.cwcf_exceptions = try allocator.alloc(u21, len); | ||
| 71 | errdefer allocator.free(casefold.cwcf_exceptions); | ||
| 72 | for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 73 | } | ||
| 74 | |||
| 75 | pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void { | ||
| 76 | allocator.free(fdata.stage1); | ||
| 77 | allocator.free(fdata.stage2); | ||
| 78 | allocator.free(fdata.stage3); | ||
| 79 | allocator.free(fdata.cwcf_exceptions); | ||
| 80 | if (fdata.owns_normalize) fdata.normalize.deinit(allocator); | ||
| 81 | } | ||
| 82 | |||
| 83 | /// Returns the case fold for `cp`. | ||
| 84 | pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { | ||
| 85 | if (cp >= fdata.cutoff) return &.{}; | ||
| 86 | |||
| 87 | const stage1_val = fdata.stage1[cp >> 8]; | ||
| 88 | if (stage1_val == 0) return &.{}; | ||
| 89 | |||
| 90 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); | ||
| 91 | const stage3_index = fdata.stage2[stage2_index]; | ||
| 92 | |||
| 93 | if (stage3_index & 0x80 != 0) { | ||
| 94 | const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; | ||
| 95 | const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); | ||
| 96 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); | ||
| 97 | |||
| 98 | return buf[0..mapping.len]; | ||
| 99 | } | ||
| 100 | |||
| 101 | const offset = fdata.stage3[stage3_index]; | ||
| 102 | if (offset == 0) return &.{}; | ||
| 103 | |||
| 104 | buf[0] = @intCast(@as(i32, cp) + offset); | ||
| 105 | |||
| 106 | return buf[0..1]; | ||
| 107 | } | ||
| 108 | |||
| 109 | /// Produces the case folded code points for `cps`. Caller must free returned | ||
| 110 | /// slice with `allocator`. | ||
| 111 | pub fn caseFoldAlloc( | ||
| 112 | casefold: *const CaseFolding, | ||
| 113 | allocator: Allocator, | ||
| 114 | cps: []const u21, | ||
| 115 | ) Allocator.Error![]const u21 { | ||
| 116 | var cfcps = std.ArrayList(u21).init(allocator); | ||
| 117 | defer cfcps.deinit(); | ||
| 118 | var buf: [3]u21 = undefined; | ||
| 119 | |||
| 120 | for (cps) |cp| { | ||
| 121 | const cf = casefold.caseFold(cp, &buf); | ||
| 122 | |||
| 123 | if (cf.len == 0) { | ||
| 124 | try cfcps.append(cp); | ||
| 125 | } else { | ||
| 126 | try cfcps.appendSlice(cf); | ||
| 127 | } | ||
| 128 | } | ||
| 129 | |||
| 130 | return try cfcps.toOwnedSlice(); | ||
| 131 | } | ||
| 132 | |||
| 133 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | ||
| 134 | pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { | ||
| 135 | var buf: [3]u21 = undefined; | ||
| 136 | const has_mapping = casefold.caseFold(cp, &buf).len != 0; | ||
| 137 | return has_mapping and !casefold.isCwcfException(cp); | ||
| 138 | } | ||
| 139 | |||
| 140 | pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { | ||
| 141 | return for (cps) |cp| { | ||
| 142 | if (casefold.cpChangesWhenCaseFolded(cp)) break true; | ||
| 143 | } else false; | ||
| 144 | } | ||
| 145 | |||
| 146 | fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { | ||
| 147 | return cp >= casefold.cwcf_exceptions_min and | ||
| 148 | cp <= casefold.cwcf_exceptions_max and | ||
| 149 | std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; | ||
| 150 | } | ||
| 151 | |||
| 152 | /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most | ||
| 153 | /// comprehensive comparison possible, but slower than `canonCaselessMatch`. | ||
| 154 | pub fn compatCaselessMatch( | ||
| 155 | casefold: *const CaseFolding, | ||
| 156 | allocator: Allocator, | ||
| 157 | a: []const u8, | ||
| 158 | b: []const u8, | ||
| 159 | ) Allocator.Error!bool { | ||
| 160 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | ||
| 161 | |||
| 162 | // Process a | ||
| 163 | const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); | ||
| 164 | defer allocator.free(nfd_a); | ||
| 165 | |||
| 166 | var need_free_cf_nfd_a = false; | ||
| 167 | var cf_nfd_a: []const u21 = nfd_a; | ||
| 168 | if (casefold.changesWhenCaseFolded(nfd_a)) { | ||
| 169 | cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); | ||
| 170 | need_free_cf_nfd_a = true; | ||
| 171 | } | ||
| 172 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); | ||
| 173 | |||
| 174 | const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); | ||
| 175 | defer allocator.free(nfkd_cf_nfd_a); | ||
| 176 | const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); | ||
| 177 | defer allocator.free(cf_nfkd_cf_nfd_a); | ||
| 178 | const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); | ||
| 179 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); | ||
| 180 | |||
| 181 | // Process b | ||
| 182 | const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); | ||
| 183 | defer allocator.free(nfd_b); | ||
| 184 | |||
| 185 | var need_free_cf_nfd_b = false; | ||
| 186 | var cf_nfd_b: []const u21 = nfd_b; | ||
| 187 | if (casefold.changesWhenCaseFolded(nfd_b)) { | ||
| 188 | cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); | ||
| 189 | need_free_cf_nfd_b = true; | ||
| 190 | } | ||
| 191 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); | ||
| 192 | |||
| 193 | const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); | ||
| 194 | defer allocator.free(nfkd_cf_nfd_b); | ||
| 195 | const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); | ||
| 196 | defer allocator.free(cf_nfkd_cf_nfd_b); | ||
| 197 | const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); | ||
| 198 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); | ||
| 199 | |||
| 200 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); | ||
| 201 | } | ||
| 202 | |||
| 203 | test "compatCaselessMatch" { | ||
| 204 | const allocator = testing.allocator; | ||
| 205 | |||
| 206 | const caser = try CaseFolding.init(allocator); | ||
| 207 | defer caser.deinit(allocator); | ||
| 208 | |||
| 209 | try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); | ||
| 210 | |||
| 211 | const a = "Héllo World! \u{3d3}"; | ||
| 212 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | ||
| 213 | try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); | ||
| 214 | |||
| 215 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | ||
| 216 | try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); | ||
| 217 | } | ||
| 218 | |||
| 219 | /// Performs canonical caseless string matching by decomposing to NFD. This is | ||
| 220 | /// faster than `compatCaselessMatch`, but less comprehensive. | ||
| 221 | pub fn canonCaselessMatch( | ||
| 222 | casefold: *const CaseFolding, | ||
| 223 | allocator: Allocator, | ||
| 224 | a: []const u8, | ||
| 225 | b: []const u8, | ||
| 226 | ) Allocator.Error!bool { | ||
| 227 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | ||
| 228 | |||
| 229 | // Process a | ||
| 230 | const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); | ||
| 231 | defer allocator.free(nfd_a); | ||
| 232 | |||
| 233 | var need_free_cf_nfd_a = false; | ||
| 234 | var cf_nfd_a: []const u21 = nfd_a; | ||
| 235 | if (casefold.changesWhenCaseFolded(nfd_a)) { | ||
| 236 | cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); | ||
| 237 | need_free_cf_nfd_a = true; | ||
| 238 | } | ||
| 239 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); | ||
| 240 | |||
| 241 | var need_free_nfd_cf_nfd_a = false; | ||
| 242 | var nfd_cf_nfd_a = cf_nfd_a; | ||
| 243 | if (!need_free_cf_nfd_a) { | ||
| 244 | nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); | ||
| 245 | need_free_nfd_cf_nfd_a = true; | ||
| 246 | } | ||
| 247 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); | ||
| 248 | |||
| 249 | // Process b | ||
| 250 | const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); | ||
| 251 | defer allocator.free(nfd_b); | ||
| 252 | |||
| 253 | var need_free_cf_nfd_b = false; | ||
| 254 | var cf_nfd_b: []const u21 = nfd_b; | ||
| 255 | if (casefold.changesWhenCaseFolded(nfd_b)) { | ||
| 256 | cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); | ||
| 257 | need_free_cf_nfd_b = true; | ||
| 258 | } | ||
| 259 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); | ||
| 260 | |||
| 261 | var need_free_nfd_cf_nfd_b = false; | ||
| 262 | var nfd_cf_nfd_b = cf_nfd_b; | ||
| 263 | if (!need_free_cf_nfd_b) { | ||
| 264 | nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); | ||
| 265 | need_free_nfd_cf_nfd_b = true; | ||
| 266 | } | ||
| 267 | defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); | ||
| 268 | |||
| 269 | return mem.eql(u21, nfd_cf_nfd_a, nfd_cf_nfd_b); | ||
| 270 | } | ||
| 271 | |||
| 272 | test "canonCaselessMatch" { | ||
| 273 | const allocator = testing.allocator; | ||
| 274 | |||
| 275 | const caser = try CaseFolding.init(allocator); | ||
| 276 | defer caser.deinit(allocator); | ||
| 277 | |||
| 278 | try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); | ||
| 279 | |||
| 280 | const a = "Héllo World! \u{3d3}"; | ||
| 281 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | ||
| 282 | try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); | ||
| 283 | |||
| 284 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | ||
| 285 | try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); | ||
| 286 | } | ||
| 287 | |||
| 288 | fn testAllocations(allocator: Allocator) !void { | ||
| 289 | // With normalize provided | ||
| 290 | { | ||
| 291 | const normalize = try Normalize.init(allocator); | ||
| 292 | defer normalize.deinit(allocator); | ||
| 293 | const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); | ||
| 294 | defer caser1.deinit(allocator); | ||
| 295 | } | ||
| 296 | // With normalize owned | ||
| 297 | { | ||
| 298 | const caser2 = try CaseFolding.init(allocator); | ||
| 299 | defer caser2.deinit(allocator); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | // test "Allocation Failures" { | ||
| 304 | // if (true) return error.SkipZigTest; // XXX: remove | ||
| 305 | // try testing.checkAllAllocationFailures( | ||
| 306 | // testing.allocator, | ||
| 307 | // testAllocations, | ||
| 308 | // .{}, | ||
| 309 | // ); | ||
| 310 | // } | ||
| 311 | |||
| 312 | const std = @import("std"); | ||
| 313 | const builtin = @import("builtin"); | ||
| 314 | const mem = std.mem; | ||
| 315 | const testing = std.testing; | ||
| 316 | const Allocator = mem.Allocator; | ||
| 317 | |||
| 318 | const ascii = @import("ascii"); | ||
| 319 | const Normalize = @import("Normalize"); | ||
| 320 | |||
| 321 | const compress = std.compress; | ||