summaryrefslogtreecommitdiff
path: root/src/CaseFold.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/CaseFold.zig')
-rw-r--r--src/CaseFold.zig256
1 files changed, 195 insertions, 61 deletions
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index 6490aea..162e82f 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -1,28 +1,124 @@
1const std = @import("std"); 1cutoff: u21 = undefined,
2const mem = std.mem; 2cwcf_exceptions_min: u21 = undefined,
3const testing = std.testing; 3cwcf_exceptions_max: u21 = undefined,
4cwcf_exceptions: []u21 = undefined,
5multiple_start: u21 = undefined,
6stage1: []u8 = undefined,
7stage2: []u8 = undefined,
8stage3: []i24 = undefined,
9normalize: Normalize,
10owns_normalize: bool,
11
12const CaseFolding = @This();
13
14pub fn init(allocator: Allocator) !CaseFolding {
15 var case_fold: CaseFolding = undefined;
16 try case_fold.setup(allocator);
17 return case_fold;
18}
4 19
5const ascii = @import("ascii"); 20pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding {
6pub const FoldData = @import("FoldData"); 21 var casefold: CaseFolding = undefined;
7const Normalize = @import("Normalize"); 22 try casefold.setupWithNormalize(allocator, norm);
23 return casefold;
24}
25
26pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void {
27 try casefold.setupImpl(allocator);
28 casefold.owns_normalize = false;
29 errdefer casefold.deinit(allocator);
30 try casefold.normalize.setup(allocator);
31 casefold.owns_normalize = true;
32}
33
34pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void {
35 try casefold.setupImpl(allocator);
36 casefold.normalize = norm;
37 casefold.owns_normalize = false;
38}
8 39
9fold_data: *const FoldData, 40fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void {
41 const decompressor = compress.flate.inflate.decompressor;
42 const in_bytes = @embedFile("fold");
43 var in_fbs = std.io.fixedBufferStream(in_bytes);
44 var in_decomp = decompressor(.raw, in_fbs.reader());
45 var reader = in_decomp.reader();
46
47 const endian = builtin.cpu.arch.endian();
48
49 casefold.cutoff = @intCast(try reader.readInt(u24, endian));
50 casefold.multiple_start = @intCast(try reader.readInt(u24, endian));
51
52 var len = try reader.readInt(u16, endian);
53 casefold.stage1 = try allocator.alloc(u8, len);
54 errdefer allocator.free(casefold.stage1);
55 for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian);
56
57 len = try reader.readInt(u16, endian);
58 casefold.stage2 = try allocator.alloc(u8, len);
59 errdefer allocator.free(casefold.stage2);
60 for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian);
61
62 len = try reader.readInt(u16, endian);
63 casefold.stage3 = try allocator.alloc(i24, len);
64 errdefer allocator.free(casefold.stage3);
65 for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian);
66
67 casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
68 casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
69 len = try reader.readInt(u16, endian);
70 casefold.cwcf_exceptions = try allocator.alloc(u21, len);
71 errdefer allocator.free(casefold.cwcf_exceptions);
72 for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
73}
74
75pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void {
76 allocator.free(fdata.stage1);
77 allocator.free(fdata.stage2);
78 allocator.free(fdata.stage3);
79 allocator.free(fdata.cwcf_exceptions);
80 if (fdata.owns_normalize) fdata.normalize.deinit(allocator);
81}
82
83/// Returns the case fold for `cp`.
84pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 {
85 if (cp >= fdata.cutoff) return &.{};
86
87 const stage1_val = fdata.stage1[cp >> 8];
88 if (stage1_val == 0) return &.{};
89
90 const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
91 const stage3_index = fdata.stage2[stage2_index];
10 92
11const Self = @This(); 93 if (stage3_index & 0x80 != 0) {
94 const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3;
95 const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0);
96 for (mapping, 0..) |c, i| buf[i] = @intCast(c);
97
98 return buf[0..mapping.len];
99 }
100
101 const offset = fdata.stage3[stage3_index];
102 if (offset == 0) return &.{};
103
104 buf[0] = @intCast(@as(i32, cp) + offset);
105
106 return buf[0..1];
107}
12 108
13/// Produces the case folded code points for `cps`. Caller must free returned 109/// Produces the case folded code points for `cps`. Caller must free returned
14/// slice with `allocator`. 110/// slice with `allocator`.
15pub fn caseFold( 111pub fn caseFoldAlloc(
16 self: Self, 112 casefold: *const CaseFolding,
17 allocator: mem.Allocator, 113 allocator: Allocator,
18 cps: []const u21, 114 cps: []const u21,
19) ![]const u21 { 115) Allocator.Error![]const u21 {
20 var cfcps = std.ArrayList(u21).init(allocator); 116 var cfcps = std.ArrayList(u21).init(allocator);
21 defer cfcps.deinit(); 117 defer cfcps.deinit();
22 var buf: [3]u21 = undefined; 118 var buf: [3]u21 = undefined;
23 119
24 for (cps) |cp| { 120 for (cps) |cp| {
25 const cf = self.fold_data.caseFold(cp, &buf); 121 const cf = casefold.caseFold(cp, &buf);
26 122
27 if (cf.len == 0) { 123 if (cf.len == 0) {
28 try cfcps.append(cp); 124 try cfcps.append(cp);
@@ -34,59 +130,71 @@ pub fn caseFold(
34 return try cfcps.toOwnedSlice(); 130 return try cfcps.toOwnedSlice();
35} 131}
36 132
37fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { 133/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
134pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool {
135 var buf: [3]u21 = undefined;
136 const has_mapping = casefold.caseFold(cp, &buf).len != 0;
137 return has_mapping and !casefold.isCwcfException(cp);
138}
139
140pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool {
38 return for (cps) |cp| { 141 return for (cps) |cp| {
39 if (self.fold_data.changesWhenCaseFolded(cp)) break true; 142 if (casefold.cpChangesWhenCaseFolded(cp)) break true;
40 } else false; 143 } else false;
41} 144}
42 145
146fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool {
147 return cp >= casefold.cwcf_exceptions_min and
148 cp <= casefold.cwcf_exceptions_max and
149 std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null;
150}
151
43/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most 152/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
44/// comprehensive comparison possible, but slower than `canonCaselessMatch`. 153/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
45pub fn compatCaselessMatch( 154pub fn compatCaselessMatch(
46 self: Self, 155 casefold: *const CaseFolding,
47 allocator: mem.Allocator, 156 allocator: Allocator,
48 normalizer: *const Normalize,
49 a: []const u8, 157 a: []const u8,
50 b: []const u8, 158 b: []const u8,
51) !bool { 159) Allocator.Error!bool {
52 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 160 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
53 161
54 // Process a 162 // Process a
55 const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); 163 const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd);
56 defer allocator.free(nfd_a); 164 defer allocator.free(nfd_a);
57 165
58 var need_free_cf_nfd_a = false; 166 var need_free_cf_nfd_a = false;
59 var cf_nfd_a: []const u21 = nfd_a; 167 var cf_nfd_a: []const u21 = nfd_a;
60 if (self.changesWhenCaseFolded(nfd_a)) { 168 if (casefold.changesWhenCaseFolded(nfd_a)) {
61 cf_nfd_a = try self.caseFold(allocator, nfd_a); 169 cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a);
62 need_free_cf_nfd_a = true; 170 need_free_cf_nfd_a = true;
63 } 171 }
64 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); 172 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
65 173
66 const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a); 174 const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a);
67 defer allocator.free(nfkd_cf_nfd_a); 175 defer allocator.free(nfkd_cf_nfd_a);
68 const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); 176 const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
69 defer allocator.free(cf_nfkd_cf_nfd_a); 177 defer allocator.free(cf_nfkd_cf_nfd_a);
70 const nfkd_cf_nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); 178 const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
71 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); 179 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
72 180
73 // Process b 181 // Process b
74 const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); 182 const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd);
75 defer allocator.free(nfd_b); 183 defer allocator.free(nfd_b);
76 184
77 var need_free_cf_nfd_b = false; 185 var need_free_cf_nfd_b = false;
78 var cf_nfd_b: []const u21 = nfd_b; 186 var cf_nfd_b: []const u21 = nfd_b;
79 if (self.changesWhenCaseFolded(nfd_b)) { 187 if (casefold.changesWhenCaseFolded(nfd_b)) {
80 cf_nfd_b = try self.caseFold(allocator, nfd_b); 188 cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b);
81 need_free_cf_nfd_b = true; 189 need_free_cf_nfd_b = true;
82 } 190 }
83 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); 191 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
84 192
85 const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b); 193 const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b);
86 defer allocator.free(nfkd_cf_nfd_b); 194 defer allocator.free(nfkd_cf_nfd_b);
87 const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); 195 const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
88 defer allocator.free(cf_nfkd_cf_nfd_b); 196 defer allocator.free(cf_nfkd_cf_nfd_b);
89 const nfkd_cf_nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); 197 const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
90 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); 198 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
91 199
92 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); 200 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
@@ -95,42 +203,37 @@ pub fn compatCaselessMatch(
95test "compatCaselessMatch" { 203test "compatCaselessMatch" {
96 const allocator = testing.allocator; 204 const allocator = testing.allocator;
97 205
98 const n = try Normalize.init(allocator); 206 const caser = try CaseFolding.init(allocator);
99 defer n.deinit(allocator); 207 defer caser.deinit(allocator);
100 208
101 const fold_data = try FoldData.init(allocator); 209 try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
102 defer fold_data.deinit(allocator);
103 const caser = Self{ .fold_data = &fold_data };
104
105 try testing.expect(try caser.compatCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));
106 210
107 const a = "Héllo World! \u{3d3}"; 211 const a = "Héllo World! \u{3d3}";
108 const b = "He\u{301}llo World! \u{3a5}\u{301}"; 212 const b = "He\u{301}llo World! \u{3a5}\u{301}";
109 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, b)); 213 try testing.expect(try caser.compatCaselessMatch(allocator, a, b));
110 214
111 const c = "He\u{301}llo World! \u{3d2}\u{301}"; 215 const c = "He\u{301}llo World! \u{3d2}\u{301}";
112 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); 216 try testing.expect(try caser.compatCaselessMatch(allocator, a, c));
113} 217}
114 218
115/// Performs canonical caseless string matching by decomposing to NFD. This is 219/// Performs canonical caseless string matching by decomposing to NFD. This is
116/// faster than `compatCaselessMatch`, but less comprehensive. 220/// faster than `compatCaselessMatch`, but less comprehensive.
117pub fn canonCaselessMatch( 221pub fn canonCaselessMatch(
118 self: Self, 222 casefold: *const CaseFolding,
119 allocator: mem.Allocator, 223 allocator: Allocator,
120 normalizer: *const Normalize,
121 a: []const u8, 224 a: []const u8,
122 b: []const u8, 225 b: []const u8,
123) !bool { 226) Allocator.Error!bool {
124 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 227 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
125 228
126 // Process a 229 // Process a
127 const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); 230 const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd);
128 defer allocator.free(nfd_a); 231 defer allocator.free(nfd_a);
129 232
130 var need_free_cf_nfd_a = false; 233 var need_free_cf_nfd_a = false;
131 var cf_nfd_a: []const u21 = nfd_a; 234 var cf_nfd_a: []const u21 = nfd_a;
132 if (self.changesWhenCaseFolded(nfd_a)) { 235 if (casefold.changesWhenCaseFolded(nfd_a)) {
133 cf_nfd_a = try self.caseFold(allocator, nfd_a); 236 cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a);
134 need_free_cf_nfd_a = true; 237 need_free_cf_nfd_a = true;
135 } 238 }
136 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); 239 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
@@ -138,19 +241,19 @@ pub fn canonCaselessMatch(
138 var need_free_nfd_cf_nfd_a = false; 241 var need_free_nfd_cf_nfd_a = false;
139 var nfd_cf_nfd_a = cf_nfd_a; 242 var nfd_cf_nfd_a = cf_nfd_a;
140 if (!need_free_cf_nfd_a) { 243 if (!need_free_cf_nfd_a) {
141 nfd_cf_nfd_a = try normalizer.nfdCodePoints(allocator, cf_nfd_a); 244 nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a);
142 need_free_nfd_cf_nfd_a = true; 245 need_free_nfd_cf_nfd_a = true;
143 } 246 }
144 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); 247 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
145 248
146 // Process b 249 // Process b
147 const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); 250 const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd);
148 defer allocator.free(nfd_b); 251 defer allocator.free(nfd_b);
149 252
150 var need_free_cf_nfd_b = false; 253 var need_free_cf_nfd_b = false;
151 var cf_nfd_b: []const u21 = nfd_b; 254 var cf_nfd_b: []const u21 = nfd_b;
152 if (self.changesWhenCaseFolded(nfd_b)) { 255 if (casefold.changesWhenCaseFolded(nfd_b)) {
153 cf_nfd_b = try self.caseFold(allocator, nfd_b); 256 cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b);
154 need_free_cf_nfd_b = true; 257 need_free_cf_nfd_b = true;
155 } 258 }
156 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); 259 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
@@ -158,7 +261,7 @@ pub fn canonCaselessMatch(
158 var need_free_nfd_cf_nfd_b = false; 261 var need_free_nfd_cf_nfd_b = false;
159 var nfd_cf_nfd_b = cf_nfd_b; 262 var nfd_cf_nfd_b = cf_nfd_b;
160 if (!need_free_cf_nfd_b) { 263 if (!need_free_cf_nfd_b) {
161 nfd_cf_nfd_b = try normalizer.nfdCodePoints(allocator, cf_nfd_b); 264 nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b);
162 need_free_nfd_cf_nfd_b = true; 265 need_free_nfd_cf_nfd_b = true;
163 } 266 }
164 defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); 267 defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);
@@ -169,19 +272,50 @@ pub fn canonCaselessMatch(
169test "canonCaselessMatch" { 272test "canonCaselessMatch" {
170 const allocator = testing.allocator; 273 const allocator = testing.allocator;
171 274
172 const n = try Normalize.init(allocator); 275 const caser = try CaseFolding.init(allocator);
173 defer n.deinit(allocator); 276 defer caser.deinit(allocator);
174
175 const fold_data = try FoldData.init(allocator);
176 defer fold_data.deinit(allocator);
177 const caser = Self{ .fold_data = &fold_data };
178 277
179 try testing.expect(try caser.canonCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!")); 278 try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
180 279
181 const a = "Héllo World! \u{3d3}"; 280 const a = "Héllo World! \u{3d3}";
182 const b = "He\u{301}llo World! \u{3a5}\u{301}"; 281 const b = "He\u{301}llo World! \u{3a5}\u{301}";
183 try testing.expect(!try caser.canonCaselessMatch(allocator, &n, a, b)); 282 try testing.expect(!try caser.canonCaselessMatch(allocator, a, b));
184 283
185 const c = "He\u{301}llo World! \u{3d2}\u{301}"; 284 const c = "He\u{301}llo World! \u{3d2}\u{301}";
186 try testing.expect(try caser.canonCaselessMatch(allocator, &n, a, c)); 285 try testing.expect(try caser.canonCaselessMatch(allocator, a, c));
187} 286}
287
288fn testAllocations(allocator: Allocator) !void {
289 // With normalize provided
290 {
291 const normalize = try Normalize.init(allocator);
292 defer normalize.deinit(allocator);
293 const caser1 = try CaseFolding.initWithNormalize(allocator, normalize);
294 defer caser1.deinit(allocator);
295 }
296 // With normalize owned
297 {
298 const caser2 = try CaseFolding.init(allocator);
299 defer caser2.deinit(allocator);
300 }
301}
302
303// test "Allocation Failures" {
304// if (true) return error.SkipZigTest; // XXX: remove
305// try testing.checkAllAllocationFailures(
306// testing.allocator,
307// testAllocations,
308// .{},
309// );
310// }
311
312const std = @import("std");
313const builtin = @import("builtin");
314const mem = std.mem;
315const testing = std.testing;
316const Allocator = mem.Allocator;
317
318const ascii = @import("ascii");
319const Normalize = @import("Normalize");
320
321const compress = std.compress;