summaryrefslogtreecommitdiff
path: root/src/CaseFold.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-04-30 15:32:34 -0400
committerGravatar Sam Atman2025-04-30 15:32:34 -0400
commit958c13ba442e7077a50d7163fdeb9bba378f95c2 (patch)
tree0727fd03ea2344ebbad842daa05b55ea0a143a6c /src/CaseFold.zig
parentRemove FoldData, make CaseFolding (diff)
downloadzg-958c13ba442e7077a50d7163fdeb9bba378f95c2.tar.gz
zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.tar.xz
zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.zip
Rest of the Renamings
These get different names, but don't otherwise change.
Diffstat (limited to 'src/CaseFold.zig')
-rw-r--r--src/CaseFold.zig321
1 file changed, 0 insertions, 321 deletions
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
deleted file mode 100644
index 162e82f..0000000
--- a/src/CaseFold.zig
+++ /dev/null
@@ -1,321 +0,0 @@
// Multi-stage case-fold lookup tables, decompressed from the embedded "fold"
// data by setupImpl. All fields start undefined and are populated by
// setup/setupWithNormalize.

// Code points at or above this value have no fold mapping (see caseFold).
cutoff: u21 = undefined,
// Inclusive bounds of the "changes when case folded" exception list,
// checked before the linear scan in isCwcfException.
cwcf_exceptions_min: u21 = undefined,
cwcf_exceptions_max: u21 = undefined,
// Code points excluded by cpChangesWhenCaseFolded despite having a mapping.
cwcf_exceptions: []u21 = undefined,
// Offset into stage3 where multi-code-point mappings begin (see caseFold).
multiple_start: u21 = undefined,
// stage1 is indexed by cp >> 8; stage2 by block * 256 + low byte; stage3
// holds either a signed delta or a zero-terminated multi-cp mapping.
stage1: []u8 = undefined,
stage2: []u8 = undefined,
stage3: []i24 = undefined,
// Normalizer used by the caseless-match functions.
normalize: Normalize,
// True when `normalize` was created by setup() and must be freed by deinit.
owns_normalize: bool,

const CaseFolding = @This();
13
/// Creates a fully-initialized `CaseFolding`, including its own owned
/// `Normalize` instance. Release with `deinit`.
pub fn init(allocator: Allocator) !CaseFolding {
    var result: CaseFolding = undefined;
    try result.setup(allocator);
    return result;
}
19
/// Creates a `CaseFolding` that borrows the caller-provided `Normalize`;
/// the caller remains responsible for deiniting `norm`.
pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding {
    var result: CaseFolding = undefined;
    try result.setupWithNormalize(allocator, norm);
    return result;
}
25
/// Initializes `casefold` in place, creating and owning a `Normalize`.
/// On failure, everything acquired so far is released.
pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void {
    try casefold.setupImpl(allocator);
    // Set false BEFORE the errdefer: if normalize.setup below fails, the
    // deinit in the errdefer must not touch the uninitialized `normalize`.
    casefold.owns_normalize = false;
    errdefer casefold.deinit(allocator);
    try casefold.normalize.setup(allocator);
    // Only now is there a live Normalize that deinit must release.
    casefold.owns_normalize = true;
}
33
/// Initializes `casefold` in place using the caller's `norm`.
/// Ownership of `norm` stays with the caller (`owns_normalize` = false),
/// so `deinit` will not release it.
pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void {
    try casefold.setupImpl(allocator);
    casefold.owns_normalize = false;
    casefold.normalize = norm;
}
39
/// Decompresses the embedded "fold" table and fills every lookup field
/// except `normalize`/`owns_normalize`. Slices allocated here are freed on
/// error by the errdefers; on success the caller frees them via deinit.
fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("fold");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    // NOTE(review): native endianness is used to read the table; this
    // presumes the generator wrote it in the same byte order — confirm
    // against the "fold" data generator.
    const endian = builtin.cpu.arch.endian();

    // Header: fold cutoff and the start of multi-code-point mappings.
    casefold.cutoff = @intCast(try reader.readInt(u24, endian));
    casefold.multiple_start = @intCast(try reader.readInt(u24, endian));

    // Each table is a u16 length followed by that many elements.
    var len = try reader.readInt(u16, endian);
    casefold.stage1 = try allocator.alloc(u8, len);
    errdefer allocator.free(casefold.stage1);
    for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian);

    len = try reader.readInt(u16, endian);
    casefold.stage2 = try allocator.alloc(u8, len);
    errdefer allocator.free(casefold.stage2);
    for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian);

    len = try reader.readInt(u16, endian);
    casefold.stage3 = try allocator.alloc(i24, len);
    errdefer allocator.free(casefold.stage3);
    for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian);

    // Trailer: "changes when case folded" exception range and list.
    casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
    casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
    len = try reader.readInt(u16, endian);
    casefold.cwcf_exceptions = try allocator.alloc(u21, len);
    errdefer allocator.free(casefold.cwcf_exceptions);
    for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
}
74
/// Frees all table slices and, when owned, the `Normalize` instance.
/// Consistency fix: the receiver was still named `fdata` (pre-rename) and
/// the allocator was typed `mem.Allocator` while the rest of the file uses
/// the `casefold` receiver and the `Allocator` alias.
pub fn deinit(casefold: *const CaseFolding, allocator: Allocator) void {
    allocator.free(casefold.stage1);
    allocator.free(casefold.stage2);
    allocator.free(casefold.stage3);
    allocator.free(casefold.cwcf_exceptions);
    // A borrowed normalizer (setupWithNormalize) belongs to the caller;
    // only release one created by setup().
    if (casefold.owns_normalize) casefold.normalize.deinit(allocator);
}
82
/// Returns the case fold for `cp`, written into `buf`, or an empty slice
/// when `cp` folds to itself. `buf` must hold at least 3 code points (the
/// longest multi-code-point mapping stored in stage3).
/// Consistency fix only: receiver renamed `fdata` -> `casefold` to match
/// the rest of the file; logic unchanged.
pub fn caseFold(casefold: *const CaseFolding, cp: u21, buf: []u21) []const u21 {
    // Code points at or above the table cutoff never change under folding.
    if (cp >= casefold.cutoff) return &.{};

    // Stage 1 maps the high byte of cp to a stage-2 block; 0 means no code
    // point in this 256-wide block has a mapping.
    const stage1_val = casefold.stage1[cp >> 8];
    if (stage1_val == 0) return &.{};

    const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
    const stage3_index = casefold.stage2[stage2_index];

    // High bit set marks a multi-code-point mapping: up to three entries,
    // zero-terminated, stored in stage3 starting at multiple_start.
    if (stage3_index & 0x80 != 0) {
        const real_index = @as(usize, casefold.multiple_start) + (stage3_index ^ 0x80) * 3;
        const mapping = mem.sliceTo(casefold.stage3[real_index..][0..3], 0);
        for (mapping, 0..) |c, i| buf[i] = @intCast(c);

        return buf[0..mapping.len];
    }

    // Otherwise stage3 holds a signed delta; 0 means cp folds to itself.
    const offset = casefold.stage3[stage3_index];
    if (offset == 0) return &.{};

    buf[0] = @intCast(@as(i32, cp) + offset);

    return buf[0..1];
}
108
/// Case folds every code point in `cps`, expanding multi-code-point folds
/// in place. Caller owns the returned slice and must free it with
/// `allocator`.
pub fn caseFoldAlloc(
    casefold: *const CaseFolding,
    allocator: Allocator,
    cps: []const u21,
) Allocator.Error![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    defer folded.deinit();
    var scratch: [3]u21 = undefined;

    for (cps) |cp| {
        const mapping = casefold.caseFold(cp, &scratch);
        if (mapping.len != 0) {
            // cp has a fold; append its (possibly multi-cp) expansion.
            try folded.appendSlice(mapping);
        } else {
            // cp folds to itself.
            try folded.append(cp);
        }
    }

    return try folded.toOwnedSlice();
}
132
/// Returns true when `cp` has a case fold mapping and is not on the CWCF
/// exception list, i.e. caseFold(NFD(`cp`)) != NFD(`cp`).
pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool {
    var scratch: [3]u21 = undefined;
    if (casefold.caseFold(cp, &scratch).len == 0) return false;
    return !casefold.isCwcfException(cp);
}
139
/// Returns true when any code point in `cps` changes under case folding.
pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool {
    for (cps) |cp| {
        if (casefold.cpChangesWhenCaseFolded(cp)) return true;
    }
    return false;
}
145
/// Returns true when `cp` is on the "changes when case folded" exception
/// list. The min/max bounds act as a cheap pre-filter before the scan.
fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool {
    if (cp < casefold.cwcf_exceptions_min) return false;
    if (cp > casefold.cwcf_exceptions_max) return false;
    return std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null;
}
151
/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
/// Refactor: the a/b pipelines were duplicated line-for-line; the shared
/// NFKD(fold(NFKD(fold(NFD(s))))) pipeline is now in compatFoldForMatch.
pub fn compatCaselessMatch(
    casefold: *const CaseFolding,
    allocator: Allocator,
    a: []const u8,
    b: []const u8,
) Allocator.Error!bool {
    // ASCII-only strings need none of the Unicode machinery.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const folded_a = try casefold.compatFoldForMatch(allocator, a);
    defer allocator.free(folded_a);
    const folded_b = try casefold.compatFoldForMatch(allocator, b);
    defer allocator.free(folded_b);

    return mem.eql(u21, folded_a, folded_b);
}

/// Computes NFKD(caseFold(NFKD(caseFold(NFD(s))))) for one string.
/// Caller owns the returned slice; all intermediates are freed here.
fn compatFoldForMatch(
    casefold: *const CaseFolding,
    allocator: Allocator,
    s: []const u8,
) Allocator.Error![]const u21 {
    const nfd = try casefold.normalize.nfxdCodePoints(allocator, s, .nfd);
    defer allocator.free(nfd);

    // Skip the fold allocation entirely when folding wouldn't change
    // anything; in that case cf_nfd aliases nfd and must not be freed.
    var owns_cf_nfd = false;
    var cf_nfd: []const u21 = nfd;
    if (casefold.changesWhenCaseFolded(nfd)) {
        cf_nfd = try casefold.caseFoldAlloc(allocator, nfd);
        owns_cf_nfd = true;
    }
    defer if (owns_cf_nfd) allocator.free(cf_nfd);

    const nfkd_cf_nfd = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd);
    defer allocator.free(nfkd_cf_nfd);
    const cf_nfkd_cf_nfd = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd);
    defer allocator.free(cf_nfkd_cf_nfd);

    return try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd);
}
202
test "compatCaselessMatch" {
    const allocator = testing.allocator;

    const caser = try CaseFolding.init(allocator);
    defer caser.deinit(allocator);

    // ASCII fast path: plain case-insensitive compare.
    try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!"));

    // U+03D3 (upsilon with acute and hook symbol) matches U+03A5 U+0301
    // only through compatibility (NFKD) decomposition.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(try caser.compatCaselessMatch(allocator, a, b));

    // U+03D2 U+0301 is the canonical decomposition of U+03D3.
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try caser.compatCaselessMatch(allocator, a, c));
}
218
/// Performs canonical caseless string matching by decomposing to NFD. This is
/// faster than `compatCaselessMatch`, but less comprehensive.
/// Implements NFD(caseFold(NFD(X))) == NFD(caseFold(NFD(Y))) per Unicode
/// definition D145.
pub fn canonCaselessMatch(
    casefold: *const CaseFolding,
    allocator: Allocator,
    a: []const u8,
    b: []const u8,
) Allocator.Error!bool {
    // ASCII-only strings need none of the Unicode machinery.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const folded_a = try casefold.canonFoldForMatch(allocator, a);
    defer allocator.free(folded_a);
    const folded_b = try casefold.canonFoldForMatch(allocator, b);
    defer allocator.free(folded_b);

    return mem.eql(u21, folded_a, folded_b);
}

/// Computes NFD(caseFold(NFD(s))) for one string; caller owns the result.
/// BUG FIX: the original re-normalized only when case folding had NOT
/// changed the string (`if (!need_free_cf_nfd_a)`) — i.e. exactly when the
/// data was already NFD — and skipped the required final NFD pass when
/// folding HAD changed it. Case folding does not preserve NFD in general,
/// so the fold result must be decomposed again.
fn canonFoldForMatch(
    casefold: *const CaseFolding,
    allocator: Allocator,
    s: []const u8,
) Allocator.Error![]const u21 {
    const nfd = try casefold.normalize.nfxdCodePoints(allocator, s, .nfd);

    // Folding changes nothing: `nfd` is already the final form; hand it to
    // the caller directly.
    if (!casefold.changesWhenCaseFolded(nfd)) return nfd;
    defer allocator.free(nfd);

    const cf_nfd = try casefold.caseFoldAlloc(allocator, nfd);
    defer allocator.free(cf_nfd);

    return try casefold.normalize.nfdCodePoints(allocator, cf_nfd);
}
271
test "canonCaselessMatch" {
    const allocator = testing.allocator;

    const caser = try CaseFolding.init(allocator);
    defer caser.deinit(allocator);

    // ASCII fast path: plain case-insensitive compare.
    try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!"));

    // U+03D3 vs U+03A5 U+0301 are only compatibility-equivalent, so the
    // canonical-only match must reject them (compare compatCaselessMatch).
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(!try caser.canonCaselessMatch(allocator, a, b));

    // U+03D2 U+0301 is canonically equivalent to U+03D3, so this matches.
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try caser.canonCaselessMatch(allocator, a, c));
}
287
/// Exercises both construction paths so allocation-failure testing can
/// cover every allocation site.
fn testAllocations(allocator: Allocator) !void {
    // Borrowed Normalize: caller deinits the normalizer and the folder
    // independently.
    {
        const normalize = try Normalize.init(allocator);
        defer normalize.deinit(allocator);
        const borrowed = try CaseFolding.initWithNormalize(allocator, normalize);
        defer borrowed.deinit(allocator);
    }
    // Owned Normalize: deinit releases the normalizer too.
    {
        const owned = try CaseFolding.init(allocator);
        defer owned.deinit(allocator);
    }
}
302
303// test "Allocation Failures" {
304// if (true) return error.SkipZigTest; // XXX: remove
305// try testing.checkAllAllocationFailures(
306// testing.allocator,
307// testAllocations,
308// .{},
309// );
310// }
311
312const std = @import("std");
313const builtin = @import("builtin");
314const mem = std.mem;
315const testing = std.testing;
316const Allocator = mem.Allocator;
317
318const ascii = @import("ascii");
319const Normalize = @import("Normalize");
320
321const compress = std.compress;