summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-04-30 15:23:14 -0400
committerGravatar Sam Atman2025-04-30 15:23:14 -0400
commit312ca415bb01212a320acacda743896ed59a7b82 (patch)
tree8568a7af7084e5df6805388afc4f4008b8205c6c /src
parentMerge NormData with Normalize (diff)
downloadzg-312ca415bb01212a320acacda743896ed59a7b82.tar.gz
zg-312ca415bb01212a320acacda743896ed59a7b82.tar.xz
zg-312ca415bb01212a320acacda743896ed59a7b82.zip
Remove FoldData, make CaseFolding
CaseFolding now has the FoldData, and can be initialized with a copy of Normalize if wanted.
Diffstat (limited to 'src')
-rw-r--r--src/CanonData.zig14
-rw-r--r--src/CaseFold.zig256
-rw-r--r--src/FoldData.zig99
-rw-r--r--src/Normalize.zig16
4 files changed, 218 insertions, 167 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig
index c67d1d6..d95a5be 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -17,11 +17,11 @@ pub fn init(allocator: mem.Allocator) !CanonData {
17 .nfc = .empty, 17 .nfc = .empty,
18 .nfd = try allocator.alloc([]u21, 0x110000), 18 .nfd = try allocator.alloc([]u21, 0x110000),
19 }; 19 };
20 var _cp: u24 = undefined;
20 21
21 var slices: usize = 0;
22 errdefer { 22 errdefer {
23 cdata.nfc.deinit(allocator); 23 cdata.nfc.deinit(allocator);
24 for (cdata.nfd[0..slices]) |slice| allocator.free(slice); 24 for (cdata.nfd[0.._cp]) |slice| allocator.free(slice);
25 allocator.free(cdata.nfd); 25 allocator.free(cdata.nfd);
26 } 26 }
27 27
@@ -31,14 +31,16 @@ pub fn init(allocator: mem.Allocator) !CanonData {
31 const len: u8 = try reader.readInt(u8, endian); 31 const len: u8 = try reader.readInt(u8, endian);
32 if (len == 0) break; 32 if (len == 0) break;
33 const cp = try reader.readInt(u24, endian); 33 const cp = try reader.readInt(u24, endian);
34 cdata.nfd[cp] = try allocator.alloc(u21, len - 1); 34 _cp = cp;
35 slices += 1; 35 const nfd_cp = try allocator.alloc(u21, len - 1);
36 errdefer allocator.free(nfd_cp);
36 for (0..len - 1) |i| { 37 for (0..len - 1) |i| {
37 cdata.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); 38 nfd_cp[i] = @intCast(try reader.readInt(u24, endian));
38 } 39 }
39 if (len == 3) { 40 if (len == 3) {
40 try cdata.nfc.put(allocator, cdata.nfd[cp][0..2].*, @intCast(cp)); 41 try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp));
41 } 42 }
43 cdata.nfd[cp] = nfd_cp;
42 } 44 }
43 45
44 return cdata; 46 return cdata;
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index 6490aea..162e82f 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -1,28 +1,124 @@
1const std = @import("std"); 1cutoff: u21 = undefined,
2const mem = std.mem; 2cwcf_exceptions_min: u21 = undefined,
3const testing = std.testing; 3cwcf_exceptions_max: u21 = undefined,
4cwcf_exceptions: []u21 = undefined,
5multiple_start: u21 = undefined,
6stage1: []u8 = undefined,
7stage2: []u8 = undefined,
8stage3: []i24 = undefined,
9normalize: Normalize,
10owns_normalize: bool,
11
12const CaseFolding = @This();
13
14pub fn init(allocator: Allocator) !CaseFolding {
15 var case_fold: CaseFolding = undefined;
16 try case_fold.setup(allocator);
17 return case_fold;
18}
4 19
5const ascii = @import("ascii"); 20pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding {
6pub const FoldData = @import("FoldData"); 21 var casefold: CaseFolding = undefined;
7const Normalize = @import("Normalize"); 22 try casefold.setupWithNormalize(allocator, norm);
23 return casefold;
24}
25
26pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void {
27 try casefold.setupImpl(allocator);
28 casefold.owns_normalize = false;
29 errdefer casefold.deinit(allocator);
30 try casefold.normalize.setup(allocator);
31 casefold.owns_normalize = true;
32}
33
34pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void {
35 try casefold.setupImpl(allocator);
36 casefold.normalize = norm;
37 casefold.owns_normalize = false;
38}
8 39
9fold_data: *const FoldData, 40fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void {
41 const decompressor = compress.flate.inflate.decompressor;
42 const in_bytes = @embedFile("fold");
43 var in_fbs = std.io.fixedBufferStream(in_bytes);
44 var in_decomp = decompressor(.raw, in_fbs.reader());
45 var reader = in_decomp.reader();
46
47 const endian = builtin.cpu.arch.endian();
48
49 casefold.cutoff = @intCast(try reader.readInt(u24, endian));
50 casefold.multiple_start = @intCast(try reader.readInt(u24, endian));
51
52 var len = try reader.readInt(u16, endian);
53 casefold.stage1 = try allocator.alloc(u8, len);
54 errdefer allocator.free(casefold.stage1);
55 for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian);
56
57 len = try reader.readInt(u16, endian);
58 casefold.stage2 = try allocator.alloc(u8, len);
59 errdefer allocator.free(casefold.stage2);
60 for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian);
61
62 len = try reader.readInt(u16, endian);
63 casefold.stage3 = try allocator.alloc(i24, len);
64 errdefer allocator.free(casefold.stage3);
65 for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian);
66
67 casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
68 casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
69 len = try reader.readInt(u16, endian);
70 casefold.cwcf_exceptions = try allocator.alloc(u21, len);
71 errdefer allocator.free(casefold.cwcf_exceptions);
72 for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
73}
74
75pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void {
76 allocator.free(fdata.stage1);
77 allocator.free(fdata.stage2);
78 allocator.free(fdata.stage3);
79 allocator.free(fdata.cwcf_exceptions);
80 if (fdata.owns_normalize) fdata.normalize.deinit(allocator);
81}
82
83/// Returns the case fold for `cp`.
84pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 {
85 if (cp >= fdata.cutoff) return &.{};
86
87 const stage1_val = fdata.stage1[cp >> 8];
88 if (stage1_val == 0) return &.{};
89
90 const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
91 const stage3_index = fdata.stage2[stage2_index];
10 92
11const Self = @This(); 93 if (stage3_index & 0x80 != 0) {
94 const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3;
95 const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0);
96 for (mapping, 0..) |c, i| buf[i] = @intCast(c);
97
98 return buf[0..mapping.len];
99 }
100
101 const offset = fdata.stage3[stage3_index];
102 if (offset == 0) return &.{};
103
104 buf[0] = @intCast(@as(i32, cp) + offset);
105
106 return buf[0..1];
107}
12 108
13/// Produces the case folded code points for `cps`. Caller must free returned 109/// Produces the case folded code points for `cps`. Caller must free returned
14/// slice with `allocator`. 110/// slice with `allocator`.
15pub fn caseFold( 111pub fn caseFoldAlloc(
16 self: Self, 112 casefold: *const CaseFolding,
17 allocator: mem.Allocator, 113 allocator: Allocator,
18 cps: []const u21, 114 cps: []const u21,
19) ![]const u21 { 115) Allocator.Error![]const u21 {
20 var cfcps = std.ArrayList(u21).init(allocator); 116 var cfcps = std.ArrayList(u21).init(allocator);
21 defer cfcps.deinit(); 117 defer cfcps.deinit();
22 var buf: [3]u21 = undefined; 118 var buf: [3]u21 = undefined;
23 119
24 for (cps) |cp| { 120 for (cps) |cp| {
25 const cf = self.fold_data.caseFold(cp, &buf); 121 const cf = casefold.caseFold(cp, &buf);
26 122
27 if (cf.len == 0) { 123 if (cf.len == 0) {
28 try cfcps.append(cp); 124 try cfcps.append(cp);
@@ -34,59 +130,71 @@ pub fn caseFold(
34 return try cfcps.toOwnedSlice(); 130 return try cfcps.toOwnedSlice();
35} 131}
36 132
37fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { 133/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
134pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool {
135 var buf: [3]u21 = undefined;
136 const has_mapping = casefold.caseFold(cp, &buf).len != 0;
137 return has_mapping and !casefold.isCwcfException(cp);
138}
139
140pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool {
38 return for (cps) |cp| { 141 return for (cps) |cp| {
39 if (self.fold_data.changesWhenCaseFolded(cp)) break true; 142 if (casefold.cpChangesWhenCaseFolded(cp)) break true;
40 } else false; 143 } else false;
41} 144}
42 145
146fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool {
147 return cp >= casefold.cwcf_exceptions_min and
148 cp <= casefold.cwcf_exceptions_max and
149 std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null;
150}
151
43/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most 152/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
44/// comprehensive comparison possible, but slower than `canonCaselessMatch`. 153/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
45pub fn compatCaselessMatch( 154pub fn compatCaselessMatch(
46 self: Self, 155 casefold: *const CaseFolding,
47 allocator: mem.Allocator, 156 allocator: Allocator,
48 normalizer: *const Normalize,
49 a: []const u8, 157 a: []const u8,
50 b: []const u8, 158 b: []const u8,
51) !bool { 159) Allocator.Error!bool {
52 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 160 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
53 161
54 // Process a 162 // Process a
55 const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); 163 const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd);
56 defer allocator.free(nfd_a); 164 defer allocator.free(nfd_a);
57 165
58 var need_free_cf_nfd_a = false; 166 var need_free_cf_nfd_a = false;
59 var cf_nfd_a: []const u21 = nfd_a; 167 var cf_nfd_a: []const u21 = nfd_a;
60 if (self.changesWhenCaseFolded(nfd_a)) { 168 if (casefold.changesWhenCaseFolded(nfd_a)) {
61 cf_nfd_a = try self.caseFold(allocator, nfd_a); 169 cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a);
62 need_free_cf_nfd_a = true; 170 need_free_cf_nfd_a = true;
63 } 171 }
64 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); 172 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
65 173
66 const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a); 174 const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a);
67 defer allocator.free(nfkd_cf_nfd_a); 175 defer allocator.free(nfkd_cf_nfd_a);
68 const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); 176 const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
69 defer allocator.free(cf_nfkd_cf_nfd_a); 177 defer allocator.free(cf_nfkd_cf_nfd_a);
70 const nfkd_cf_nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); 178 const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
71 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); 179 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
72 180
73 // Process b 181 // Process b
74 const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); 182 const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd);
75 defer allocator.free(nfd_b); 183 defer allocator.free(nfd_b);
76 184
77 var need_free_cf_nfd_b = false; 185 var need_free_cf_nfd_b = false;
78 var cf_nfd_b: []const u21 = nfd_b; 186 var cf_nfd_b: []const u21 = nfd_b;
79 if (self.changesWhenCaseFolded(nfd_b)) { 187 if (casefold.changesWhenCaseFolded(nfd_b)) {
80 cf_nfd_b = try self.caseFold(allocator, nfd_b); 188 cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b);
81 need_free_cf_nfd_b = true; 189 need_free_cf_nfd_b = true;
82 } 190 }
83 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); 191 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
84 192
85 const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b); 193 const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b);
86 defer allocator.free(nfkd_cf_nfd_b); 194 defer allocator.free(nfkd_cf_nfd_b);
87 const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); 195 const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
88 defer allocator.free(cf_nfkd_cf_nfd_b); 196 defer allocator.free(cf_nfkd_cf_nfd_b);
89 const nfkd_cf_nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); 197 const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
90 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); 198 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
91 199
92 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); 200 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
@@ -95,42 +203,37 @@ pub fn compatCaselessMatch(
95test "compatCaselessMatch" { 203test "compatCaselessMatch" {
96 const allocator = testing.allocator; 204 const allocator = testing.allocator;
97 205
98 const n = try Normalize.init(allocator); 206 const caser = try CaseFolding.init(allocator);
99 defer n.deinit(allocator); 207 defer caser.deinit(allocator);
100 208
101 const fold_data = try FoldData.init(allocator); 209 try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
102 defer fold_data.deinit(allocator);
103 const caser = Self{ .fold_data = &fold_data };
104
105 try testing.expect(try caser.compatCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));
106 210
107 const a = "Héllo World! \u{3d3}"; 211 const a = "Héllo World! \u{3d3}";
108 const b = "He\u{301}llo World! \u{3a5}\u{301}"; 212 const b = "He\u{301}llo World! \u{3a5}\u{301}";
109 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, b)); 213 try testing.expect(try caser.compatCaselessMatch(allocator, a, b));
110 214
111 const c = "He\u{301}llo World! \u{3d2}\u{301}"; 215 const c = "He\u{301}llo World! \u{3d2}\u{301}";
112 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); 216 try testing.expect(try caser.compatCaselessMatch(allocator, a, c));
113} 217}
114 218
115/// Performs canonical caseless string matching by decomposing to NFD. This is 219/// Performs canonical caseless string matching by decomposing to NFD. This is
116/// faster than `compatCaselessMatch`, but less comprehensive. 220/// faster than `compatCaselessMatch`, but less comprehensive.
117pub fn canonCaselessMatch( 221pub fn canonCaselessMatch(
118 self: Self, 222 casefold: *const CaseFolding,
119 allocator: mem.Allocator, 223 allocator: Allocator,
120 normalizer: *const Normalize,
121 a: []const u8, 224 a: []const u8,
122 b: []const u8, 225 b: []const u8,
123) !bool { 226) Allocator.Error!bool {
124 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 227 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
125 228
126 // Process a 229 // Process a
127 const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); 230 const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd);
128 defer allocator.free(nfd_a); 231 defer allocator.free(nfd_a);
129 232
130 var need_free_cf_nfd_a = false; 233 var need_free_cf_nfd_a = false;
131 var cf_nfd_a: []const u21 = nfd_a; 234 var cf_nfd_a: []const u21 = nfd_a;
132 if (self.changesWhenCaseFolded(nfd_a)) { 235 if (casefold.changesWhenCaseFolded(nfd_a)) {
133 cf_nfd_a = try self.caseFold(allocator, nfd_a); 236 cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a);
134 need_free_cf_nfd_a = true; 237 need_free_cf_nfd_a = true;
135 } 238 }
136 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); 239 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
@@ -138,19 +241,19 @@ pub fn canonCaselessMatch(
138 var need_free_nfd_cf_nfd_a = false; 241 var need_free_nfd_cf_nfd_a = false;
139 var nfd_cf_nfd_a = cf_nfd_a; 242 var nfd_cf_nfd_a = cf_nfd_a;
140 if (!need_free_cf_nfd_a) { 243 if (!need_free_cf_nfd_a) {
141 nfd_cf_nfd_a = try normalizer.nfdCodePoints(allocator, cf_nfd_a); 244 nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a);
142 need_free_nfd_cf_nfd_a = true; 245 need_free_nfd_cf_nfd_a = true;
143 } 246 }
144 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); 247 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
145 248
146 // Process b 249 // Process b
147 const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); 250 const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd);
148 defer allocator.free(nfd_b); 251 defer allocator.free(nfd_b);
149 252
150 var need_free_cf_nfd_b = false; 253 var need_free_cf_nfd_b = false;
151 var cf_nfd_b: []const u21 = nfd_b; 254 var cf_nfd_b: []const u21 = nfd_b;
152 if (self.changesWhenCaseFolded(nfd_b)) { 255 if (casefold.changesWhenCaseFolded(nfd_b)) {
153 cf_nfd_b = try self.caseFold(allocator, nfd_b); 256 cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b);
154 need_free_cf_nfd_b = true; 257 need_free_cf_nfd_b = true;
155 } 258 }
156 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); 259 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
@@ -158,7 +261,7 @@ pub fn canonCaselessMatch(
158 var need_free_nfd_cf_nfd_b = false; 261 var need_free_nfd_cf_nfd_b = false;
159 var nfd_cf_nfd_b = cf_nfd_b; 262 var nfd_cf_nfd_b = cf_nfd_b;
160 if (!need_free_cf_nfd_b) { 263 if (!need_free_cf_nfd_b) {
161 nfd_cf_nfd_b = try normalizer.nfdCodePoints(allocator, cf_nfd_b); 264 nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b);
162 need_free_nfd_cf_nfd_b = true; 265 need_free_nfd_cf_nfd_b = true;
163 } 266 }
164 defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); 267 defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);
@@ -169,19 +272,50 @@ pub fn canonCaselessMatch(
169test "canonCaselessMatch" { 272test "canonCaselessMatch" {
170 const allocator = testing.allocator; 273 const allocator = testing.allocator;
171 274
172 const n = try Normalize.init(allocator); 275 const caser = try CaseFolding.init(allocator);
173 defer n.deinit(allocator); 276 defer caser.deinit(allocator);
174
175 const fold_data = try FoldData.init(allocator);
176 defer fold_data.deinit(allocator);
177 const caser = Self{ .fold_data = &fold_data };
178 277
179 try testing.expect(try caser.canonCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!")); 278 try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
180 279
181 const a = "Héllo World! \u{3d3}"; 280 const a = "Héllo World! \u{3d3}";
182 const b = "He\u{301}llo World! \u{3a5}\u{301}"; 281 const b = "He\u{301}llo World! \u{3a5}\u{301}";
183 try testing.expect(!try caser.canonCaselessMatch(allocator, &n, a, b)); 282 try testing.expect(!try caser.canonCaselessMatch(allocator, a, b));
184 283
185 const c = "He\u{301}llo World! \u{3d2}\u{301}"; 284 const c = "He\u{301}llo World! \u{3d2}\u{301}";
186 try testing.expect(try caser.canonCaselessMatch(allocator, &n, a, c)); 285 try testing.expect(try caser.canonCaselessMatch(allocator, a, c));
187} 286}
287
288fn testAllocations(allocator: Allocator) !void {
289 // With normalize provided
290 {
291 const normalize = try Normalize.init(allocator);
292 defer normalize.deinit(allocator);
293 const caser1 = try CaseFolding.initWithNormalize(allocator, normalize);
294 defer caser1.deinit(allocator);
295 }
296 // With normalize owned
297 {
298 const caser2 = try CaseFolding.init(allocator);
299 defer caser2.deinit(allocator);
300 }
301}
302
303// test "Allocation Failures" {
304// if (true) return error.SkipZigTest; // XXX: remove
305// try testing.checkAllAllocationFailures(
306// testing.allocator,
307// testAllocations,
308// .{},
309// );
310// }
311
312const std = @import("std");
313const builtin = @import("builtin");
314const mem = std.mem;
315const testing = std.testing;
316const Allocator = mem.Allocator;
317
318const ascii = @import("ascii");
319const Normalize = @import("Normalize");
320
321const compress = std.compress;
diff --git a/src/FoldData.zig b/src/FoldData.zig
deleted file mode 100644
index b7fdceb..0000000
--- a/src/FoldData.zig
+++ /dev/null
@@ -1,99 +0,0 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
6cutoff: u21 = undefined,
7cwcf_exceptions_min: u21 = undefined,
8cwcf_exceptions_max: u21 = undefined,
9cwcf_exceptions: []u21 = undefined,
10multiple_start: u21 = undefined,
11stage1: []u8 = undefined,
12stage2: []u8 = undefined,
13stage3: []i24 = undefined,
14
15const FoldData = @This();
16
17pub fn init(allocator: mem.Allocator) !FoldData {
18 const decompressor = compress.flate.inflate.decompressor;
19 const in_bytes = @embedFile("fold");
20 var in_fbs = std.io.fixedBufferStream(in_bytes);
21 var in_decomp = decompressor(.raw, in_fbs.reader());
22 var reader = in_decomp.reader();
23
24 const endian = builtin.cpu.arch.endian();
25
26 var fdata = FoldData{};
27 fdata.cutoff = @intCast(try reader.readInt(u24, endian));
28 fdata.multiple_start = @intCast(try reader.readInt(u24, endian));
29
30 var len = try reader.readInt(u16, endian);
31 fdata.stage1 = try allocator.alloc(u8, len);
32 errdefer allocator.free(fdata.stage1);
33 for (0..len) |i| fdata.stage1[i] = try reader.readInt(u8, endian);
34
35 len = try reader.readInt(u16, endian);
36 fdata.stage2 = try allocator.alloc(u8, len);
37 errdefer allocator.free(fdata.stage2);
38 for (0..len) |i| fdata.stage2[i] = try reader.readInt(u8, endian);
39
40 len = try reader.readInt(u16, endian);
41 fdata.stage3 = try allocator.alloc(i24, len);
42 errdefer allocator.free(fdata.stage3);
43 for (0..len) |i| fdata.stage3[i] = try reader.readInt(i24, endian);
44
45 fdata.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
46 fdata.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
47 len = try reader.readInt(u16, endian);
48 fdata.cwcf_exceptions = try allocator.alloc(u21, len);
49 errdefer allocator.free(fdata.cwcf_exceptions);
50 for (0..len) |i| fdata.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
51
52 return fdata;
53}
54
55pub fn deinit(fdata: *const FoldData, allocator: mem.Allocator) void {
56 allocator.free(fdata.stage1);
57 allocator.free(fdata.stage2);
58 allocator.free(fdata.stage3);
59 allocator.free(fdata.cwcf_exceptions);
60}
61
62/// Returns the case fold for `cp`.
63pub fn caseFold(fdata: *const FoldData, cp: u21, buf: []u21) []const u21 {
64 if (cp >= fdata.cutoff) return &.{};
65
66 const stage1_val = fdata.stage1[cp >> 8];
67 if (stage1_val == 0) return &.{};
68
69 const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
70 const stage3_index = fdata.stage2[stage2_index];
71
72 if (stage3_index & 0x80 != 0) {
73 const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3;
74 const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0);
75 for (mapping, 0..) |c, i| buf[i] = @intCast(c);
76
77 return buf[0..mapping.len];
78 }
79
80 const offset = fdata.stage3[stage3_index];
81 if (offset == 0) return &.{};
82
83 buf[0] = @intCast(@as(i32, cp) + offset);
84
85 return buf[0..1];
86}
87
88/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
89pub fn changesWhenCaseFolded(fdata: *const FoldData, cp: u21) bool {
90 var buf: [3]u21 = undefined;
91 const has_mapping = fdata.caseFold(cp, &buf).len != 0;
92 return has_mapping and !fdata.isCwcfException(cp);
93}
94
95fn isCwcfException(fdata: *const FoldData, cp: u21) bool {
96 return cp >= fdata.cwcf_exceptions_min and
97 cp <= fdata.cwcf_exceptions_max and
98 std.mem.indexOfScalar(u21, fdata.cwcf_exceptions, cp) != null;
99}
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 4f014cf..d8c867d 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -632,6 +632,21 @@ test "isLatin1Only" {
632 try testing.expect(!isLatin1Only(not_latin1_only)); 632 try testing.expect(!isLatin1Only(not_latin1_only));
633} 633}
634 634
635// NOTE: These tests take way waaaaay too long to run, because
636// the amount of allocations in a couple of the inflators is
637// completely excessive and is also costing memory for metadata.
638// I'm leaving this here for when I fix that.
639//
640// fn testAllocations(allocator: Allocator) !void {
641// const norm = try Normalize.init(allocator);
642// norm.deinit(allocator);
643// }
644//
645// test "allocation failures" {
646// if (true) return error.SkipZigTest;
647// try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
648// }
649
635const std = @import("std"); 650const std = @import("std");
636const debug = std.debug; 651const debug = std.debug;
637const assert = debug.assert; 652const assert = debug.assert;
@@ -649,6 +664,5 @@ const CodePointIterator = @import("code_point").Iterator;
649const CanonData = @import("CanonData"); 664const CanonData = @import("CanonData");
650const CccData = @import("CombiningData"); 665const CccData = @import("CombiningData");
651const CompatData = @import("CompatData"); 666const CompatData = @import("CompatData");
652const FoldData = @import("FoldData");
653const HangulData = @import("HangulData"); 667const HangulData = @import("HangulData");
654const NormPropsData = @import("NormPropsData"); 668const NormPropsData = @import("NormPropsData");