author     Sam Atman  2026-02-04 18:01:36 -0500
committer  Sam Atman  2026-02-04 18:01:36 -0500
commit     ba5d9081b479e95ffa7f3baf751beedd370cec14 (patch)
tree       c12041d8aab9f9ff68b25a2e2c9042073c3d5f61 /src
parent     Convert Words module to no-allocation (diff)
Normalization and case folding
Both of which deserve some further attention.
Diffstat (limited to 'src')
-rw-r--r--  src/CaseFolding.zig    | 258
-rw-r--r--  src/CombiningData.zig  |  49
-rw-r--r--  src/CompatData.zig     |  64
-rw-r--r--  src/HangulData.zig     |  42
-rw-r--r--  src/NormPropsData.zig  |  46
-rw-r--r--  src/Normalize.zig      | 119
-rw-r--r--  src/comptime_map.zig   | 176
7 files changed, 371 insertions(+), 383 deletions(-)
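
Taken as a whole, the change below is uniform across the seven files: the Unicode tables are no longer parsed out of @embedFile blobs into allocator-owned slices at init time, but imported as comptime data modules, so most lookup entry points become namespace-level functions with no per-instance state. A sketch of what that looks like from the caller's side; the module import names are assumptions based on how this repository wires named modules, not something the diff itself shows.

const std = @import("std");
const CaseFolding = @import("CaseFolding"); // assumed module names
const Normalize = @import("Normalize");

pub fn main() !void {
    const allocator = std.heap.page_allocator;

    // Case folding no longer needs an allocated CaseFolding value.
    var buf: [3]u21 = undefined;
    const folded = CaseFolding.caseFold('Ω', &buf);
    std.debug.print("fold -> {u}\n", .{folded[0]});

    // Normalization still allocates its canonical data, and the caseless
    // matchers now take it as an explicit parameter instead of owning it.
    var normalize = try Normalize.init(allocator);
    defer normalize.deinit(allocator);
    const same = try CaseFolding.canonCaselessMatch(allocator, normalize, "Héllo", "He\u{301}llo");
    std.debug.print("canonCaselessMatch -> {}\n", .{same});
}
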
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index df86b92..88f047c 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -1,113 +1,53 @@
1cutoff: u21 = undefined,
2cwcf_exceptions_min: u21 = undefined,
3cwcf_exceptions_max: u21 = undefined,
4cwcf_exceptions: []u21 = undefined,
5multiple_start: u21 = undefined,
6stage1: []u8 = undefined,
7stage2: []u8 = undefined,
8stage3: []i24 = undefined,
9normalize: Normalize,
10owns_normalize: bool,
11
12const CaseFolding = @This(); 1const CaseFolding = @This();
13 2
14pub fn init(allocator: Allocator) Allocator.Error!CaseFolding { 3const Data = struct {
15 var case_fold: CaseFolding = undefined; 4 cutoff: u21 = undefined,
16 try case_fold.setup(allocator); 5 cwcf_exceptions_min: u21 = undefined,
17 return case_fold; 6 cwcf_exceptions_max: u21 = undefined,
18} 7 cwcf_exceptions: []const u21 = undefined,
19 8 multiple_start: u21 = undefined,
20pub fn initWithNormalize(allocator: Allocator, norm: Normalize) Allocator.Error!CaseFolding { 9 stage1: []const u8 = undefined,
21 var casefold: CaseFolding = undefined; 10 stage2: []const u8 = undefined,
22 try casefold.setupWithNormalize(allocator, norm); 11 stage3: []const i24 = undefined,
23 return casefold; 12};
24} 13
25 14const casefold = casefold: {
26pub fn setup(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void { 15 const data = @import("fold");
27 try casefold.setupImpl(allocator); 16 break :casefold Data{
28 // Handle normalize memory separately during setup: 17 .cutoff = data.cutoff,
29 casefold.owns_normalize = false; 18 .multiple_start = data.multiple_start,
30 errdefer casefold.deinit(allocator); 19 .stage1 = &data.stage1,
31 try casefold.normalize.setup(allocator); 20 .stage2 = &data.stage2,
32 casefold.owns_normalize = true; 21 .stage3 = &data.stage3,
33} 22 .cwcf_exceptions_min = data.cwcf_exceptions_min,
34 23 .cwcf_exceptions_max = data.cwcf_exceptions_max,
35pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { 24 .cwcf_exceptions = &data.cwcf_exceptions,
36 try casefold.setupImpl(allocator);
37 casefold.normalize = norm;
38 casefold.owns_normalize = false;
39}
40
41fn setupImpl(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void {
42 casefold.setupImplInner(allocator) catch |err| {
43 switch (err) {
44 error.OutOfMemory => |e| return e,
45 else => unreachable,
46 }
47 }; 25 };
48} 26};
49
50inline fn setupImplInner(casefold: *CaseFolding, allocator: Allocator) !void {
51 const in_bytes = @embedFile("fold");
52 var in_fbs = std.io.fixedBufferStream(in_bytes);
53 var reader = in_fbs.reader();
54
55 const endian = builtin.cpu.arch.endian();
56
57 casefold.cutoff = @intCast(try reader.readInt(u24, endian));
58 casefold.multiple_start = @intCast(try reader.readInt(u24, endian));
59
60 var len = try reader.readInt(u16, endian);
61 casefold.stage1 = try allocator.alloc(u8, len);
62 errdefer allocator.free(casefold.stage1);
63 for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian);
64
65 len = try reader.readInt(u16, endian);
66 casefold.stage2 = try allocator.alloc(u8, len);
67 errdefer allocator.free(casefold.stage2);
68 for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian);
69
70 len = try reader.readInt(u16, endian);
71 casefold.stage3 = try allocator.alloc(i24, len);
72 errdefer allocator.free(casefold.stage3);
73 for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian);
74
75 casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
76 casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
77 len = try reader.readInt(u16, endian);
78 casefold.cwcf_exceptions = try allocator.alloc(u21, len);
79 errdefer allocator.free(casefold.cwcf_exceptions);
80 for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
81}
82
83pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void {
84 allocator.free(fdata.stage1);
85 allocator.free(fdata.stage2);
86 allocator.free(fdata.stage3);
87 allocator.free(fdata.cwcf_exceptions);
88 if (fdata.owns_normalize) fdata.normalize.deinit(allocator);
89}
90 27
91/// Returns the case fold for `cp`. 28/// Returns the case fold for `cp`.
92pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { 29pub fn caseFold(cp: u21, buf: []u21) []const u21 {
93 if (cp >= fdata.cutoff) return &.{}; 30 // Unmatched code points fold to themselves, so we default to this.
31 buf[0] = cp;
94 32
95 const stage1_val = fdata.stage1[cp >> 8]; 33 if (cp >= casefold.cutoff) return buf[0..1];
96 if (stage1_val == 0) return &.{}; 34
35 const stage1_val = casefold.stage1[cp >> 8];
36 if (stage1_val == 0) return buf[0..1];
97 37
98 const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); 38 const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
99 const stage3_index = fdata.stage2[stage2_index]; 39 const stage3_index = casefold.stage2[stage2_index];
100 40
101 if (stage3_index & 0x80 != 0) { 41 if (stage3_index & 0x80 != 0) {
102 const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; 42 const real_index = @as(usize, casefold.multiple_start) + (stage3_index ^ 0x80) * 3;
103 const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); 43 const mapping = mem.sliceTo(casefold.stage3[real_index..][0..3], 0);
104 for (mapping, 0..) |c, i| buf[i] = @intCast(c); 44 for (mapping, 0..) |c, i| buf[i] = @intCast(c);
105 45
106 return buf[0..mapping.len]; 46 return buf[0..mapping.len];
107 } 47 }
108 48
109 const offset = fdata.stage3[stage3_index]; 49 const offset = casefold.stage3[stage3_index];
110 if (offset == 0) return &.{}; 50 if (offset == 0) return buf[0..1];
111 51
112 buf[0] = @intCast(@as(i32, cp) + offset); 52 buf[0] = @intCast(@as(i32, cp) + offset);
113 53
@@ -117,7 +57,6 @@ pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 {
117/// Produces the case folded code points for `cps`. Caller must free returned 57/// Produces the case folded code points for `cps`. Caller must free returned
118/// slice with `allocator`. 58/// slice with `allocator`.
119pub fn caseFoldAlloc( 59pub fn caseFoldAlloc(
120 casefold: *const CaseFolding,
121 allocator: Allocator, 60 allocator: Allocator,
122 cps: []const u21, 61 cps: []const u21,
123) Allocator.Error![]const u21 { 62) Allocator.Error![]const u21 {
@@ -126,7 +65,7 @@ pub fn caseFoldAlloc(
126 var buf: [3]u21 = undefined; 65 var buf: [3]u21 = undefined;
127 66
128 for (cps) |cp| { 67 for (cps) |cp| {
129 const cf = casefold.caseFold(cp, &buf); 68 const cf = CaseFolding.caseFold(cp, &buf);
130 69
131 if (cf.len == 0) { 70 if (cf.len == 0) {
132 try cfcps.append(cp); 71 try cfcps.append(cp);
@@ -139,19 +78,19 @@ pub fn caseFoldAlloc(
139} 78}
140 79
141/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). 80/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
142pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { 81pub fn cpChangesWhenCaseFolded(cp: u21) bool {
143 var buf: [3]u21 = undefined; 82 var buf: [3]u21 = undefined;
144 const has_mapping = casefold.caseFold(cp, &buf).len != 0; 83 const has_mapping = CaseFolding.caseFold(cp, &buf).len != 0;
145 return has_mapping and !casefold.isCwcfException(cp); 84 return has_mapping and !CaseFolding.isCwcfException(cp);
146} 85}
147 86
148pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { 87pub fn changesWhenCaseFolded(cps: []const u21) bool {
149 return for (cps) |cp| { 88 return for (cps) |cp| {
150 if (casefold.cpChangesWhenCaseFolded(cp)) break true; 89 if (CaseFolding.cpChangesWhenCaseFolded(cp)) break true;
151 } else false; 90 } else false;
152} 91}
153 92
154fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { 93fn isCwcfException(cp: u21) bool {
155 return cp >= casefold.cwcf_exceptions_min and 94 return cp >= casefold.cwcf_exceptions_min and
156 cp <= casefold.cwcf_exceptions_max and 95 cp <= casefold.cwcf_exceptions_max and
157 std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; 96 std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null;
@@ -160,88 +99,114 @@ fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool {
160/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most 99/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
161/// comprehensive comparison possible, but slower than `canonCaselessMatch`. 100/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
162pub fn compatCaselessMatch( 101pub fn compatCaselessMatch(
163 casefold: *const CaseFolding,
164 allocator: Allocator, 102 allocator: Allocator,
103 normalize: Normalize,
165 a: []const u8, 104 a: []const u8,
166 b: []const u8, 105 b: []const u8,
167) Allocator.Error!bool { 106) Allocator.Error!bool {
168 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 107 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
169 108
170 // Process a 109 // Process a
171 const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); 110 const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
172 defer allocator.free(nfd_a); 111 defer allocator.free(nfd_a);
173 112
174 var need_free_cf_nfd_a = false; 113 var need_free_cf_nfd_a = false;
175 var cf_nfd_a: []const u21 = nfd_a; 114 var cf_nfd_a: []const u21 = nfd_a;
176 if (casefold.changesWhenCaseFolded(nfd_a)) { 115 if (CaseFolding.changesWhenCaseFolded(nfd_a)) {
177 cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); 116 cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a);
178 need_free_cf_nfd_a = true; 117 need_free_cf_nfd_a = true;
179 } 118 }
180 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); 119 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
181 120
182 const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); 121 const nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfd_a);
183 defer allocator.free(nfkd_cf_nfd_a); 122 defer allocator.free(nfkd_cf_nfd_a);
184 const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); 123 const cf_nfkd_cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
185 defer allocator.free(cf_nfkd_cf_nfd_a); 124 defer allocator.free(cf_nfkd_cf_nfd_a);
186 const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); 125 const nfkd_cf_nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
187 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); 126 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
188 127
189 // Process b 128 // Process b
190 const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); 129 const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
191 defer allocator.free(nfd_b); 130 defer allocator.free(nfd_b);
192 131
193 var need_free_cf_nfd_b = false; 132 var need_free_cf_nfd_b = false;
194 var cf_nfd_b: []const u21 = nfd_b; 133 var cf_nfd_b: []const u21 = nfd_b;
195 if (casefold.changesWhenCaseFolded(nfd_b)) { 134 if (CaseFolding.changesWhenCaseFolded(nfd_b)) {
196 cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); 135 cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b);
197 need_free_cf_nfd_b = true; 136 need_free_cf_nfd_b = true;
198 } 137 }
199 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); 138 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
200 139
201 const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); 140 const nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfd_b);
202 defer allocator.free(nfkd_cf_nfd_b); 141 defer allocator.free(nfkd_cf_nfd_b);
203 const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); 142 const cf_nfkd_cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
204 defer allocator.free(cf_nfkd_cf_nfd_b); 143 defer allocator.free(cf_nfkd_cf_nfd_b);
205 const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); 144 const nfkd_cf_nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
206 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); 145 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
207 146
208 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); 147 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
209} 148}
210 149
150test "caseFold" {
151 var buf: [3]u21 = undefined;
152
153 // Folds '1' to '1'
154 try testing.expectEqual(1, caseFold('1', &buf).len);
155 try testing.expectEqual('1', caseFold('1', &buf)[0]);
156
157 // Folds '2' to '2'
158 try testing.expectEqual(1, caseFold('2', &buf).len);
159 try testing.expectEqual('2', caseFold('2', &buf)[0]);
160
161 // Folds Armenian capital letter 'Zhe' (U+053A)
162 try testing.expectEqual(1, caseFold('Ժ', &buf).len);
163 // Armenian small letter 'Zhe' (U+056A)
164 try testing.expectEqual('ժ', caseFold('Ժ', &buf)[0]);
165
166 // Folds Greek small letter Upsilon with Dialytika and Perispomeni (U+1FE7)
167 try testing.expectEqual(3, caseFold('ῧ', &buf).len);
168 // Greek small letter Upsilon (U+03C5)
169 try testing.expectEqual('υ', caseFold('ῧ', &buf)[0]);
170 // Combining Diaeresis
171 try testing.expectEqual('\u{0308}', caseFold('ῧ', &buf)[1]);
172 // Combining Greek Perispomeni
173 try testing.expectEqual('\u{0342}', caseFold('ῧ', &buf)[2]);
174}
175
211test "compatCaselessMatch" { 176test "compatCaselessMatch" {
212 const allocator = testing.allocator; 177 const allocator = testing.allocator;
213 178
214 const caser = try CaseFolding.init(allocator); 179 var normalize = try Normalize.init(allocator);
215 defer caser.deinit(allocator); 180 defer normalize.deinit(allocator);
216 181
217 try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); 182 try testing.expect(try compatCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
218 183
219 const a = "Héllo World! \u{3d3}"; 184 const a = "Héllo World! \u{3d3}";
220 const b = "He\u{301}llo World! \u{3a5}\u{301}"; 185 const b = "He\u{301}llo World! \u{3a5}\u{301}";
221 try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); 186 try testing.expect(try compatCaselessMatch(allocator, normalize, a, b));
222 187
223 const c = "He\u{301}llo World! \u{3d2}\u{301}"; 188 const c = "He\u{301}llo World! \u{3d2}\u{301}";
224 try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); 189 try testing.expect(try compatCaselessMatch(allocator, normalize, a, c));
225} 190}
226 191
227/// Performs canonical caseless string matching by decomposing to NFD. This is 192/// Performs canonical caseless string matching by decomposing to NFD. This is
228/// faster than `compatCaselessMatch`, but less comprehensive. 193/// faster than `compatCaselessMatch`, but less comprehensive.
229pub fn canonCaselessMatch( 194pub fn canonCaselessMatch(
230 casefold: *const CaseFolding,
231 allocator: Allocator, 195 allocator: Allocator,
196 normalize: Normalize,
232 a: []const u8, 197 a: []const u8,
233 b: []const u8, 198 b: []const u8,
234) Allocator.Error!bool { 199) Allocator.Error!bool {
235 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 200 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
236 201
237 // Process a 202 // Process a
238 const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); 203 const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
239 defer allocator.free(nfd_a); 204 defer allocator.free(nfd_a);
240 205
241 var need_free_cf_nfd_a = false; 206 var need_free_cf_nfd_a = false;
242 var cf_nfd_a: []const u21 = nfd_a; 207 var cf_nfd_a: []const u21 = nfd_a;
243 if (casefold.changesWhenCaseFolded(nfd_a)) { 208 if (CaseFolding.changesWhenCaseFolded(nfd_a)) {
244 cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); 209 cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a);
245 need_free_cf_nfd_a = true; 210 need_free_cf_nfd_a = true;
246 } 211 }
247 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); 212 defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
@@ -249,19 +214,19 @@ pub fn canonCaselessMatch(
249 var need_free_nfd_cf_nfd_a = false; 214 var need_free_nfd_cf_nfd_a = false;
250 var nfd_cf_nfd_a = cf_nfd_a; 215 var nfd_cf_nfd_a = cf_nfd_a;
251 if (!need_free_cf_nfd_a) { 216 if (!need_free_cf_nfd_a) {
252 nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); 217 nfd_cf_nfd_a = try normalize.nfdCodePoints(allocator, cf_nfd_a);
253 need_free_nfd_cf_nfd_a = true; 218 need_free_nfd_cf_nfd_a = true;
254 } 219 }
255 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); 220 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
256 221
257 // Process b 222 // Process b
258 const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); 223 const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
259 defer allocator.free(nfd_b); 224 defer allocator.free(nfd_b);
260 225
261 var need_free_cf_nfd_b = false; 226 var need_free_cf_nfd_b = false;
262 var cf_nfd_b: []const u21 = nfd_b; 227 var cf_nfd_b: []const u21 = nfd_b;
263 if (casefold.changesWhenCaseFolded(nfd_b)) { 228 if (CaseFolding.changesWhenCaseFolded(nfd_b)) {
264 cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); 229 cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b);
265 need_free_cf_nfd_b = true; 230 need_free_cf_nfd_b = true;
266 } 231 }
267 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); 232 defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
@@ -269,7 +234,7 @@ pub fn canonCaselessMatch(
269 var need_free_nfd_cf_nfd_b = false; 234 var need_free_nfd_cf_nfd_b = false;
270 var nfd_cf_nfd_b = cf_nfd_b; 235 var nfd_cf_nfd_b = cf_nfd_b;
271 if (!need_free_cf_nfd_b) { 236 if (!need_free_cf_nfd_b) {
272 nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); 237 nfd_cf_nfd_b = try normalize.nfdCodePoints(allocator, cf_nfd_b);
273 need_free_nfd_cf_nfd_b = true; 238 need_free_nfd_cf_nfd_b = true;
274 } 239 }
275 defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); 240 defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);
@@ -280,40 +245,17 @@ pub fn canonCaselessMatch(
280test "canonCaselessMatch" { 245test "canonCaselessMatch" {
281 const allocator = testing.allocator; 246 const allocator = testing.allocator;
282 247
283 const caser = try CaseFolding.init(allocator); 248 var normalize = try Normalize.init(allocator);
284 defer caser.deinit(allocator); 249 defer normalize.deinit(allocator);
285 250
286 try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); 251 try testing.expect(try canonCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
287 252
288 const a = "Héllo World! \u{3d3}"; 253 const a = "Héllo World! \u{3d3}";
289 const b = "He\u{301}llo World! \u{3a5}\u{301}"; 254 const b = "He\u{301}llo World! \u{3a5}\u{301}";
290 try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); 255 try testing.expect(!try canonCaselessMatch(allocator, normalize, a, b));
291 256
292 const c = "He\u{301}llo World! \u{3d2}\u{301}"; 257 const c = "He\u{301}llo World! \u{3d2}\u{301}";
293 try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); 258 try testing.expect(try canonCaselessMatch(allocator, normalize, a, c));
294}
295
296fn testAllocations(allocator: Allocator) !void {
297 // With normalize provided
298 {
299 const normalize = try Normalize.init(allocator);
300 defer normalize.deinit(allocator);
301 const caser = try CaseFolding.initWithNormalize(allocator, normalize);
302 defer caser.deinit(allocator);
303 }
304 // With normalize owned
305 {
306 const caser = try CaseFolding.init(allocator);
307 defer caser.deinit(allocator);
308 }
309}
310
311test "Allocation Failures" {
312 try testing.checkAllAllocationFailures(
313 testing.allocator,
314 testAllocations,
315 .{},
316 );
317} 259}
318 260
319const std = @import("std"); 261const std = @import("std");
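
A self-contained sketch of the three-stage lookup that the new caseFold performs, using toy tables in place of the generated "fold" module. The layout (256-entry blocks, high bit of the stage-2 byte selecting a multi-code-point mapping at multiple_start) is taken from the diff above; the two mappings used ('A' -> 'a', U+0130 -> "i" + U+0307) happen to be the real folds.

const std = @import("std");

test "three-stage case fold lookup (toy tables)" {
    const stage1 = [_]u8{ 1, 2 }; // cp >> 8 selects a 256-entry block
    var stage2 = [_]u8{0} ** (3 * 256); // block 0 is the all-zero block
    stage2[1 * 256 + 0x41] = 1; // 'A': stage3[1] holds a signed delta
    stage2[2 * 256 + 0x30] = 0x80; // U+0130: high bit = multi mapping #0
    const multiple_start: usize = 2;
    const stage3 = [_]i24{ 0, 32, 0x69, 0x307, 0 }; // deltas, then 0-terminated runs

    // Single code point result: add the delta.
    const cp: u21 = 'A';
    const s2 = stage2[@as(usize, stage1[cp >> 8]) * 256 + (cp & 0xFF)];
    try std.testing.expect(s2 & 0x80 == 0);
    try std.testing.expectEqual(@as(u21, 'a'), @as(u21, @intCast(@as(i32, cp) + stage3[s2])));

    // Multi code point result: read a 0-terminated run of up to three entries.
    const cp2: u21 = 0x130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
    const s2b = stage2[@as(usize, stage1[cp2 >> 8]) * 256 + (cp2 & 0xFF)];
    try std.testing.expect(s2b & 0x80 != 0);
    const run = std.mem.sliceTo(stage3[multiple_start + (s2b ^ 0x80) * 3 ..][0..3], 0);
    try std.testing.expectEqual(@as(usize, 2), run.len);
    try std.testing.expectEqual(@as(i24, 0x69), run[0]); // 'i'
}
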
diff --git a/src/CombiningData.zig b/src/CombiningData.zig
index f58e0de..660dc14 100644
--- a/src/CombiningData.zig
+++ b/src/CombiningData.zig
@@ -1,45 +1,28 @@
1//! Combining Class Data 1//! Combining Class Data
2 2
3s1: []u16 = undefined, 3const Data = struct {
4s2: []u8 = undefined, 4 s1: []const u16 = undefined,
5 s2: []const u8 = undefined,
6};
7
8const combining_data = combining_data: {
9 const data = @import("ccc");
10 break :combining_data Data{
11 .s1 = &data.s1,
12 .s2 = &data.s2,
13 };
14};
5 15
6const CombiningData = @This(); 16const CombiningData = @This();
7 17
8pub fn init(allocator: mem.Allocator) !CombiningData {
9 const in_bytes = @embedFile("ccc");
10 var in_fbs = std.io.fixedBufferStream(in_bytes);
11 var reader = in_fbs.reader();
12
13 const endian = builtin.cpu.arch.endian();
14
15 var cbdata = CombiningData{};
16
17 const stage_1_len: u16 = try reader.readInt(u16, endian);
18 cbdata.s1 = try allocator.alloc(u16, stage_1_len);
19 errdefer allocator.free(cbdata.s1);
20 for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian);
21
22 const stage_2_len: u16 = try reader.readInt(u16, endian);
23 cbdata.s2 = try allocator.alloc(u8, stage_2_len);
24 errdefer allocator.free(cbdata.s2);
25 _ = try reader.readAll(cbdata.s2);
26
27 return cbdata;
28}
29
30pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void {
31 allocator.free(cbdata.s1);
32 allocator.free(cbdata.s2);
33}
34
35/// Returns the canonical combining class for a code point. 18/// Returns the canonical combining class for a code point.
36pub fn ccc(cbdata: CombiningData, cp: u21) u8 { 19pub fn ccc(cp: u21) u8 {
37 return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)]; 20 return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)];
38} 21}
39 22
40/// True if `cp` is a starter code point, not a combining character. 23/// True if `cp` is a starter code point, not a combining character.
41pub fn isStarter(cbdata: CombiningData, cp: u21) bool { 24pub fn isStarter(cp: u21) bool {
42 return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0; 25 return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)] == 0;
43} 26}
44 27
45const std = @import("std"); 28const std = @import("std");
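
The generated "ccc" module imported above is assumed to expose comptime arrays named s1 and s2 (that much is implied by `.s1 = &data.s1`); this toy equivalent shows how the two-stage lookup in ccc/isStarter resolves a code point.

const std = @import("std");

const toy = struct {
    // s1 maps the high byte of a code point to a block offset in s2.
    const s1 = [_]u16{ 0, 0, 0, 256 }; // only block 3 (U+03xx) is populated
    const s2 = blk: {
        var table = [_]u8{0} ** 512;
        table[256 + 0x00] = 230; // U+0300 COMBINING GRAVE ACCENT, ccc = 230
        break :blk table;
    };
};

fn ccc(cp: u21) u8 {
    return toy.s2[toy.s1[cp >> 8] + (cp & 0xff)];
}

test "two-stage combining class lookup (toy tables)" {
    try std.testing.expectEqual(@as(u8, 0), ccc('A')); // starter
    try std.testing.expectEqual(@as(u8, 230), ccc(0x300)); // combining mark
}
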
diff --git a/src/CompatData.zig b/src/CompatData.zig
index 40ecd12..68d47f2 100644
--- a/src/CompatData.zig
+++ b/src/CompatData.zig
@@ -1,57 +1,19 @@
1//! Compatibility Data 1//! Compatibility Data
2 2
3nfkd: [][]u21 = undefined, 3const Data = struct {
4cps: []u21 = undefined, 4 s1: []const u16 = undefined,
5 5 s2: []const []const u21 = undefined,
6const CompatData = @This(); 6};
7 7
8pub fn init(allocator: mem.Allocator) !CompatData { 8const compat_data = compat_data: {
9 const in_bytes = @embedFile("compat"); 9 const data = @import("compat");
10 var in_fbs = std.io.fixedBufferStream(in_bytes); 10 break :compat_data Data{
11 var reader = in_fbs.reader(); 11 .s1 = &data.s1,
12 12 .s2 = &data.s2,
13 const endian = builtin.cpu.arch.endian();
14 var cpdata = CompatData{
15 .nfkd = try allocator.alloc([]u21, 0x110000),
16 }; 13 };
17 { 14};
18 errdefer allocator.free(cpdata.nfkd);
19 cpdata.cps = try allocator.alloc(u21, magic.compat_size);
20 }
21 errdefer cpdata.deinit(allocator);
22
23 @memset(cpdata.nfkd, &.{});
24
25 var total_len: usize = 0;
26
27 while (true) {
28 const len: u8 = try reader.readInt(u8, endian);
29 if (len == 0) break;
30 const cp = try reader.readInt(u24, endian);
31 const nk_s = cpdata.cps[total_len..][0 .. len - 1];
32 for (0..len - 1) |i| {
33 nk_s[i] = @intCast(try reader.readInt(u24, endian));
34 }
35 cpdata.nfkd[cp] = nk_s;
36 total_len += len - 1;
37 }
38
39 if (comptime magic.print) std.debug.print("CompatData magic number: {d}", .{total_len});
40
41 return cpdata;
42}
43
44pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void {
45 allocator.free(cpdata.cps);
46 allocator.free(cpdata.nfkd);
47}
48 15
49/// Returns compatibility decomposition for `cp`. 16/// Returns compatibility decomposition for `cp`.
50pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 { 17pub fn toNfkd(cp: u21) []const u21 {
51 return cpdata.nfkd[cp]; 18 return compat_data.s2[compat_data.s1[cp >> 8] + (cp & 0xff)];
52} 19}
53
54const std = @import("std");
55const builtin = @import("builtin");
56const mem = std.mem;
57const magic = @import("magic");
diff --git a/src/HangulData.zig b/src/HangulData.zig
index cae8b97..9c17421 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -9,39 +9,23 @@ pub const Syllable = enum {
9 T, 9 T,
10}; 10};
11 11
12s1: []u16 = undefined, 12const Data = struct {
13s2: []u3 = undefined, 13 s1: []const u16 = undefined,
14 14 s2: []const u3 = undefined,
15const Hangul = @This(); 15};
16
17pub fn init(allocator: mem.Allocator) !Hangul {
18 const in_bytes = @embedFile("hangul");
19 var in_fbs = std.io.fixedBufferStream(in_bytes);
20 var reader = in_fbs.reader();
21
22 const endian = builtin.cpu.arch.endian();
23 var hangul = Hangul{};
24
25 const stage_1_len: u16 = try reader.readInt(u16, endian);
26 hangul.s1 = try allocator.alloc(u16, stage_1_len);
27 errdefer allocator.free(hangul.s1);
28 for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian);
29
30 const stage_2_len: u16 = try reader.readInt(u16, endian);
31 hangul.s2 = try allocator.alloc(u3, stage_2_len);
32 errdefer allocator.free(hangul.s2);
33 for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian));
34 16
35 return hangul; 17const hangul = hangul_data: {
36} 18 const data = @import("hangul");
19 break :hangul_data Data{
20 .s1 = &data.s1,
21 .s2 = &data.s2,
22 };
23};
37 24
38pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void { 25const Hangul = @This();
39 allocator.free(hangul.s1);
40 allocator.free(hangul.s2);
41}
42 26
43/// Returns the Hangul syllable type for `cp`. 27/// Returns the Hangul syllable type for `cp`.
44pub fn syllable(hangul: *const Hangul, cp: u21) Syllable { 28pub fn syllable(cp: u21) Syllable {
45 return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]); 29 return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]);
46} 30}
47 31
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig
index 7b53542..cca3556 100644
--- a/src/NormPropsData.zig
+++ b/src/NormPropsData.zig
@@ -1,48 +1,32 @@
1//! Normalization Properties Data 1//! Normalization Properties Data
2 2
3s1: []u16 = undefined, 3const Data = struct {
4s2: []u4 = undefined, 4 s1: []const u16 = undefined,
5 s2: []const u3 = undefined,
6};
7
8const norms = norm_props_data: {
9 const data = @import("normp");
10 break :norm_props_data Data{
11 .s1 = &data.s1,
12 .s2 = &data.s2,
13 };
14};
5 15
6const NormProps = @This(); 16const NormProps = @This();
7 17
8pub fn init(allocator: mem.Allocator) !NormProps {
9 const in_bytes = @embedFile("normp");
10 var in_fbs = std.io.fixedBufferStream(in_bytes);
11 var reader = in_fbs.reader();
12
13 const endian = builtin.cpu.arch.endian();
14 var norms = NormProps{};
15
16 const stage_1_len: u16 = try reader.readInt(u16, endian);
17 norms.s1 = try allocator.alloc(u16, stage_1_len);
18 errdefer allocator.free(norms.s1);
19 for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian);
20
21 const stage_2_len: u16 = try reader.readInt(u16, endian);
22 norms.s2 = try allocator.alloc(u4, stage_2_len);
23 errdefer allocator.free(norms.s2);
24 for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian));
25
26 return norms;
27}
28
29pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void {
30 allocator.free(norms.s1);
31 allocator.free(norms.s2);
32}
33
34/// Returns true if `cp` is already in NFD form. 18/// Returns true if `cp` is already in NFD form.
35pub fn isNfd(norms: *const NormProps, cp: u21) bool { 19pub fn isNfd(cp: u21) bool {
36 return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; 20 return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0;
37} 21}
38 22
39/// Returns true if `cp` is already in NFKD form. 23/// Returns true if `cp` is already in NFKD form.
40pub fn isNfkd(norms: *const NormProps, cp: u21) bool { 24pub fn isNfkd(cp: u21) bool {
41 return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; 25 return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0;
42} 26}
43 27
44/// Returns true if `cp` is not allowed in any normalized form. 28/// Returns true if `cp` is not allowed in any normalized form.
45pub fn isFcx(norms: *const NormProps, cp: u21) bool { 29pub fn isFcx(cp: u21) bool {
46 return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 30 return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
47} 31}
48 32
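
The `& 1`, `& 2`, and `& 4` tests above read a small per-code-point bit set (note the stage-2 element type narrows from u4 to []const u3 in this commit). The bit meanings below are inferred from the three predicates, not stated anywhere in the diff.

const std = @import("std");

// Inferred layout of the NormPropsData stage-2 value:
//   bit 0 (1): set when the code point is NOT already NFD
//   bit 1 (2): set when the code point is NOT already NFKD
//   bit 2 (4): set when the code point is a full composition exclusion (FCX)
fn isNfd(flags: u3) bool { return flags & 1 == 0; }
fn isNfkd(flags: u3) bool { return flags & 2 == 0; }
fn isFcx(flags: u3) bool { return flags & 4 == 4; }

test "normalization property flag decoding" {
    const decomposes: u3 = 0b011; // e.g. a precomposed letter like 'é'
    try std.testing.expect(!isNfd(decomposes) and !isNfkd(decomposes));
    try std.testing.expect(!isFcx(decomposes));

    const inert: u3 = 0b000; // already NFD and NFKD, not excluded
    try std.testing.expect(isNfd(inert) and isNfkd(inert) and !isFcx(inert));
}
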
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 4a1bae8..3191a8c 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,64 +3,22 @@
3//! NFKC, NFD, and NFKD normalization forms. 3//! NFKC, NFD, and NFKD normalization forms.
4 4
5canon_data: CanonData = undefined, 5canon_data: CanonData = undefined,
6ccc_data: CccData = undefined,
7compat_data: CompatData = undefined,
8hangul_data: HangulData = undefined,
9normp_data: NormPropsData = undefined,
10 6
11const Normalize = @This(); 7const Normalize = @This();
12 8
13pub fn init(allocator: Allocator) Allocator.Error!Normalize { 9pub fn init(allocator: Allocator) !Normalize {
14 var norm: Normalize = undefined; 10 var norm: Normalize = undefined;
15 try norm.setup(allocator); 11 try norm.setup(allocator);
16 return norm; 12 return norm;
17} 13}
18 14
19pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void { 15pub fn setup(self: *Normalize, allocator: Allocator) !void {
20 self.canon_data = CanonData.init(allocator) catch |err| { 16 self.canon_data = try CanonData.init(allocator);
21 switch (err) {
22 error.OutOfMemory => |e| return e,
23 else => unreachable,
24 }
25 };
26 errdefer self.canon_data.deinit(allocator);
27 self.ccc_data = CccData.init(allocator) catch |err| {
28 switch (err) {
29 error.OutOfMemory => |e| return e,
30 else => unreachable,
31 }
32 };
33 errdefer self.ccc_data.deinit(allocator);
34 self.compat_data = CompatData.init(allocator) catch |err| {
35 switch (err) {
36 error.OutOfMemory => |e| return e,
37 else => unreachable,
38 }
39 };
40 errdefer self.compat_data.deinit(allocator);
41 self.hangul_data = HangulData.init(allocator) catch |err| {
42 switch (err) {
43 error.OutOfMemory => |e| return e,
44 else => unreachable,
45 }
46 };
47 errdefer self.hangul_data.deinit(allocator);
48 self.normp_data = NormPropsData.init(allocator) catch |err| {
49 switch (err) {
50 error.OutOfMemory => |e| return e,
51 else => unreachable,
52 }
53 };
54} 17}
55 18
56pub fn deinit(norm: *const Normalize, allocator: Allocator) void { 19pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
57 // Reasonably safe (?) 20 const mut_norm = @constCast(norm);
58 var mut_norm = @constCast(norm);
59 mut_norm.canon_data.deinit(allocator); 21 mut_norm.canon_data.deinit(allocator);
60 mut_norm.ccc_data.deinit(allocator);
61 mut_norm.compat_data.deinit(allocator);
62 mut_norm.hangul_data.deinit(allocator);
63 mut_norm.normp_data.deinit(allocator);
64} 22}
65 23
66const SBase: u21 = 0xAC00; 24const SBase: u21 = 0xAC00;
@@ -73,8 +31,8 @@ const TCount: u21 = 28;
73const NCount: u21 = 588; // VCount * TCount 31const NCount: u21 = 588; // VCount * TCount
74const SCount: u21 = 11172; // LCount * NCount 32const SCount: u21 = 11172; // LCount * NCount
75 33
76fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { 34fn decomposeHangul(cp: u21, buf: []u21) ?Decomp {
77 const kind = self.hangul_data.syllable(cp); 35 const kind = HangulData.syllable(cp);
78 if (kind != .LV and kind != .LVT) return null; 36 if (kind != .LV and kind != .LVT) return null;
79 37
80 const SIndex: u21 = cp - SBase; 38 const SIndex: u21 = cp - SBase;
@@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
143 }, 101 },
144 102
145 .nfkd => { 103 .nfkd => {
146 dc.cps = self.compat_data.toNfkd(cp); 104 dc.cps = CompatData.toNfkd(cp);
147 if (dc.cps.len != 0) { 105 if (dc.cps.len != 0) {
148 dc.form = .nfkd; 106 dc.form = .nfkd;
149 } else { 107 } else {
@@ -170,13 +128,13 @@ fn decompose(
170 128
171 // NFD / NFKD quick checks. 129 // NFD / NFKD quick checks.
172 switch (form) { 130 switch (form) {
173 .nfd => if (self.normp_data.isNfd(cp)) return .{}, 131 .nfd => if (NormPropsData.isNfd(cp)) return .{},
174 .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, 132 .nfkd => if (NormPropsData.isNfkd(cp)) return .{},
175 else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), 133 else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
176 } 134 }
177 135
178 // Hangul precomposed syllable full decomposition. 136 // Hangul precomposed syllable full decomposition.
179 if (self.decomposeHangul(cp, buf)) |dc| return dc; 137 if (decomposeHangul(cp, buf)) |dc| return dc;
180 138
181 // Full decomposition. 139 // Full decomposition.
182 var dc = Decomp{ .form = form }; 140 var dc = Decomp{ .form = form };
@@ -218,9 +176,8 @@ fn decompose(
218 176
219test "decompose" { 177test "decompose" {
220 const allocator = testing.allocator; 178 const allocator = testing.allocator;
221 const n = try Normalize.init(allocator); 179 var n = try Normalize.init(allocator);
222 defer n.deinit(allocator); 180 defer n.deinit(allocator);
223
224 var buf: [18]u21 = undefined; 181 var buf: [18]u21 = undefined;
225 182
226 var dc = n.decompose('é', .nfd, &buf); 183 var dc = n.decompose('é', .nfd, &buf);
@@ -280,17 +237,17 @@ pub const Result = struct {
280}; 237};
281 238
282// Compares code points by Canonical Combining Class order. 239// Compares code points by Canonical Combining Class order.
283fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { 240fn cccLess(_: void, lhs: u21, rhs: u21) bool {
284 return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); 241 return CombiningData.ccc(lhs) < CombiningData.ccc(rhs);
285} 242}
286 243
287// Applies the Canonical Sorting Algorithm. 244// Applies the Canonical Sorting Algorithm.
288fn canonicalSort(self: Normalize, cps: []u21) void { 245fn canonicalSort(cps: []u21) void {
289 var i: usize = 0; 246 var i: usize = 0;
290 while (i < cps.len) : (i += 1) { 247 while (i < cps.len) : (i += 1) {
291 const start: usize = i; 248 const start: usize = i;
292 while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} 249 while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {}
293 mem.sort(u21, cps[start..i], self, cccLess); 250 mem.sort(u21, cps[start..i], {}, cccLess);
294 } 251 }
295} 252}
296 253
@@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
320 } 277 }
321 } 278 }
322 279
323 self.canonicalSort(dcp_list.items); 280 canonicalSort(dcp_list.items);
324 281
325 return try dcp_list.toOwnedSlice(); 282 return try dcp_list.toOwnedSlice();
326} 283}
@@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
346 303
347test "nfd ASCII / no-alloc" { 304test "nfd ASCII / no-alloc" {
348 const allocator = testing.allocator; 305 const allocator = testing.allocator;
349 const n = try Normalize.init(allocator); 306 var n = try Normalize.init(allocator);
350 defer n.deinit(allocator); 307 defer n.deinit(allocator);
351 308
352 const result = try n.nfd(allocator, "Hello World!"); 309 const result = try n.nfd(allocator, "Hello World!");
@@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" {
357 314
358test "nfd !ASCII / alloc" { 315test "nfd !ASCII / alloc" {
359 const allocator = testing.allocator; 316 const allocator = testing.allocator;
360 const n = try Normalize.init(allocator); 317 var n = try Normalize.init(allocator);
361 defer n.deinit(allocator); 318 defer n.deinit(allocator);
362 319
363 const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 320 const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" {
368 325
369test "nfkd ASCII / no-alloc" { 326test "nfkd ASCII / no-alloc" {
370 const allocator = testing.allocator; 327 const allocator = testing.allocator;
371 const n = try Normalize.init(allocator); 328 var n = try Normalize.init(allocator);
372 defer n.deinit(allocator); 329 defer n.deinit(allocator);
373 330
374 const result = try n.nfkd(allocator, "Hello World!"); 331 const result = try n.nfkd(allocator, "Hello World!");
@@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" {
379 336
380test "nfkd !ASCII / alloc" { 337test "nfkd !ASCII / alloc" {
381 const allocator = testing.allocator; 338 const allocator = testing.allocator;
382 const n = try Normalize.init(allocator); 339 var n = try Normalize.init(allocator);
383 defer n.deinit(allocator); 340 defer n.deinit(allocator);
384 341
385 const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 342 const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -408,7 +365,7 @@ pub fn nfdCodePoints(
408 } 365 }
409 } 366 }
410 367
411 self.canonicalSort(dcp_list.items); 368 canonicalSort(dcp_list.items);
412 369
413 return try dcp_list.toOwnedSlice(); 370 return try dcp_list.toOwnedSlice();
414} 371}
@@ -433,15 +390,15 @@ pub fn nfkdCodePoints(
433 } 390 }
434 } 391 }
435 392
436 self.canonicalSort(dcp_list.items); 393 canonicalSort(dcp_list.items);
437 394
438 return try dcp_list.toOwnedSlice(); 395 return try dcp_list.toOwnedSlice();
439} 396}
440 397
441// Composition (NFC, NFKC) 398// Composition (NFC, NFKC)
442 399
443fn isHangul(self: Normalize, cp: u21) bool { 400fn isHangul(cp: u21) bool {
444 return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; 401 return cp >= 0x1100 and HangulData.syllable(cp) != .none;
445} 402}
446 403
447/// Normalizes `str` to NFC. 404/// Normalizes `str` to NFC.
@@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
479 block_check: while (i < dcps.len) : (i += 1) { 436 block_check: while (i < dcps.len) : (i += 1) {
480 const C = dcps[i]; 437 const C = dcps[i];
481 if (C == tombstone) continue :block_check; 438 if (C == tombstone) continue :block_check;
482 const cc_C = self.ccc_data.ccc(C); 439 const cc_C = CombiningData.ccc(C);
483 var starter_index: ?usize = null; 440 var starter_index: ?usize = null;
484 var j: usize = i; 441 var j: usize = i;
485 442
@@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
489 if (dcps[j] == tombstone) continue; 446 if (dcps[j] == tombstone) continue;
490 447
491 // Check for starter. 448 // Check for starter.
492 if (self.ccc_data.isStarter(dcps[j])) { 449 if (CombiningData.isStarter(dcps[j])) {
493 // Check for blocking conditions. 450 // Check for blocking conditions.
494 for (dcps[(j + 1)..i]) |B| { 451 for (dcps[(j + 1)..i]) |B| {
495 if (B == tombstone) continue; 452 if (B == tombstone) continue;
496 const cc_B = self.ccc_data.ccc(B); 453 const cc_B = CombiningData.ccc(B);
497 if (cc_B != 0 and self.isHangul(C)) continue :block_check; 454 if (cc_B != 0 and isHangul(C)) continue :block_check;
498 if (cc_B >= cc_C) continue :block_check; 455 if (cc_B >= cc_C) continue :block_check;
499 } 456 }
500 457
@@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
515 472
516 // If L and C are Hangul syllables, we can compose 473 // If L and C are Hangul syllables, we can compose
517 // them algorithmically if possible. 474 // them algorithmically if possible.
518 if (self.isHangul(L) and self.isHangul(C)) { 475 if (isHangul(L) and isHangul(C)) {
519 // Get Hangul syllable types. 476 // Get Hangul syllable types.
520 const l_stype = self.hangul_data.syllable(L); 477 const l_stype = HangulData.syllable(L);
521 const c_stype = self.hangul_data.syllable(C); 478 const c_stype = HangulData.syllable(C);
522 479
523 if (l_stype == .LV and c_stype == .T) { 480 if (l_stype == .LV and c_stype == .T) {
524 // LV, T canonical composition. 481 // LV, T canonical composition.
@@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
547 // Composition Exclusions (FCX) list, 504 // Composition Exclusions (FCX) list,
548 // preventing it from appearing in any 505 // preventing it from appearing in any
549 // composed form (NFC, NFKC). 506 // composed form (NFC, NFKC).
550 if (!self.normp_data.isFcx(P)) { 507 if (!NormPropsData.isFcx(P)) {
551 dcps[sidx] = P; 508 dcps[sidx] = P;
552 dcps[i] = tombstone; // Mark for deletion. 509 dcps[i] = tombstone; // Mark for deletion.
553 deleted += 1; 510 deleted += 1;
@@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
577 534
578test "nfc" { 535test "nfc" {
579 const allocator = testing.allocator; 536 const allocator = testing.allocator;
580 const n = try Normalize.init(allocator); 537 var n = try Normalize.init(allocator);
581 defer n.deinit(allocator); 538 defer n.deinit(allocator);
582 539
583 const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 540 const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -588,7 +545,7 @@ test "nfc" {
588 545
589test "nfkc" { 546test "nfkc" {
590 const allocator = testing.allocator; 547 const allocator = testing.allocator;
591 const n = try Normalize.init(allocator); 548 var n = try Normalize.init(allocator);
592 defer n.deinit(allocator); 549 defer n.deinit(allocator);
593 550
594 const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 551 const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
609 566
610test "eql" { 567test "eql" {
611 const allocator = testing.allocator; 568 const allocator = testing.allocator;
612 const n = try Normalize.init(allocator); 569 var n = try Normalize.init(allocator);
613 defer n.deinit(allocator); 570 defer n.deinit(allocator);
614 571
615 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 572 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -666,13 +623,13 @@ const mem = std.mem;
666const simd = std.simd; 623const simd = std.simd;
667const testing = std.testing; 624const testing = std.testing;
668const unicode = std.unicode; 625const unicode = std.unicode;
669const Allocator = std.mem.Allocator; 626const Allocator = mem.Allocator;
670 627
671const ascii = @import("ascii"); 628const ascii = @import("ascii");
672const CodePointIterator = @import("code_point").Iterator; 629const CodePointIterator = @import("code_point").Iterator;
673 630
674const CanonData = @import("CanonData"); 631const CanonData = @import("CanonData");
675const CccData = @import("CombiningData"); 632const CombiningData = @import("CombiningData");
676const CompatData = @import("CompatData"); 633const CompatData = @import("CompatData");
677const HangulData = @import("HangulData"); 634const HangulData = @import("HangulData");
678const NormPropsData = @import("NormPropsData"); 635const NormPropsData = @import("NormPropsData");
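
A worked example of the Hangul arithmetic that decomposeHangul and the LV/LVT composition branch above rely on. SBase, TCount, NCount, and SCount appear in the hunk; LBase, VBase, and TBase are the standard UAX #15 values and are assumed here since this hunk does not show them.

const std = @import("std");

const SBase: u21 = 0xAC00;
const LBase: u21 = 0x1100;
const VBase: u21 = 0x1161;
const TBase: u21 = 0x11A7;
const TCount: u21 = 28;
const NCount: u21 = 588; // VCount * TCount

test "algorithmic Hangul decomposition of U+D55C (한)" {
    const cp: u21 = 0xD55C;
    const s_index = cp - SBase;
    const l = LBase + s_index / NCount;
    const v = VBase + (s_index % NCount) / TCount;
    const t = TBase + s_index % TCount;
    try std.testing.expectEqual(@as(u21, 0x1112), l); // HANGUL CHOSEONG HIEUH
    try std.testing.expectEqual(@as(u21, 0x1161), v); // HANGUL JUNGSEONG A
    try std.testing.expectEqual(@as(u21, 0x11AB), t); // HANGUL JONGSEONG NIEUN
    // An LV-only syllable has s_index % TCount == 0 and decomposes to just L, V.
}
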
diff --git a/src/comptime_map.zig b/src/comptime_map.zig
new file mode 100644
index 0000000..b2c4d11
--- /dev/null
+++ b/src/comptime_map.zig
@@ -0,0 +1,176 @@
1//! A copypasta of Vexu's comptime hashmap:
2//!
3//! https://github.com/Vexu/comptime_hash_map
4//!
5//! (License at base of file, thanks Vexu!)
6//!
7
8/// A comptime hashmap constructed with automatically selected hash and eql functions.
9pub fn AutoComptimeHashMap(comptime K: type, comptime V: type, comptime values: anytype) type {
10 return ComptimeHashMap(K, V, hash_map.AutoContext(K), values);
11}
12
13/// Builtin hashmap for strings as keys.
14pub fn ComptimeStringHashMap(comptime V: type, comptime values: anytype) type {
15 return ComptimeHashMap([]const u8, V, hash_map.StringContext, values);
16}
17
18/// A hashmap which is constructed at compile time from constant values.
19/// Intended to be used as a faster lookup table.
20pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, comptime values: anytype) type {
21 std.debug.assert(values.len != 0);
22 @setEvalBranchQuota(1000 * values.len);
23
24 const Entry = struct {
25 key: K = undefined,
26 val: V = undefined,
27 used: bool = false,
28 };
29
30 // ensure that the hash map will be at most 60% full
31 const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable;
32 comptime var slots = [1]Entry{.{}} ** size;
33 comptime var distance: [size]usize = .{0} ** size;
34
35 comptime var max_distance_from_start_index = 0;
36
37 slot_loop: for (values) |kv| {
38 var key: K = kv.@"0";
39 var value: V = kv.@"1";
40
41 const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1);
42
43 var roll_over = 0;
44 var distance_from_start_index = 0;
45 while (roll_over < size) : ({
46 roll_over += 1;
47 distance_from_start_index += 1;
48 }) {
49 const index = (start_index + roll_over) & (size - 1);
50 const entry = &slots[index];
51
52 if (entry.used and !ctx.eql(undefined, entry.key, key)) {
53 if (distance[index] < distance_from_start_index) {
54 // robin hood to the rescue
55 const tmp = slots[index];
56 max_distance_from_start_index = @max(max_distance_from_start_index, distance_from_start_index);
57 entry.* = .{
58 .used = true,
59 .key = key,
60 .val = value,
61 };
62 const tmp_distance = distance[index];
63 distance[index] = distance_from_start_index;
64 key = tmp.key;
65 value = tmp.val;
66 distance_from_start_index = tmp_distance;
67 }
68 continue;
69 }
70
71 max_distance_from_start_index = @max(distance_from_start_index, max_distance_from_start_index);
72 entry.* = .{
73 .used = true,
74 .key = key,
75 .val = value,
76 };
77 distance[index] = distance_from_start_index;
78 continue :slot_loop;
79 }
80 unreachable; // put into a full map
81 }
82
83 return struct {
84 const entries = slots;
85
86 pub fn has(key: K) bool {
87 return get(key) != null;
88 }
89
90 pub fn get(key: K) ?*const V {
91 const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1);
92 {
93 var roll_over: usize = 0;
94 while (roll_over <= max_distance_from_start_index) : (roll_over += 1) {
95 const index = (start_index + roll_over) & (size - 1);
96 const entry = &entries[index];
97
98 if (!entry.used) return null;
99 if (ctx.eql(undefined, entry.key, key)) return &entry.val;
100 }
101 }
102 return null;
103 }
104 };
105}
106
107test "basic usage" {
108 const map = ComptimeStringHashMap(usize, .{
109 .{ "foo", 1 },
110 .{ "bar", 2 },
111 .{ "baz", 3 },
112 .{ "quux", 4 },
113 .{ "Foo", 1 },
114 .{ "Bar", 2 },
115 .{ "Baz", 3 },
116 .{ "Quux", 4 },
117 });
118
119 try testing.expect(map.has("foo"));
120 try testing.expect(map.has("bar"));
121 try testing.expect(map.has("Foo"));
122 try testing.expect(map.has("Bar"));
123 try testing.expect(!map.has("zig"));
124 try testing.expect(!map.has("ziguana"));
125
126 try testing.expect(map.get("baz").?.* == 3);
127 try testing.expect(map.get("quux").?.* == 4);
128 try testing.expect(map.get("nah") == null);
129 try testing.expect(map.get("...") == null);
130}
131
132test "auto comptime hash map" {
133 const map = AutoComptimeHashMap(usize, []const u8, .{
134 .{ 1, "foo" },
135 .{ 2, "bar" },
136 .{ 3, "baz" },
137 .{ 45, "quux" },
138 });
139
140 try testing.expect(map.has(1));
141 try testing.expect(map.has(2));
142 try testing.expect(!map.has(4));
143 try testing.expect(!map.has(1_000_000));
144
145 try testing.expectEqualStrings("foo", map.get(1).?.*);
146 try testing.expectEqualStrings("bar", map.get(2).?.*);
147 try testing.expect(map.get(4) == null);
148 try testing.expect(map.get(4_000_000) == null);
149}
150
151const std = @import("std");
152const hash_map = std.hash_map;
153const testing = std.testing;
154const math = std.math;
155
156// MIT License
157//
158// Copyright (c) 2020 Veikka Tuominen
159//
160// Permission is hereby granted, free of charge, to any person obtaining a copy
161// of this software and associated documentation files (the "Software"), to deal
162// in the Software without restriction, including without limitation the rights
163// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
164// copies of the Software, and to permit persons to whom the Software is
165// furnished to do so, subject to the following conditions:
166//
167// The above copyright notice and this permission notice shall be included in all
168// copies or substantial portions of the Software.
169//
170// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
171// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
172// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
173// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
174// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
175// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
176// SOFTWARE.
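
The sizing line above (math.ceilPowerOfTwo(usize, values.len * 5 / 3)) keeps the table a power of two, which is what lets lookups use `hash & (size - 1)` instead of a modulo, and keeps occupancy near or below the 60% mentioned in the comment (integer division makes that bound approximate for very small value counts). A small self-contained check of the arithmetic:

const std = @import("std");

test "comptime map sizing" {
    const lens = [_]usize{ 3, 8, 100, 1000 };
    for (lens) |n| {
        const size = try std.math.ceilPowerOfTwo(usize, n * 5 / 3);
        try std.testing.expect(std.math.isPowerOfTwo(size));
        try std.testing.expect(size >= n); // every entry fits
        try std.testing.expect(10 * n <= 6 * size); // roughly 60% max occupancy
    }
}
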