path: root/src/Normalizer.zig
author     Jose Colon Rodriguez    2024-02-20 09:13:36 -0400
committer  Jose Colon Rodriguez    2024-02-20 09:13:36 -0400
commit     134a7df8206aa66ca7b0abbc7f31f08410b502d2 (patch)
tree       e3c520dce4f529de5290bc145847e78fb5583bee /src/Normalizer.zig
parent     Cleaned up directory structure (diff)
download   zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.tar.gz
           zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.tar.xz
           zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.zip
Replaced ccc_map with table. 20ms faster
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r--  src/Normalizer.zig  863
1 file changed, 863 insertions, 0 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
new file mode 100644
index 0000000..1b4a2d5
--- /dev/null
+++ b/src/Normalizer.zig
@@ -0,0 +1,863 @@
//! Normalizer contains functions and methods that implement Unicode Normalization algorithms. You can normalize strings
//! into NFC, NFKC, NFD, and NFKD normalization forms (see `nfc`, `nfkc`, `nfd`, and `nfkd`). You can also test for
//! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`).
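//!
//! A minimal usage sketch (assuming `allocator` is a `std.mem.Allocator` and this file is
//! imported as `Normalizer`; illustrative, not exhaustive):
//!
//!     var n = try Normalizer.init(allocator);
//!     defer n.deinit();
//!
//!     var result = try n.nfc(allocator, "e\u{301}"); // 'e' + U+0301 COMBINING ACUTE ACCENT
//!     defer result.deinit();
//!     // result.slice now holds "\u{e9}" ("é") in NFC.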

const std = @import("std");

const CodePointIterator = @import("code_point").Iterator;
const case_fold_map = @import("ziglyph").case_folding;
const hangul_map = @import("ziglyph").hangul;
const norm_props = @import("ziglyph").normalization_props;
const normp = @import("normp");

const Self = @This();

nfc_map: std.AutoHashMap([2]u21, u21),
nfd_map: std.AutoHashMap(u21, [2]u21),
nfkd_map: std.AutoHashMap(u21, [18]u21),

pub fn init(allocator: std.mem.Allocator) !Self {
    var self = Self{
        .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator),
        .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator),
        .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
    };
    errdefer self.deinit();

    // Canonical compositions
    const decompressor = std.compress.deflate.decompressor;
    const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate");
    var comp_stream = std.io.fixedBufferStream(comp_file);
    var comp_decomp = try decompressor(allocator, comp_stream.reader(), null);
    defer comp_decomp.deinit();

    var comp_buf = std.io.bufferedReader(comp_decomp.reader());
    const comp_reader = comp_buf.reader();
    var buf: [4096]u8 = undefined;

    while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        if (line.len == 0) continue;
        var fields = std.mem.split(u8, line, ";");
        const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16);
        const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16);
        const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16);
        try self.nfc_map.put(.{ cp_a, cp_b }, cp_c);
    }

    // Canonical decompositions
    const decomp_file = @embedFile("autogen/canonical_decompositions.txt.deflate");
    var decomp_stream = std.io.fixedBufferStream(decomp_file);
    var decomp_decomp = try decompressor(allocator, decomp_stream.reader(), null);
    defer decomp_decomp.deinit();

    var decomp_buf = std.io.bufferedReader(decomp_decomp.reader());
    const decomp_reader = decomp_buf.reader();

    while (try decomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        if (line.len == 0) continue;
        var fields = std.mem.split(u8, line, ";");
        const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16);
        const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16);
        const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16);
        try self.nfd_map.put(cp_a, .{ cp_b, cp_c });
    }

    // Compatibility decompositions
    const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate");
    var dekomp_stream = std.io.fixedBufferStream(dekomp_file);
    var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null);
    defer dekomp_decomp.deinit();

    var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader());
    const dekomp_reader = dekomp_buf.reader();

    while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        if (line.len == 0) continue;
        var fields = std.mem.split(u8, line, ";");
        const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16);
        var cps = [_]u21{0} ** 18;
        var i: usize = 0;

        while (fields.next()) |cp| : (i += 1) {
            cps[i] = try std.fmt.parseInt(u21, cp, 16);
        }

        try self.nfkd_map.put(cp_a, cps);
    }

    return self;
}

pub fn deinit(self: *Self) void {
    self.nfc_map.deinit();
    self.nfd_map.deinit();
    self.nfkd_map.deinit();
}

test "init / deinit" {
    var n = try init(std.testing.allocator);
    defer n.deinit();
}

// Hangul processing utilities.
fn isHangulPrecomposed(cp: u21) bool {
    if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT;
    return false;
}

const SBase: u21 = 0xAC00;
const LBase: u21 = 0x1100;
const VBase: u21 = 0x1161;
const TBase: u21 = 0x11A7;
const LCount: u21 = 19;
const VCount: u21 = 21;
const TCount: u21 = 28;
const NCount: u21 = 588; // VCount * TCount
const SCount: u21 = 11172; // LCount * NCount

fn decomposeHangul(cp: u21) [3]u21 {
    const SIndex: u21 = cp - SBase;
    const LIndex: u21 = SIndex / NCount;
    const VIndex: u21 = (SIndex % NCount) / TCount;
    const TIndex: u21 = SIndex % TCount;
    const LPart: u21 = LBase + LIndex;
    const VPart: u21 = VBase + VIndex;
    var TPart: u21 = 0;
    if (TIndex != 0) TPart = TBase + TIndex;

    return [3]u21{ LPart, VPart, TPart };
}
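
// Worked example (illustrative): for U+AC01 "각", SIndex = 0xAC01 - 0xAC00 = 1, so
// LIndex = 1 / 588 = 0, VIndex = (1 % 588) / 28 = 0, and TIndex = 1 % 28 = 1, giving the
// jamo sequence U+1100, U+1161, U+11A8 (L, V, T).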

fn composeHangulCanon(lv: u21, t: u21) u21 {
    std.debug.assert(0x11A8 <= t and t <= 0x11C2);
    return lv + (t - TBase);
}

fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
    std.debug.assert(0x1100 <= l and l <= 0x1112);
    std.debug.assert(0x1161 <= v and v <= 0x1175);
    const LIndex = l - LBase;
    const VIndex = v - VBase;
    const LVIndex = LIndex * NCount + VIndex * TCount;

    if (t == 0) return SBase + LVIndex;

    std.debug.assert(0x11A8 <= t and t <= 0x11C2);
    const TIndex = t - TBase;

    return SBase + LVIndex + TIndex;
}
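
// A small round-trip sketch of the Hangul arithmetic above; the code points are standard
// Unicode values, but this check is illustrative rather than exhaustive.
test "Hangul decompose / compose round trip" {
    const parts = decomposeHangul(0xAC01); // "각"
    try std.testing.expectEqualSlices(u21, &[_]u21{ 0x1100, 0x1161, 0x11A8 }, &parts);
    try std.testing.expectEqual(@as(u21, 0xAC01), composeHangulFull(parts[0], parts[1], parts[2]));
}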

const Form = enum {
    nfc,
    nfd,
    nfkc,
    nfkd,
    same,
};

const Decomp = struct {
    form: Form = .nfd,
    cps: [18]u21 = [_]u21{0} ** 18,
};

/// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
pub fn mapping(self: Self, cp: u21, form: Form) Decomp {
    std.debug.assert(form == .nfd or form == .nfkd);

    var dc = Decomp{ .form = .same };
    dc.cps[0] = cp;

    if (self.nfkd_map.get(cp)) |array| {
        if (form != .nfd) {
            dc.form = .nfkd;
            @memcpy(dc.cps[0..array.len], &array);
        }
    } else if (self.nfd_map.get(cp)) |array| {
        dc.form = .nfd;
        @memcpy(dc.cps[0..array.len], &array);
    }

    return dc;
}
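
// A usage sketch (assuming `n` is an initialized Normalizer):
//
//     const dc = n.mapping(0xE9, .nfd); // U+00E9 "é"
//     // dc.form == .nfd and dc.cps[0..2] hold .{ 0x65, 0x301 } ('e' + combining acute).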

/// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
    std.debug.assert(form == .nfd or form == .nfkd);

    var dc = Decomp{ .form = form };

    // ASCII or NFD / NFKD quick checks.
    if (cp <= 127 or (form == .nfd and norm_props.isNfd(cp)) or (form == .nfkd and norm_props.isNfkd(cp))) {
        dc.cps[0] = cp;
        return dc;
    }

    // Hangul precomposed syllable full decomposition.
    if (isHangulPrecomposed(cp)) {
        const cps = decomposeHangul(cp);
        @memcpy(dc.cps[0..cps.len], &cps);
        return dc;
    }

    // Full decomposition.
    var result_index: usize = 0;
    var work_index: usize = 1;

    // Start work with argument code point.
    var work = [_]u21{cp} ++ [_]u21{0} ** 17;

    while (work_index > 0) {
        // Look at previous code point in work queue.
        work_index -= 1;
        const next = work[work_index];
        const m = self.mapping(next, form);

        // No more decompositions for this code point.
        if (m.form == .same) {
            dc.cps[result_index] = m.cps[0];
            result_index += 1;
            continue;
        }

        // Find last index of decomposition.
        const m_last = for (m.cps, 0..) |mcp, i| {
            if (mcp == 0) break i;
        } else m.cps.len;

        // Work backwards through decomposition.
        // `i` starts at 1 because m_last is 1 past the last code point.
        var i: usize = 1;
        while (i <= m_last) : ({
            i += 1;
            work_index += 1;
        }) {
            work[work_index] = m.cps[m_last - i];
        }
    }

    return dc;
}

test "decompose" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var dc = n.decompose('é', .nfd);
    try std.testing.expect(dc.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfd);
    try std.testing.expect(dc.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfkd);
    try std.testing.expect(dc.form == .nfkd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3189}', .nfd);
    try std.testing.expect(dc.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]);

    dc = n.decompose('\u{3189}', .nfkd);
    try std.testing.expect(dc.form == .nfkd);
    try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);

    dc = n.decompose('\u{ace1}', .nfd);
    try std.testing.expect(dc.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{ace1}', .nfkd);
    try std.testing.expect(dc.form == .nfkd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{3d3}', .nfd);
    try std.testing.expect(dc.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3d3}', .nfkd);
    try std.testing.expect(dc.form == .nfkd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
}

// Some quick checks.

fn onlyAscii(str: []const u8) bool {
    return for (str) |b| {
        if (b > 127) break false;
    } else true;
}

fn onlyLatin1(str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };
    return while (cp_iter.next()) |cp| {
        if (cp.code > 256) break false;
    } else true;
}

/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
pub const Result = struct {
    allocator: ?std.mem.Allocator = null,
    slice: []const u8,

    pub fn deinit(self: *Result) void {
        if (self.allocator) |allocator| allocator.free(self.slice);
    }
};
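
// When a quick check succeeds (e.g. an all-ASCII input), the returned `Result` borrows the
// input string and `allocator` is null, so `deinit` is a no-op; otherwise `slice` is owned
// and must be freed. Calling `deinit` unconditionally, as in this sketch, is therefore safe:
//
//     var result = try n.nfd(allocator, input);
//     defer result.deinit();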

// Compares code points by Canonical Combining Class order.
fn cccLess(_: void, lhs: u21, rhs: u21) bool {
    const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)];
    const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)];
    return lcc < rcc;
}

// Applies the Canonical Sorting Algorithm.
fn canonicalSort(cps: []u21) void {
    var i: usize = 0;
    while (i < cps.len) : (i += 1) {
        const start: usize = i;
        while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {}
        std.mem.sort(u21, cps[start..i], {}, cccLess);
    }
}
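
// Worked example (illustrative): given 'q' (ccc 0), U+0301 (ccc 230), U+0323 (ccc 220), the
// run of non-starters after 'q' is sorted by combining class, yielding 'q', U+0323, U+0301,
// which is the canonical order required before composition.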

/// Normalize `str` to NFD.
pub fn nfd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
    return self.nfxd(allocator, str, .nfd);
}

/// Normalize `str` to NFKD.
pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
    return self.nfxd(allocator, str, .nfkd);
}

fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result {
    // Quick checks.
    if (onlyAscii(str)) return Result{ .slice = str };

    var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2);
    defer dcp_list.deinit();

    var cp_iter = CodePointIterator{ .bytes = str };
    while (cp_iter.next()) |cp| {
        const dc = self.decompose(cp.code, form);
        const slice = for (dc.cps, 0..) |dcp, i| {
            if (dcp == 0) break dc.cps[0..i];
        } else dc.cps[0..];
        try dcp_list.appendSlice(slice);
    }

    canonicalSort(dcp_list.items);

    var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4);
    defer dstr_list.deinit();

    var buf: [4]u8 = undefined;
    for (dcp_list.items) |dcp| {
        const len = try std.unicode.utf8Encode(dcp, &buf);
        dstr_list.appendSliceAssumeCapacity(buf[0..len]);
    }

    return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() };
}

test "nfd ASCII / no-alloc" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfd(allocator, "Hello World!");
    defer result.deinit();

    try std.testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfd !ASCII / alloc" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
}

test "nfkd ASCII / no-alloc" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfkd(allocator, "Hello World!");
    defer result.deinit();

    try std.testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfkd !ASCII / alloc" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
}

// Composition utilities.

fn isHangul(cp: u21) bool {
    return cp >= 0x1100 and hangul_map.syllableType(cp) != null;
}

fn isStarter(cp: u21) bool {
    return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0;
}

fn isCombining(cp: u21) bool {
    return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0;
}

fn isNonHangulStarter(cp: u21) bool {
    return !isHangul(cp) and isStarter(cp);
}

/// Normalizes `str` to NFC.
pub fn nfc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
    return self.nfxc(allocator, str, .nfc);
}

/// Normalizes `str` to NFKC.
pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
    return self.nfxc(allocator, str, .nfkc);
}

fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result {
    // Quick checks.
    if (onlyAscii(str)) return Result{ .slice = str };
    if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str };

    // Decompose first.
    var d_result = if (form == .nfc)
        try self.nfd(allocator, str)
    else
        try self.nfkd(allocator, str);
    defer d_result.deinit();

    // Get code points.
    var cp_iter = CodePointIterator{ .bytes = d_result.slice };

    var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len);
    defer d_list.deinit();

    while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code);

    // Compose
    const tombstone = 0xe000; // Start of BMP Private Use Area

    while (true) {
        var i: usize = 1; // start at second code point.
        var deleted: usize = 0;

        block_check: while (i < d_list.items.len) : (i += 1) {
            const C = d_list.items[i];
            const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)];
            var starter_index: ?usize = null;
            var j: usize = i;

            while (true) {
                j -= 1;

                // Check for starter.
                if (isStarter(d_list.items[j])) {
                    if (i - j > 1) { // If there's distance between the starting point and the current position.
                        for (d_list.items[(j + 1)..i]) |B| {
                            // Check for blocking conditions.
                            if (isHangul(C)) {
                                if (isCombining(B) or isNonHangulStarter(B)) continue :block_check;
                            }
                            const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)];
                            if (cc_B >= cc_C) continue :block_check;
                        }
                    }

                    // Found starter at j.
                    starter_index = j;
                    break;
                }

                if (j == 0) break;
            }

            if (starter_index) |sidx| {
                const L = d_list.items[sidx];
                var processed_hangul = false;

                if (isHangul(L) and isHangul(C)) {
                    const l_stype = hangul_map.syllableType(L).?;
                    const c_stype = hangul_map.syllableType(C).?;

                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T
                        d_list.items[sidx] = composeHangulCanon(L, C);
                        d_list.items[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (l_stype == .L and c_stype == .V) {
                        // Handle L, V. L, V, T is handled via main loop.
                        d_list.items[sidx] = composeHangulFull(L, C, 0);
                        d_list.items[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (processed_hangul) deleted += 1;
                }

                if (!processed_hangul) {
                    // L -> C not Hangul.
                    if (self.nfc_map.get(.{ L, C })) |P| {
                        if (!norm_props.isFcx(P)) {
                            d_list.items[sidx] = P;
                            d_list.items[i] = tombstone; // Mark for deletion.
                            deleted += 1;
                        }
                    }
                }
            }
        }

        // Check if finished.
        if (deleted == 0) {
            var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4);
            defer cstr_list.deinit();
            var buf: [4]u8 = undefined;

            for (d_list.items) |cp| {
                if (cp == tombstone) continue; // "Delete"
                const len = try std.unicode.utf8Encode(cp, &buf);
                cstr_list.appendSliceAssumeCapacity(buf[0..len]);
            }

            return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
        }

        // Otherwise update code points list.
        var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted);
        defer tmp_d_list.deinit();

        for (d_list.items) |cp| {
            if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp);
        }

        d_list.clearRetainingCapacity();
        d_list.appendSliceAssumeCapacity(tmp_d_list.items);
    }
}

test "nfc" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer result.deinit();

    try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
}

test "nfkc" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer result.deinit();

    try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
}
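
// An additional illustrative check of the Hangul branch of composition above; a sketch using
// standard jamo code points rather than an exhaustive conformance test.
test "nfc Hangul" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfc(allocator, "\u{1100}\u{1161}\u{11A8}");
    defer result.deinit();

    try std.testing.expectEqualStrings("\u{AC01}", result.slice);
}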

/// Tests for equality as per Unicode rules for Identifiers.
pub fn eqlIdentifiers(allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool {
    var list_a = try std.ArrayList(u21).initCapacity(allocator, a.len);
    defer list_a.deinit();
    var list_b = try std.ArrayList(u21).initCapacity(allocator, b.len);
    defer list_b.deinit();

    const Item = struct {
        str: []const u8,
        list: *std.ArrayList(u21),
    };

    const items = [_]Item{
        .{ .str = a, .list = &list_a },
        .{ .str = b, .list = &list_b },
    };

    for (items) |item| {
        var cp_iter = CodePointIterator{ .bytes = item.str };
        while (cp_iter.next()) |cp| {
            if (norm_props.toNfkcCaseFold(cp.code)) |nfkcf| {
                for (nfkcf) |c| {
                    if (c == 0) break;
                    item.list.appendAssumeCapacity(c);
                }
            } else {
                item.list.appendAssumeCapacity(cp.code); // maps to itself
            }
        }
    }

    return std.mem.eql(u21, list_a.items, list_b.items);
}

test "eqlIdentifiers" {
    try std.testing.expect(try eqlIdentifiers(std.testing.allocator, "Foé", "foé"));
}

/// Tests for equality of `a` and `b` after normalizing to NFD.
pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool {
    var norm_result_a = try self.nfd(allocator, a);
    defer norm_result_a.deinit();
    var norm_result_b = try self.nfd(allocator, b);
    defer norm_result_b.deinit();

    return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
}

test "eql" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
    try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}

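// These appear to be the code points whose case folding involves U+0345 COMBINING GREEK
// YPOGEGRAMMENI (or precomposed letters containing it); folding them can disturb canonical
// ordering, so such strings are normalized to NFD before case folding (see `eqlCaseless`).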
fn requiresNfdBeforeCaseFold(cp: u21) bool {
    return switch (cp) {
        0x0345 => true,
        0x1F80...0x1FAF => true,
        0x1FB2...0x1FB4 => true,
        0x1FB7 => true,
        0x1FBC => true,
        0x1FC2...0x1FC4 => true,
        0x1FC7 => true,
        0x1FCC => true,
        0x1FF2...0x1FF4 => true,
        0x1FF7 => true,
        0x1FFC => true,
        else => false,
    };
}

fn requiresPreNfd(str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };

    return while (cp_iter.next()) |cp| {
        if (requiresNfdBeforeCaseFold(cp.code)) break true;
    } else false;
}

/// `eqlCaseless` tests for equality of `a` and `b` after normalizing to NFD and ignoring letter case.
pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool {
    // The long winding road of normalized caseless matching...
    // NFD(CaseFold(NFD(str))) or NFD(CaseFold(str))
    var norm_result_a: Result = Result{ .slice = a };
    if (requiresPreNfd(a)) {
        if (!self.isFcd(a)) {
            norm_result_a = try self.nfd(allocator, a);
        }
    }
    defer norm_result_a.deinit();

    const cf_a = try case_fold_map.caseFoldStr(allocator, norm_result_a.slice);
    defer allocator.free(cf_a);
    norm_result_a.deinit();
    norm_result_a = try self.nfd(allocator, cf_a);

    var norm_result_b: Result = Result{ .slice = b };
    if (requiresPreNfd(b)) {
        if (!self.isFcd(b)) {
            norm_result_b = try self.nfd(allocator, b);
        }
    }
    defer norm_result_b.deinit();

    const cf_b = try case_fold_map.caseFoldStr(allocator, norm_result_b.slice);
    defer allocator.free(cf_b);
    norm_result_b.deinit();
    norm_result_b = try self.nfd(allocator, cf_b);

    return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
}

test "eqlCaseless" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}"));
    try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // foÉ == foé
}

// FCD
fn getLeadCcc(self: Self, cp: u21) u8 {
    const dc = self.mapping(cp, .nfd);
    return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)];
}

fn getTrailCcc(self: Self, cp: u21) u8 {
    const dc = self.mapping(cp, .nfd);
    const len = for (dc.cps, 0..) |dcp, i| {
        if (dcp == 0) break i;
    } else dc.cps.len;
    const tcp = dc.cps[len -| 1];
    return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)];
}

/// Fast check (known as FCD) to detect if a string is already in a canonically ordered form such as NFC or NFD.
pub fn isFcd(self: Self, str: []const u8) bool {
    var prev_ccc: u8 = 0;
    var cp_iter = CodePointIterator{ .bytes = str };

    return while (cp_iter.next()) |cp| {
        const ccc = self.getLeadCcc(cp.code);
        if (ccc != 0 and ccc < prev_ccc) break false;
        prev_ccc = self.getTrailCcc(cp.code);
    } else true;
}

test "isFcd" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    const is_nfc = "José \u{3D3}";
    try std.testing.expect(n.isFcd(is_nfc));

    const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
    try std.testing.expect(n.isFcd(is_nfd));

    const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
    try std.testing.expect(!n.isFcd(not_fcd));
}

test "Unicode normalization tests" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    var allocator = arena.allocator();

    var n = try init(allocator);
    defer n.deinit();

    var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;
    var cp_buf: [4]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip comments or empty lines.
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
        // Iterate over fields.
        var fields = std.mem.split(u8, line, ";");
        var field_index: usize = 0;
        var input: []u8 = undefined;
        defer allocator.free(input);

        while (fields.next()) |field| : (field_index += 1) {
            if (field_index == 0) {
                var i_buf = std.ArrayList(u8).init(allocator);
                defer i_buf.deinit();

                var i_fields = std.mem.split(u8, field, " ");
                while (i_fields.next()) |s| {
                    const icp = try std.fmt.parseInt(u21, s, 16);
                    const len = try std.unicode.utf8Encode(icp, &cp_buf);
                    try i_buf.appendSlice(cp_buf[0..len]);
                }

                input = try i_buf.toOwnedSlice();
            } else if (field_index == 1) {
                //std.debug.print("\n*** {s} ***\n", .{line});
                // NFC, time to test.
                var w_buf = std.ArrayList(u8).init(allocator);
                defer w_buf.deinit();

                var w_fields = std.mem.split(u8, field, " ");
                while (w_fields.next()) |s| {
                    const wcp = try std.fmt.parseInt(u21, s, 16);
                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
                    try w_buf.appendSlice(cp_buf[0..len]);
                }

                const want = w_buf.items;
                var got = try n.nfc(allocator, input);
                defer got.deinit();

                try std.testing.expectEqualStrings(want, got.slice);
            } else if (field_index == 2) {
                // NFD, time to test.
                var w_buf = std.ArrayList(u8).init(allocator);
                defer w_buf.deinit();

                var w_fields = std.mem.split(u8, field, " ");
                while (w_fields.next()) |s| {
                    const wcp = try std.fmt.parseInt(u21, s, 16);
                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
                    try w_buf.appendSlice(cp_buf[0..len]);
                }

                const want = w_buf.items;
                var got = try n.nfd(allocator, input);
                defer got.deinit();

                try std.testing.expectEqualStrings(want, got.slice);
            } else if (field_index == 3) {
                // NFKC, time to test.
                var w_buf = std.ArrayList(u8).init(allocator);
                defer w_buf.deinit();

                var w_fields = std.mem.split(u8, field, " ");
                while (w_fields.next()) |s| {
                    const wcp = try std.fmt.parseInt(u21, s, 16);
                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
                    try w_buf.appendSlice(cp_buf[0..len]);
                }

                const want = w_buf.items;
                var got = try n.nfkc(allocator, input);
                defer got.deinit();

                try std.testing.expectEqualStrings(want, got.slice);
            } else if (field_index == 4) {
                // NFKD, time to test.
                var w_buf = std.ArrayList(u8).init(allocator);
                defer w_buf.deinit();

                var w_fields = std.mem.split(u8, field, " ");
                while (w_fields.next()) |s| {
                    const wcp = try std.fmt.parseInt(u21, s, 16);
                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
                    try w_buf.appendSlice(cp_buf[0..len]);
                }

                const want = w_buf.items;
                var got = try n.nfkd(allocator, input);
                defer got.deinit();

                try std.testing.expectEqualStrings(want, got.slice);
            } else {
                continue;
            }
        }
    }
}