summaryrefslogtreecommitdiff
path: root/src/Normalize.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-23 20:32:13 -0400
committerGravatar Jose Colon Rodriguez2024-03-23 20:32:13 -0400
commitbcd79d29b316f636af9d21c8ace61e9ba93180d9 (patch)
treecbe36a301c87b83506ba0c2183d86820cdd80e59 /src/Normalize.zig
parentRenamed Caser to Folder (diff)
downloadzg-bcd79d29b316f636af9d21c8ace61e9ba93180d9.tar.gz
zg-bcd79d29b316f636af9d21c8ace61e9ba93180d9.tar.xz
zg-bcd79d29b316f636af9d21c8ace61e9ba93180d9.zip
Rename CaseFold and Normalize
Diffstat (limited to 'src/Normalize.zig')
-rw-r--r--src/Normalize.zig774
1 file changed, 774 insertions, 0 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig
new file mode 100644
index 0000000..b5a54d1
--- /dev/null
+++ b/src/Normalize.zig
@@ -0,0 +1,774 @@
1//! Normalizer contains functions and methods that implement
2//! Unicode Normalization. You can normalize strings into NFC,
3//! NFKC, NFD, and NFKD normalization forms.
4
5const std = @import("std");
6const assert = std.debug.assert;
7const debug = std.debug;
8const fmt = std.fmt;
9const fs = std.fs;
10const heap = std.heap;
11const io = std.io;
12const mem = std.mem;
13const simd = std.simd;
14const testing = std.testing;
15const unicode = std.unicode;
16
17const ascii = @import("ascii");
18const CodePointIterator = @import("code_point").Iterator;
19pub const NormData = @import("NormData");
20
21norm_data: *const NormData,
22
23const Self = @This();
24
25const SBase: u21 = 0xAC00;
26const LBase: u21 = 0x1100;
27const VBase: u21 = 0x1161;
28const TBase: u21 = 0x11A7;
29const LCount: u21 = 19;
30const VCount: u21 = 21;
31const TCount: u21 = 28;
32const NCount: u21 = 588; // VCount * TCount
33const SCount: u21 = 11172; // LCount * NCount
34
/// Algorithmically decomposes a precomposed Hangul syllable into its
/// jamo (L, V and optional T parts), per Unicode chapter 3.12.
/// Returns null when `cp` is not a precomposed LV / LVT syllable.
/// The returned slice views into `buf`.
fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
    const stype = self.norm_data.hangul_data.syllable(cp);
    if (stype != .LV and stype != .LVT) return null;

    const s_index: u21 = cp - SBase;
    buf[0] = LBase + s_index / NCount; // L part
    buf[1] = VBase + (s_index % NCount) / TCount; // V part

    const t_index: u21 = s_index % TCount;
    const len: usize = if (t_index == 0) 2 else blk: {
        buf[2] = TBase + t_index; // T part (only for LVT syllables)
        break :blk 3;
    };

    return .{ .form = .nfd, .cps = buf[0..len] };
}
60
// Canonically composes an LV syllable with a trailing consonant (T jamo),
// producing the corresponding LVT syllable.
fn composeHangulCanon(lv_syllable: u21, trailing: u21) u21 {
    // `trailing` must be a T jamo in U+11A8...U+11C2.
    assert(trailing >= 0x11A8 and trailing <= 0x11C2);
    return lv_syllable + (trailing - TBase);
}
65
// Fully composes a leading consonant `l`, vowel `v`, and optional trailing
// consonant `t` (0 for none) into a precomposed Hangul syllable.
fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
    // `l` must be an L jamo and `v` a V jamo.
    assert(l >= 0x1100 and l <= 0x1112);
    assert(v >= 0x1161 and v <= 0x1175);

    const lv_index = (l - LBase) * NCount + (v - VBase) * TCount;
    if (t == 0) return SBase + lv_index;

    // `t` must be a T jamo when present.
    assert(t >= 0x11A8 and t <= 0x11C2);
    return SBase + lv_index + (t - TBase);
}
80
/// Unicode normalization forms, plus `.same` to signal that a code point
/// has no decomposition (it maps to itself).
const Form = enum {
    nfc, // canonical composition
    nfd, // canonical decomposition
    nfkc, // compatibility composition
    nfkd, // compatibility decomposition
    same, // no decomposition applies
};
88
/// The result of decomposing a single code point: the form produced
/// (`.same` when no decomposition applies) and the decomposed code points.
/// `cps` typically views into a caller-supplied buffer or static data.
const Decomp = struct {
    form: Form = .same,
    cps: []const u21 = &.{},
};
93
/// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
/// Only `.nfd` and `.nfkd` are valid; any other form panics. Returns a
/// `.same` result when the code point has no mapping for the requested form.
pub fn mapping(self: Self, cp: u21, form: Form) Decomp {
    switch (form) {
        .nfd => {
            const cps = self.norm_data.canon_data.toNfd(cp);
            return .{
                .form = if (cps.len == 0) .same else .nfd,
                .cps = cps,
            };
        },

        .nfkd => {
            // Prefer the compatibility mapping; fall back to the canonical one.
            const kcps = self.norm_data.compat_data.toNfkd(cp);
            if (kcps.len != 0) return .{ .form = .nfkd, .cps = kcps };

            const cps = self.norm_data.canon_data.toNfd(cp);
            return .{
                .form = if (cps.len == 0) .same else .nfkd,
                .cps = cps,
            };
        },

        else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."),
    }
}
119
/// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
/// Returns a `.same` result for ASCII and for code points already in the
/// requested form. The returned slice views into `buf`.
/// NOTE(review): the internal work stack holds 18 entries and results are
/// written into `buf` unchecked — `buf` appears to need at least 18 slots
/// (all call sites pass a `[18]u21`); confirm this bound against the UCD's
/// maximum full decomposition length.
pub fn decompose(
    self: Self,
    cp: u21,
    form: Form,
    buf: []u21,
) Decomp {
    // ASCII: never decomposes.
    if (cp < 128) return .{};

    // NFD / NFKD quick checks.
    switch (form) {
        .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
        .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
        else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
    }

    // Hangul precomposed syllable full decomposition.
    if (self.decomposeHangul(cp, buf)) |dc| return dc;

    // Full decomposition: expand mappings recursively using an explicit
    // work stack until every code point maps to itself.
    var dc = Decomp{ .form = form };

    var result_index: usize = 0;
    var work_index: usize = 1;

    // Start work with argument code point.
    var work = [_]u21{cp} ++ [_]u21{0} ** 17;

    while (work_index > 0) {
        // Pop the most recently pushed code point.
        work_index -= 1;
        const next = work[work_index];
        const m = self.mapping(next, form);

        // No more decompositions for this code point; emit it.
        if (m.form == .same) {
            buf[result_index] = next;
            result_index += 1;
            continue;
        }

        // Push the mapping in reverse so it pops in original order.
        // `i` starts at 1 because m.cps.len is 1 past the last code point.
        var i: usize = 1;
        while (i <= m.cps.len) : ({
            i += 1;
            work_index += 1;
        }) {
            work[work_index] = m.cps[m.cps.len - i];
        }
    }

    dc.cps = buf[0..result_index];

    return dc;
}
177
test "decompose" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    var buf: [18]u21 = undefined;

    // Canonical decomposition of a precomposed Latin letter.
    var dc = n.decompose('é', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    // U+3189 has only a compatibility decomposition, so NFD leaves it alone.
    dc = n.decompose('\u{3189}', .nfd, &buf);
    try testing.expect(dc.form == .same);
    try testing.expect(dc.cps.len == 0);

    dc = n.decompose('\u{3189}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);

    // Hangul syllables decompose algorithmically.
    dc = n.decompose('\u{ace1}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    // Note: the Hangul path always tags its result `.nfd`, even when
    // `.nfkd` was requested (see `decomposeHangul`).
    dc = n.decompose('\u{ace1}', .nfkd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{3d3}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3d3}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
}
222
/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
pub const Result = struct {
    // Null when `slice` borrows the caller's input (no allocation happened,
    // e.g. the ASCII fast path); then `deinit` is a no-op.
    allocator: ?mem.Allocator = null,
    slice: []const u8,

    /// Frees `slice` when it was allocated by this namespace.
    pub fn deinit(self: *Result) void {
        if (self.allocator) |allocator| allocator.free(self.slice);
    }
};
232
// Compares code points by Canonical Combining Class order.
// Used as the `lessThan` predicate for `mem.sort` in `canonicalSort`.
fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
    return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs);
}
237
// Applies the Canonical Ordering Algorithm: each maximal run of
// non-starter code points (ccc != 0) is sorted by ascending combining
// class. Starters (ccc == 0) are left in place.
fn canonicalSort(self: Self, cps: []u21) void {
    var idx: usize = 0;
    while (idx < cps.len) : (idx += 1) {
        const run_start = idx;
        // Advance past the run of non-starters beginning at `run_start`.
        while (idx < cps.len and self.norm_data.ccc_data.ccc(cps[idx]) != 0) idx += 1;
        mem.sort(u21, cps[run_start..idx], self, cccLess);
    }
}
247
/// Normalize `str` to NFD (canonical decomposition).
/// Caller must call `deinit` on the returned `Result`.
pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxd(allocator, str, .nfd);
}
252
/// Normalize `str` to NFKD (compatibility decomposition).
/// Caller must call `deinit` on the returned `Result`.
pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxd(allocator, str, .nfkd);
}
257
/// Decomposes every code point of `str` to `form` (`.nfd` or `.nfkd`) and
/// applies canonical ordering. Caller owns the returned slice.
pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 {
    var out = std.ArrayList(u21).init(allocator);
    defer out.deinit();

    var decomp_buf: [18]u21 = undefined;
    var iter = CodePointIterator{ .bytes = str };

    while (iter.next()) |cp| {
        const dc = self.decompose(cp.code, form, &decomp_buf);
        switch (dc.form) {
            // No decomposition: keep the original code point.
            .same => try out.append(cp.code),
            else => try out.appendSlice(dc.cps),
        }
    }

    self.canonicalSort(out.items);

    return try out.toOwnedSlice();
}
278
// Shared implementation for `nfd` / `nfkd`: decompose to code points,
// then UTF-8 encode the result.
fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
    // ASCII is already in every normalization form; borrow the input.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };

    const dcps = try self.nfxdCodePoints(allocator, str, form);
    defer allocator.free(dcps);

    var bytes = std.ArrayList(u8).init(allocator);
    defer bytes.deinit();
    var cp_buf: [4]u8 = undefined;

    for (dcps) |dcp| {
        const n = try unicode.utf8Encode(dcp, &cp_buf);
        try bytes.appendSlice(cp_buf[0..n]);
    }

    return Result{ .allocator = allocator, .slice = try bytes.toOwnedSlice() };
}
297
298test "nfd ASCII / no-alloc" {
299 const allocator = testing.allocator;
300 var data = try NormData.init(allocator);
301 defer data.deinit();
302 var n = Self{ .norm_data = &data };
303
304 var result = try n.nfd(allocator, "Hello World!");
305 defer result.deinit();
306
307 try testing.expectEqualStrings("Hello World!", result.slice);
308}
309
310test "nfd !ASCII / alloc" {
311 const allocator = testing.allocator;
312 var data = try NormData.init(allocator);
313 defer data.deinit();
314 var n = Self{ .norm_data = &data };
315
316 var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
317 defer result.deinit();
318
319 try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
320}
321
322test "nfkd ASCII / no-alloc" {
323 const allocator = testing.allocator;
324 var data = try NormData.init(allocator);
325 defer data.deinit();
326 var n = Self{ .norm_data = &data };
327
328 var result = try n.nfkd(allocator, "Hello World!");
329 defer result.deinit();
330
331 try testing.expectEqualStrings("Hello World!", result.slice);
332}
333
334test "nfkd !ASCII / alloc" {
335 const allocator = testing.allocator;
336 var data = try NormData.init(allocator);
337 defer data.deinit();
338 var n = Self{ .norm_data = &data };
339
340 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
341 defer result.deinit();
342
343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
344}
345
/// Decomposes a slice of code points to NFD and applies canonical
/// ordering. Caller owns the returned slice.
pub fn nfdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var out = std.ArrayList(u21).init(allocator);
    defer out.deinit();

    var decomp_buf: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfd, &decomp_buf);
        switch (dc.form) {
            // No decomposition: keep the original code point.
            .same => try out.append(cp),
            else => try out.appendSlice(dc.cps),
        }
    }

    self.canonicalSort(out.items);

    return try out.toOwnedSlice();
}
370
/// Decomposes a slice of code points to NFKD and applies canonical
/// ordering. Caller owns the returned slice.
pub fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var out = std.ArrayList(u21).init(allocator);
    defer out.deinit();

    var decomp_buf: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfkd, &decomp_buf);
        switch (dc.form) {
            // No decomposition: keep the original code point.
            .same => try out.append(cp),
            else => try out.appendSlice(dc.cps),
        }
    }

    self.canonicalSort(out.items);

    return try out.toOwnedSlice();
}
395
396// Composition (NFC, NFKC)
397
// True if `cp` is known to the Hangul syllable data (jamo or precomposed
// syllable). The cheap range check skips everything below U+1100 first.
fn isHangul(self: Self, cp: u21) bool {
    return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
}
401
/// Normalizes `str` to NFC (canonical composition).
/// Caller must call `deinit` on the returned `Result`.
pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxc(allocator, str, .nfc);
}
406
/// Normalizes `str` to NFKC (compatibility composition).
/// Caller must call `deinit` on the returned `Result`.
pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxc(allocator, str, .nfkc);
}
411
// Shared implementation for `nfc` / `nfkc`: fully decompose, then apply
// the Unicode Canonical Composition Algorithm until a fixed point.
fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
    // Quick checks: ASCII (and Latin-1 for NFC) is already composed, so
    // the input can be borrowed without allocating.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };

    // Decompose first.
    var dcps = if (form == .nfc)
        try self.nfxdCodePoints(allocator, str, .nfd)
    else
        try self.nfxdCodePoints(allocator, str, .nfkd);
    defer allocator.free(dcps);

    // Compose. Deleted code points are overwritten with a marker instead
    // of being removed, avoiding element shifts on every composition.
    // NOTE(review): an actual U+E000 in the input would collide with this
    // marker and be silently dropped — confirm inputs never contain it.
    const tombstone = 0xe000; // Start of BMP Private Use Area

    // Loop over all decomposed code points, repeating until a full pass
    // performs no composition (deleted == 0).
    while (true) {
        var i: usize = 1; // start at second code point.
        var deleted: usize = 0;

        // For each code point, C, find the preceding
        // starter code point L, if any.
        block_check: while (i < dcps.len) : (i += 1) {
            const C = dcps[i];
            if (C == tombstone) continue :block_check;
            const cc_C = self.norm_data.ccc_data.ccc(C);
            var starter_index: ?usize = null;
            var j: usize = i;

            // Seek back to find starter L, if any.
            while (true) {
                j -= 1;
                if (dcps[j] == tombstone) continue;

                // Check for starter.
                if (self.norm_data.ccc_data.isStarter(dcps[j])) {
                    // Check for blocking conditions: any combining mark
                    // between L and C with ccc >= ccc(C) blocks composition.
                    for (dcps[(j + 1)..i]) |B| {
                        if (B == tombstone) continue;
                        const cc_B = self.norm_data.ccc_data.ccc(B);
                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
                        if (cc_B >= cc_C) continue :block_check;
                    }

                    // Found starter at j.
                    starter_index = j;
                    break;
                }

                if (j == 0) break;
            }

            // If we have a starter L, see if there's a primary
            // composite, P, for the sequence L, C. If so, we must
            // replace L with P and delete C.
            if (starter_index) |sidx| {
                const L = dcps[sidx];
                var processed_hangul = false;

                // If L and C are Hangul syllables, we can compose
                // them algorithmically if possible.
                if (self.isHangul(L) and self.isHangul(C)) {
                    // Get Hangul syllable types.
                    const l_stype = self.norm_data.hangul_data.syllable(L);
                    const c_stype = self.norm_data.hangul_data.syllable(C);

                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T canonical composition.
                        dcps[sidx] = composeHangulCanon(L, C);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (l_stype == .L and c_stype == .V) {
                        // L, V full composition. L, V, T is handled via main loop.
                        dcps[sidx] = composeHangulFull(L, C, 0);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (processed_hangul) deleted += 1;
                }

                // If no composition has occurred yet.
                if (!processed_hangul) {
                    // L, C are not Hangul, so check for primary composite
                    // in the Unicode Character Database.
                    if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
                        // We have a primary composite P for L, C.
                        // We must check if P is not in the Full
                        // Composition Exclusions (FCX) list,
                        // preventing it from appearing in any
                        // composed form (NFC, NFKC).
                        if (!self.norm_data.normp_data.isFcx(P)) {
                            dcps[sidx] = P;
                            dcps[i] = tombstone; // Mark for deletion.
                            deleted += 1;
                        }
                    }
                }
            }
        }

        // If we have no deletions, the code point sequence has been fully
        // composed; encode it to UTF-8, skipping tombstones, and return.
        if (deleted == 0) {
            var cstr_list = std.ArrayList(u8).init(allocator);
            defer cstr_list.deinit();
            var buf: [4]u8 = undefined;

            for (dcps) |cp| {
                if (cp == tombstone) continue; // "Delete"
                const len = try unicode.utf8Encode(cp, &buf);
                try cstr_list.appendSlice(buf[0..len]);
            }

            return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
        }
    }
}
532
533test "nfc" {
534 const allocator = testing.allocator;
535 var data = try NormData.init(allocator);
536 defer data.deinit();
537 var n = Self{ .norm_data = &data };
538
539 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
540 defer result.deinit();
541
542 try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
543}
544
545test "nfkc" {
546 const allocator = testing.allocator;
547 var data = try NormData.init(allocator);
548 defer data.deinit();
549 var n = Self{ .norm_data = &data };
550
551 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
552 defer result.deinit();
553
554 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
555}
556
/// Tests for equality of `a` and `b` after normalizing to NFC.
/// Allocations made during normalization are freed before returning.
pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
    var nfc_a = try self.nfc(allocator, a);
    defer nfc_a.deinit();

    var nfc_b = try self.nfc(allocator, b);
    defer nfc_b.deinit();

    return mem.eql(u8, nfc_a.slice, nfc_b.slice);
}
566
567test "eql" {
568 const allocator = testing.allocator;
569 var data = try NormData.init(allocator);
570 defer data.deinit();
571 var n = Self{ .norm_data = &data };
572
573 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
574 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
575}
576
// FCD support.
// Returns the canonical combining class of the FIRST code point of
// `cp`'s canonical decomposition (or of `cp` itself when it does not
// decompose).
fn getLeadCcc(self: Self, cp: u21) u8 {
    const dc = self.mapping(cp, .nfd);
    const dcp = if (dc.form == .same) cp else dc.cps[0];
    return self.norm_data.ccc_data.ccc(dcp);
}
583
// Returns the canonical combining class of the LAST code point of
// `cp`'s canonical decomposition (or of `cp` itself when it does not
// decompose).
fn getTrailCcc(self: Self, cp: u21) u8 {
    const dc = self.mapping(cp, .nfd);
    const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1];
    return self.norm_data.ccc_data.ccc(dcp);
}
589
/// Fast check to detect if a string is already in NFC or NFD form.
/// Returns false as soon as a code point's leading combining class is
/// non-zero yet lower than the previous code point's trailing class.
pub fn isFcd(self: Self, str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };
    var last_trail_ccc: u8 = 0;

    while (iter.next()) |cp| {
        const lead_ccc = self.getLeadCcc(cp.code);
        if (lead_ccc != 0 and lead_ccc < last_trail_ccc) return false;
        last_trail_ccc = self.getTrailCcc(cp.code);
    }

    return true;
}
601
602test "isFcd" {
603 const allocator = testing.allocator;
604 var data = try NormData.init(allocator);
605 defer data.deinit();
606 var n = Self{ .norm_data = &data };
607
608 const is_nfc = "José \u{3D3}";
609 try testing.expect(n.isFcd(is_nfc));
610
611 const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
612 try testing.expect(n.isFcd(is_nfd));
613
614 const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
615 try testing.expect(!n.isFcd(not_fcd));
616}
617
618test "Unicode normalization tests" {
619 var arena = heap.ArenaAllocator.init(testing.allocator);
620 defer arena.deinit();
621 var allocator = arena.allocator();
622
623 var data = try NormData.init(allocator);
624 defer data.deinit();
625 var n = Self{ .norm_data = &data };
626
627 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
628 defer file.close();
629 var buf_reader = io.bufferedReader(file.reader());
630 const input_stream = buf_reader.reader();
631
632 var line_no: usize = 0;
633 var buf: [4096]u8 = undefined;
634 var cp_buf: [4]u8 = undefined;
635
636 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
637 line_no += 1;
638 // Skip comments or empty lines.
639 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
640 // Iterate over fields.
641 var fields = mem.split(u8, line, ";");
642 var field_index: usize = 0;
643 var input: []u8 = undefined;
644 defer allocator.free(input);
645
646 while (fields.next()) |field| : (field_index += 1) {
647 if (field_index == 0) {
648 var i_buf = std.ArrayList(u8).init(allocator);
649 defer i_buf.deinit();
650
651 var i_fields = mem.split(u8, field, " ");
652 while (i_fields.next()) |s| {
653 const icp = try fmt.parseInt(u21, s, 16);
654 const len = try unicode.utf8Encode(icp, &cp_buf);
655 try i_buf.appendSlice(cp_buf[0..len]);
656 }
657
658 input = try i_buf.toOwnedSlice();
659 } else if (field_index == 1) {
660 //debug.print("\n*** {s} ***\n", .{line});
661 // NFC, time to test.
662 var w_buf = std.ArrayList(u8).init(allocator);
663 defer w_buf.deinit();
664
665 var w_fields = mem.split(u8, field, " ");
666 while (w_fields.next()) |s| {
667 const wcp = try fmt.parseInt(u21, s, 16);
668 const len = try unicode.utf8Encode(wcp, &cp_buf);
669 try w_buf.appendSlice(cp_buf[0..len]);
670 }
671
672 const want = w_buf.items;
673 var got = try n.nfc(allocator, input);
674 defer got.deinit();
675
676 try testing.expectEqualStrings(want, got.slice);
677 } else if (field_index == 2) {
678 // NFD, time to test.
679 var w_buf = std.ArrayList(u8).init(allocator);
680 defer w_buf.deinit();
681
682 var w_fields = mem.split(u8, field, " ");
683 while (w_fields.next()) |s| {
684 const wcp = try fmt.parseInt(u21, s, 16);
685 const len = try unicode.utf8Encode(wcp, &cp_buf);
686 try w_buf.appendSlice(cp_buf[0..len]);
687 }
688
689 const want = w_buf.items;
690 var got = try n.nfd(allocator, input);
691 defer got.deinit();
692
693 try testing.expectEqualStrings(want, got.slice);
694 } else if (field_index == 3) {
695 // NFKC, time to test.
696 var w_buf = std.ArrayList(u8).init(allocator);
697 defer w_buf.deinit();
698
699 var w_fields = mem.split(u8, field, " ");
700 while (w_fields.next()) |s| {
701 const wcp = try fmt.parseInt(u21, s, 16);
702 const len = try unicode.utf8Encode(wcp, &cp_buf);
703 try w_buf.appendSlice(cp_buf[0..len]);
704 }
705
706 const want = w_buf.items;
707 var got = try n.nfkc(allocator, input);
708 defer got.deinit();
709
710 try testing.expectEqualStrings(want, got.slice);
711 } else if (field_index == 4) {
712 // NFKD, time to test.
713 var w_buf = std.ArrayList(u8).init(allocator);
714 defer w_buf.deinit();
715
716 var w_fields = mem.split(u8, field, " ");
717 while (w_fields.next()) |s| {
718 const wcp = try fmt.parseInt(u21, s, 16);
719 const len = try unicode.utf8Encode(wcp, &cp_buf);
720 try w_buf.appendSlice(cp_buf[0..len]);
721 }
722
723 const want = w_buf.items;
724 var got = try n.nfkd(allocator, input);
725 defer got.deinit();
726
727 try testing.expectEqualStrings(want, got.slice);
728 } else {
729 continue;
730 }
731 }
732 }
733}
734
/// Returns true if `str` only contains Latin-1 code points
/// (U+0000...U+00FF: ASCII plus the Latin-1 Supplement).
/// Uses SIMD if possible.
///
/// Fix: the previous threshold `> 256` wrongly accepted U+0100
/// (LATIN CAPITAL LETTER A WITH MACRON), which lies outside Latin-1;
/// Latin-1's last code point is U+00FF = 255.
pub fn isLatin1Only(str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };

    // No SIMD support on this target: plain scalar scan.
    const vec_len = simd.suggestVectorLength(u21) orelse return blk: {
        break :blk while (cp_iter.next()) |cp| {
            if (cp.code > 255) break false;
        } else true;
    };

    const Vec = @Vector(vec_len, u21);

    // Process `vec_len` code points per iteration.
    outer: while (true) {
        var v1: Vec = undefined;
        const saved_cp_i = cp_iter.i;

        for (0..vec_len) |i| {
            if (cp_iter.next()) |cp| {
                v1[i] = cp.code;
            } else {
                // Not enough code points left for a full vector; rewind
                // the iterator and finish with the scalar tail loop.
                cp_iter.i = saved_cp_i;
                break :outer;
            }
        }

        // Any code point above U+00FF disqualifies the string.
        const max_latin1: Vec = @splat(255);
        if (@reduce(.Or, v1 > max_latin1)) return false;
    }

    // Scalar tail: the remaining code points that didn't fill a vector.
    return while (cp_iter.next()) |cp| {
        if (cp.code > 255) break false;
    } else true;
}
768
769test "isLatin1Only" {
770 const latin1_only = "Hello, World! \u{fe} \u{ff}";
771 try testing.expect(isLatin1Only(latin1_only));
772 const not_latin1_only = "Héllo, World! \u{3d3}";
773 try testing.expect(!isLatin1Only(not_latin1_only));
774}