summaryrefslogtreecommitdiff
path: root/src/Normalizer.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-28 20:30:48 -0400
committerGravatar Jose Colon Rodriguez2024-02-28 20:30:48 -0400
commit3735e5b7bbd8d0d25687f3080925084b9dbb938d (patch)
tree3e2113e0030f18879f65a5d3f24b723b27a5b95b /src/Normalizer.zig
parentUsing slices for decompositions in Normalizer (diff)
downloadzg-3735e5b7bbd8d0d25687f3080925084b9dbb938d.tar.gz
zg-3735e5b7bbd8d0d25687f3080925084b9dbb938d.tar.xz
zg-3735e5b7bbd8d0d25687f3080925084b9dbb938d.zip
Added nfc latin1 check back
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r--src/Normalizer.zig191
1 files changed, 121 insertions, 70 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 89cc50c..d32ad52 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -3,7 +3,16 @@
3//! NFKC, NFD, and NFKD normalization forms. 3//! NFKC, NFD, and NFKD normalization forms.
4 4
5const std = @import("std"); 5const std = @import("std");
6const assert = std.debug.assert;
7const debug = std.debug;
8const fmt = std.fmt;
9const fs = std.fs;
10const heap = std.heap;
11const io = std.io;
12const mem = std.mem;
13const simd = std.simd;
6const testing = std.testing; 14const testing = std.testing;
15const unicode = std.unicode;
7 16
8const ascii = @import("ascii"); 17const ascii = @import("ascii");
9const CodePointIterator = @import("code_point").Iterator; 18const CodePointIterator = @import("code_point").Iterator;
@@ -50,20 +59,20 @@ fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
50} 59}
51 60
52fn composeHangulCanon(lv: u21, t: u21) u21 { 61fn composeHangulCanon(lv: u21, t: u21) u21 {
53 std.debug.assert(0x11A8 <= t and t <= 0x11C2); 62 assert(0x11A8 <= t and t <= 0x11C2);
54 return lv + (t - TBase); 63 return lv + (t - TBase);
55} 64}
56 65
57fn composeHangulFull(l: u21, v: u21, t: u21) u21 { 66fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
58 std.debug.assert(0x1100 <= l and l <= 0x1112); 67 assert(0x1100 <= l and l <= 0x1112);
59 std.debug.assert(0x1161 <= v and v <= 0x1175); 68 assert(0x1161 <= v and v <= 0x1175);
60 const LIndex = l - LBase; 69 const LIndex = l - LBase;
61 const VIndex = v - VBase; 70 const VIndex = v - VBase;
62 const LVIndex = LIndex * NCount + VIndex * TCount; 71 const LVIndex = LIndex * NCount + VIndex * TCount;
63 72
64 if (t == 0) return SBase + LVIndex; 73 if (t == 0) return SBase + LVIndex;
65 74
66 std.debug.assert(0x11A8 <= t and t <= 0x11C2); 75 assert(0x11A8 <= t and t <= 0x11C2);
67 const TIndex = t - TBase; 76 const TIndex = t - TBase;
68 77
69 return SBase + LVIndex + TIndex; 78 return SBase + LVIndex + TIndex;
@@ -175,45 +184,45 @@ test "decompose" {
175 var buf: [18]u21 = undefined; 184 var buf: [18]u21 = undefined;
176 185
177 var dc = n.decompose('é', .nfd, &buf); 186 var dc = n.decompose('é', .nfd, &buf);
178 try std.testing.expect(dc.form == .nfd); 187 try testing.expect(dc.form == .nfd);
179 try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); 188 try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
180 189
181 dc = n.decompose('\u{1e0a}', .nfd, &buf); 190 dc = n.decompose('\u{1e0a}', .nfd, &buf);
182 try std.testing.expect(dc.form == .nfd); 191 try testing.expect(dc.form == .nfd);
183 try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); 192 try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
184 193
185 dc = n.decompose('\u{1e0a}', .nfkd, &buf); 194 dc = n.decompose('\u{1e0a}', .nfkd, &buf);
186 try std.testing.expect(dc.form == .nfkd); 195 try testing.expect(dc.form == .nfkd);
187 try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); 196 try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
188 197
189 dc = n.decompose('\u{3189}', .nfd, &buf); 198 dc = n.decompose('\u{3189}', .nfd, &buf);
190 try std.testing.expect(dc.form == .same); 199 try testing.expect(dc.form == .same);
191 try std.testing.expect(dc.cps.len == 0); 200 try testing.expect(dc.cps.len == 0);
192 201
193 dc = n.decompose('\u{3189}', .nfkd, &buf); 202 dc = n.decompose('\u{3189}', .nfkd, &buf);
194 try std.testing.expect(dc.form == .nfkd); 203 try testing.expect(dc.form == .nfkd);
195 try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); 204 try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
196 205
197 dc = n.decompose('\u{ace1}', .nfd, &buf); 206 dc = n.decompose('\u{ace1}', .nfd, &buf);
198 try std.testing.expect(dc.form == .nfd); 207 try testing.expect(dc.form == .nfd);
199 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); 208 try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
200 209
201 dc = n.decompose('\u{ace1}', .nfkd, &buf); 210 dc = n.decompose('\u{ace1}', .nfkd, &buf);
202 try std.testing.expect(dc.form == .nfd); 211 try testing.expect(dc.form == .nfd);
203 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); 212 try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
204 213
205 dc = n.decompose('\u{3d3}', .nfd, &buf); 214 dc = n.decompose('\u{3d3}', .nfd, &buf);
206 try std.testing.expect(dc.form == .nfd); 215 try testing.expect(dc.form == .nfd);
207 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); 216 try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
208 217
209 dc = n.decompose('\u{3d3}', .nfkd, &buf); 218 dc = n.decompose('\u{3d3}', .nfkd, &buf);
210 try std.testing.expect(dc.form == .nfkd); 219 try testing.expect(dc.form == .nfkd);
211 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); 220 try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
212} 221}
213 222
214/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. 223/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
215pub const Result = struct { 224pub const Result = struct {
216 allocator: ?std.mem.Allocator = null, 225 allocator: ?mem.Allocator = null,
217 slice: []const u8, 226 slice: []const u8,
218 227
219 pub fn deinit(self: *Result) void { 228 pub fn deinit(self: *Result) void {
@@ -232,25 +241,25 @@ fn canonicalSort(self: Self, cps: []u21) void {
232 while (i < cps.len) : (i += 1) { 241 while (i < cps.len) : (i += 1) {
233 const start: usize = i; 242 const start: usize = i;
234 while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} 243 while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
235 std.mem.sort(u21, cps[start..i], self, cccLess); 244 mem.sort(u21, cps[start..i], self, cccLess);
236 } 245 }
237} 246}
238 247
239/// Normalize `str` to NFD. 248/// Normalize `str` to NFD.
240pub fn nfd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { 249pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
241 return self.nfxd(allocator, str, .nfd); 250 return self.nfxd(allocator, str, .nfd);
242} 251}
243 252
244/// Normalize `str` to NFKD. 253/// Normalize `str` to NFKD.
245pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { 254pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
246 return self.nfxd(allocator, str, .nfkd); 255 return self.nfxd(allocator, str, .nfkd);
247} 256}
248 257
249fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { 258fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
250 // Quick checks. 259 // Quick checks.
251 if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; 260 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
252 261
253 var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len * 3); 262 var dcp_list = std.ArrayList(u21).init(allocator);
254 defer dcp_list.deinit(); 263 defer dcp_list.deinit();
255 264
256 var cp_iter = CodePointIterator{ .bytes = str }; 265 var cp_iter = CodePointIterator{ .bytes = str };
@@ -272,7 +281,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
272 281
273 var buf: [4]u8 = undefined; 282 var buf: [4]u8 = undefined;
274 for (dcp_list.items) |dcp| { 283 for (dcp_list.items) |dcp| {
275 const len = try std.unicode.utf8Encode(dcp, &buf); 284 const len = try unicode.utf8Encode(dcp, &buf);
276 dstr_list.appendSliceAssumeCapacity(buf[0..len]); 285 dstr_list.appendSliceAssumeCapacity(buf[0..len]);
277 } 286 }
278 287
@@ -288,7 +297,7 @@ test "nfd ASCII / no-alloc" {
288 var result = try n.nfd(allocator, "Hello World!"); 297 var result = try n.nfd(allocator, "Hello World!");
289 defer result.deinit(); 298 defer result.deinit();
290 299
291 try std.testing.expectEqualStrings("Hello World!", result.slice); 300 try testing.expectEqualStrings("Hello World!", result.slice);
292} 301}
293 302
294test "nfd !ASCII / alloc" { 303test "nfd !ASCII / alloc" {
@@ -300,7 +309,7 @@ test "nfd !ASCII / alloc" {
300 var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 309 var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
301 defer result.deinit(); 310 defer result.deinit();
302 311
303 try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); 312 try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
304} 313}
305 314
306test "nfkd ASCII / no-alloc" { 315test "nfkd ASCII / no-alloc" {
@@ -312,7 +321,7 @@ test "nfkd ASCII / no-alloc" {
312 var result = try n.nfkd(allocator, "Hello World!"); 321 var result = try n.nfkd(allocator, "Hello World!");
313 defer result.deinit(); 322 defer result.deinit();
314 323
315 try std.testing.expectEqualStrings("Hello World!", result.slice); 324 try testing.expectEqualStrings("Hello World!", result.slice);
316} 325}
317 326
318test "nfkd !ASCII / alloc" { 327test "nfkd !ASCII / alloc" {
@@ -324,7 +333,7 @@ test "nfkd !ASCII / alloc" {
324 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 333 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
325 defer result.deinit(); 334 defer result.deinit();
326 335
327 try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 336 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
328} 337}
329 338
330// Composition utilities. 339// Composition utilities.
@@ -338,18 +347,19 @@ fn isNonHangulStarter(self: Self, cp: u21) bool {
338} 347}
339 348
340/// Normalizes `str` to NFC. 349/// Normalizes `str` to NFC.
341pub fn nfc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { 350pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
342 return self.nfxc(allocator, str, .nfc); 351 return self.nfxc(allocator, str, .nfc);
343} 352}
344 353
345/// Normalizes `str` to NFKC. 354/// Normalizes `str` to NFKC.
346pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { 355pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
347 return self.nfxc(allocator, str, .nfkc); 356 return self.nfxc(allocator, str, .nfkc);
348} 357}
349 358
350fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { 359fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
351 // Quick checks. 360 // Quick checks.
352 if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; 361 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
362 if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
353 363
354 // Decompose first. 364 // Decompose first.
355 var d_result = if (form == .nfc) 365 var d_result = if (form == .nfc)
@@ -449,7 +459,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
449 459
450 for (d_list.items) |cp| { 460 for (d_list.items) |cp| {
451 if (cp == tombstone) continue; // "Delete" 461 if (cp == tombstone) continue; // "Delete"
452 const len = try std.unicode.utf8Encode(cp, &buf); 462 const len = try unicode.utf8Encode(cp, &buf);
453 cstr_list.appendSliceAssumeCapacity(buf[0..len]); 463 cstr_list.appendSliceAssumeCapacity(buf[0..len]);
454 } 464 }
455 465
@@ -478,7 +488,7 @@ test "nfc" {
478 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 488 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
479 defer result.deinit(); 489 defer result.deinit();
480 490
481 try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); 491 try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
482} 492}
483 493
484test "nfkc" { 494test "nfkc" {
@@ -490,17 +500,17 @@ test "nfkc" {
490 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 500 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
491 defer result.deinit(); 501 defer result.deinit();
492 502
493 try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); 503 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
494} 504}
495 505
496/// Tests for equality of `a` and `b` after normalizing to NFD. 506/// Tests for equality of `a` and `b` after normalizing to NFD.
497pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { 507pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
498 var norm_result_a = try self.nfd(allocator, a); 508 var norm_result_a = try self.nfd(allocator, a);
499 defer norm_result_a.deinit(); 509 defer norm_result_a.deinit();
500 var norm_result_b = try self.nfd(allocator, b); 510 var norm_result_b = try self.nfd(allocator, b);
501 defer norm_result_b.deinit(); 511 defer norm_result_b.deinit();
502 512
503 return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice); 513 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
504} 514}
505 515
506test "eql" { 516test "eql" {
@@ -509,8 +519,8 @@ test "eql" {
509 defer data.deinit(); 519 defer data.deinit();
510 var n = Self{ .norm_data = &data }; 520 var n = Self{ .norm_data = &data };
511 521
512 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 522 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
513 try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); 523 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
514} 524}
515 525
516// FCD 526// FCD
@@ -545,17 +555,17 @@ test "isFcd" {
545 var n = Self{ .norm_data = &data }; 555 var n = Self{ .norm_data = &data };
546 556
547 const is_nfc = "José \u{3D3}"; 557 const is_nfc = "José \u{3D3}";
548 try std.testing.expect(n.isFcd(is_nfc)); 558 try testing.expect(n.isFcd(is_nfc));
549 559
550 const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; 560 const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
551 try std.testing.expect(n.isFcd(is_nfd)); 561 try testing.expect(n.isFcd(is_nfd));
552 562
553 const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; 563 const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
554 try std.testing.expect(!n.isFcd(not_fcd)); 564 try testing.expect(!n.isFcd(not_fcd));
555} 565}
556 566
557test "Unicode normalization tests" { 567test "Unicode normalization tests" {
558 var arena = std.heap.ArenaAllocator.init(std.testing.allocator); 568 var arena = heap.ArenaAllocator.init(testing.allocator);
559 defer arena.deinit(); 569 defer arena.deinit();
560 var allocator = arena.allocator(); 570 var allocator = arena.allocator();
561 571
@@ -563,9 +573,9 @@ test "Unicode normalization tests" {
563 defer data.deinit(); 573 defer data.deinit();
564 var n = Self{ .norm_data = &data }; 574 var n = Self{ .norm_data = &data };
565 575
566 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 576 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
567 defer file.close(); 577 defer file.close();
568 var buf_reader = std.io.bufferedReader(file.reader()); 578 var buf_reader = io.bufferedReader(file.reader());
569 const input_stream = buf_reader.reader(); 579 const input_stream = buf_reader.reader();
570 580
571 var line_no: usize = 0; 581 var line_no: usize = 0;
@@ -577,7 +587,7 @@ test "Unicode normalization tests" {
577 // Skip comments or empty lines. 587 // Skip comments or empty lines.
578 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; 588 if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
579 // Iterate over fields. 589 // Iterate over fields.
580 var fields = std.mem.split(u8, line, ";"); 590 var fields = mem.split(u8, line, ";");
581 var field_index: usize = 0; 591 var field_index: usize = 0;
582 var input: []u8 = undefined; 592 var input: []u8 = undefined;
583 defer allocator.free(input); 593 defer allocator.free(input);
@@ -587,24 +597,24 @@ test "Unicode normalization tests" {
587 var i_buf = std.ArrayList(u8).init(allocator); 597 var i_buf = std.ArrayList(u8).init(allocator);
588 defer i_buf.deinit(); 598 defer i_buf.deinit();
589 599
590 var i_fields = std.mem.split(u8, field, " "); 600 var i_fields = mem.split(u8, field, " ");
591 while (i_fields.next()) |s| { 601 while (i_fields.next()) |s| {
592 const icp = try std.fmt.parseInt(u21, s, 16); 602 const icp = try fmt.parseInt(u21, s, 16);
593 const len = try std.unicode.utf8Encode(icp, &cp_buf); 603 const len = try unicode.utf8Encode(icp, &cp_buf);
594 try i_buf.appendSlice(cp_buf[0..len]); 604 try i_buf.appendSlice(cp_buf[0..len]);
595 } 605 }
596 606
597 input = try i_buf.toOwnedSlice(); 607 input = try i_buf.toOwnedSlice();
598 } else if (field_index == 1) { 608 } else if (field_index == 1) {
599 //std.debug.print("\n*** {s} ***\n", .{line}); 609 //debug.print("\n*** {s} ***\n", .{line});
600 // NFC, time to test. 610 // NFC, time to test.
601 var w_buf = std.ArrayList(u8).init(allocator); 611 var w_buf = std.ArrayList(u8).init(allocator);
602 defer w_buf.deinit(); 612 defer w_buf.deinit();
603 613
604 var w_fields = std.mem.split(u8, field, " "); 614 var w_fields = mem.split(u8, field, " ");
605 while (w_fields.next()) |s| { 615 while (w_fields.next()) |s| {
606 const wcp = try std.fmt.parseInt(u21, s, 16); 616 const wcp = try fmt.parseInt(u21, s, 16);
607 const len = try std.unicode.utf8Encode(wcp, &cp_buf); 617 const len = try unicode.utf8Encode(wcp, &cp_buf);
608 try w_buf.appendSlice(cp_buf[0..len]); 618 try w_buf.appendSlice(cp_buf[0..len]);
609 } 619 }
610 620
@@ -612,16 +622,16 @@ test "Unicode normalization tests" {
612 var got = try n.nfc(allocator, input); 622 var got = try n.nfc(allocator, input);
613 defer got.deinit(); 623 defer got.deinit();
614 624
615 try std.testing.expectEqualStrings(want, got.slice); 625 try testing.expectEqualStrings(want, got.slice);
616 } else if (field_index == 2) { 626 } else if (field_index == 2) {
617 // NFD, time to test. 627 // NFD, time to test.
618 var w_buf = std.ArrayList(u8).init(allocator); 628 var w_buf = std.ArrayList(u8).init(allocator);
619 defer w_buf.deinit(); 629 defer w_buf.deinit();
620 630
621 var w_fields = std.mem.split(u8, field, " "); 631 var w_fields = mem.split(u8, field, " ");
622 while (w_fields.next()) |s| { 632 while (w_fields.next()) |s| {
623 const wcp = try std.fmt.parseInt(u21, s, 16); 633 const wcp = try fmt.parseInt(u21, s, 16);
624 const len = try std.unicode.utf8Encode(wcp, &cp_buf); 634 const len = try unicode.utf8Encode(wcp, &cp_buf);
625 try w_buf.appendSlice(cp_buf[0..len]); 635 try w_buf.appendSlice(cp_buf[0..len]);
626 } 636 }
627 637
@@ -629,16 +639,16 @@ test "Unicode normalization tests" {
629 var got = try n.nfd(allocator, input); 639 var got = try n.nfd(allocator, input);
630 defer got.deinit(); 640 defer got.deinit();
631 641
632 try std.testing.expectEqualStrings(want, got.slice); 642 try testing.expectEqualStrings(want, got.slice);
633 } else if (field_index == 3) { 643 } else if (field_index == 3) {
634 // NFKC, time to test. 644 // NFKC, time to test.
635 var w_buf = std.ArrayList(u8).init(allocator); 645 var w_buf = std.ArrayList(u8).init(allocator);
636 defer w_buf.deinit(); 646 defer w_buf.deinit();
637 647
638 var w_fields = std.mem.split(u8, field, " "); 648 var w_fields = mem.split(u8, field, " ");
639 while (w_fields.next()) |s| { 649 while (w_fields.next()) |s| {
640 const wcp = try std.fmt.parseInt(u21, s, 16); 650 const wcp = try fmt.parseInt(u21, s, 16);
641 const len = try std.unicode.utf8Encode(wcp, &cp_buf); 651 const len = try unicode.utf8Encode(wcp, &cp_buf);
642 try w_buf.appendSlice(cp_buf[0..len]); 652 try w_buf.appendSlice(cp_buf[0..len]);
643 } 653 }
644 654
@@ -646,16 +656,16 @@ test "Unicode normalization tests" {
646 var got = try n.nfkc(allocator, input); 656 var got = try n.nfkc(allocator, input);
647 defer got.deinit(); 657 defer got.deinit();
648 658
649 try std.testing.expectEqualStrings(want, got.slice); 659 try testing.expectEqualStrings(want, got.slice);
650 } else if (field_index == 4) { 660 } else if (field_index == 4) {
651 // NFKD, time to test. 661 // NFKD, time to test.
652 var w_buf = std.ArrayList(u8).init(allocator); 662 var w_buf = std.ArrayList(u8).init(allocator);
653 defer w_buf.deinit(); 663 defer w_buf.deinit();
654 664
655 var w_fields = std.mem.split(u8, field, " "); 665 var w_fields = mem.split(u8, field, " ");
656 while (w_fields.next()) |s| { 666 while (w_fields.next()) |s| {
657 const wcp = try std.fmt.parseInt(u21, s, 16); 667 const wcp = try fmt.parseInt(u21, s, 16);
658 const len = try std.unicode.utf8Encode(wcp, &cp_buf); 668 const len = try unicode.utf8Encode(wcp, &cp_buf);
659 try w_buf.appendSlice(cp_buf[0..len]); 669 try w_buf.appendSlice(cp_buf[0..len]);
660 } 670 }
661 671
@@ -663,10 +673,51 @@ test "Unicode normalization tests" {
663 var got = try n.nfkd(allocator, input); 673 var got = try n.nfkd(allocator, input);
664 defer got.deinit(); 674 defer got.deinit();
665 675
666 try std.testing.expectEqualStrings(want, got.slice); 676 try testing.expectEqualStrings(want, got.slice);
667 } else { 677 } else {
668 continue; 678 continue;
669 } 679 }
670 } 680 }
671 } 681 }
672} 682}
683
/// Returns true if every code point in `str` lies in the Latin-1 range
/// U+0000..U+00FF (Basic Latin plus Latin-1 Supplement). An empty `str`
/// yields true.
///
/// When `simd.suggestVectorLength(u21)` reports a vector length, code points
/// are checked that many at a time; any remainder shorter than one vector is
/// checked one by one. Otherwise the whole string is checked one by one.
pub fn isLatin1Only(str: []const u8) bool {
    // Highest Latin-1 code point. U+0100 is already Latin Extended-A, so the
    // comparison must be against 0xFF, not 256.
    const latin1_max: u21 = 0xFF;

    var cp_iter = CodePointIterator{ .bytes = str };

    // No usable vector length on this target: plain scalar scan.
    const vec_len = simd.suggestVectorLength(u21) orelse return blk: {
        break :blk while (cp_iter.next()) |cp| {
            if (cp.code > latin1_max) break false;
        } else true;
    };

    const Vec = @Vector(vec_len, u21);
    // Loop-invariant upper bound, built once.
    const limit: Vec = @splat(latin1_max);

    outer: while (true) {
        var chunk: Vec = undefined;
        // Remember where this chunk starts so a partial chunk can be replayed
        // by the scalar loop below.
        const chunk_start = cp_iter.i;

        for (0..vec_len) |lane| {
            if (cp_iter.next()) |cp| {
                chunk[lane] = cp.code;
            } else {
                // Fewer than `vec_len` code points remain: rewind and let the
                // scalar tail handle them (unfilled lanes are undefined).
                cp_iter.i = chunk_start;
                break :outer;
            }
        }

        if (@reduce(.Or, chunk > limit)) return false;
    }

    // Scalar tail for the code points that did not fill a whole vector.
    return while (cp_iter.next()) |cp| {
        if (cp.code > latin1_max) break false;
    } else true;
}
717
test "isLatin1Only" {
    // ASCII together with U+00FE and U+00FF must be accepted.
    try testing.expect(isLatin1Only("Hello, World! \u{fe} \u{ff}"));

    // U+03D3 is far outside the accepted range, so the check must fail.
    try testing.expect(!isLatin1Only("Héllo, World! \u{3d3}"));
}