summaryrefslogtreecommitdiff
path: root/src/Normalizer.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r--src/Normalizer.zig97
1 files changed, 56 insertions, 41 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 1b4a2d5..6a19f47 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -8,16 +8,18 @@ const CodePointIterator = @import("code_point").Iterator;
8const case_fold_map = @import("ziglyph").case_folding; 8const case_fold_map = @import("ziglyph").case_folding;
9const hangul_map = @import("ziglyph").hangul; 9const hangul_map = @import("ziglyph").hangul;
10const norm_props = @import("ziglyph").normalization_props; 10const norm_props = @import("ziglyph").normalization_props;
11const normp = @import("normp"); 11pub const Data = @import("CombiningClassData");
12
13const Self = @This();
14 12
13ccc_data: *Data,
15nfc_map: std.AutoHashMap([2]u21, u21), 14nfc_map: std.AutoHashMap([2]u21, u21),
16nfd_map: std.AutoHashMap(u21, [2]u21), 15nfd_map: std.AutoHashMap(u21, [2]u21),
17nfkd_map: std.AutoHashMap(u21, [18]u21), 16nfkd_map: std.AutoHashMap(u21, [18]u21),
18 17
19pub fn init(allocator: std.mem.Allocator) !Self { 18const Self = @This();
19
20pub fn init(allocator: std.mem.Allocator, data: *Data) !Self {
20 var self = Self{ 21 var self = Self{
22 .ccc_data = data,
21 .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), 23 .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator),
22 .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), 24 .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator),
23 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), 25 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
@@ -95,7 +97,9 @@ pub fn deinit(self: *Self) void {
95} 97}
96 98
97test "init / deinit" { 99test "init / deinit" {
98 var n = try init(std.testing.allocator); 100 var data = try Data.init(std.testing.allocator);
101 defer data.deinit();
102 var n = try init(std.testing.allocator, &data);
99 defer n.deinit(); 103 defer n.deinit();
100} 104}
101 105
@@ -241,7 +245,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
241 245
242test "decompose" { 246test "decompose" {
243 const allocator = std.testing.allocator; 247 const allocator = std.testing.allocator;
244 var n = try init(allocator); 248 var data = try Data.init(allocator);
249 defer data.deinit();
250 var n = try init(allocator, &data);
245 defer n.deinit(); 251 defer n.deinit();
246 252
247 var dc = n.decompose('é', .nfd); 253 var dc = n.decompose('é', .nfd);
@@ -307,19 +313,17 @@ pub const Result = struct {
307}; 313};
308 314
309// Compares code points by Canonical Combining Class order. 315// Compares code points by Canonical Combining Class order.
310fn cccLess(_: void, lhs: u21, rhs: u21) bool { 316fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
311 const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; 317 return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
312 const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)];
313 return lcc < rcc;
314} 318}
315 319
316// Applies the Canonical Sorting Algorithm. 320// Applies the Canonical Sorting Algorithm.
317fn canonicalSort(cps: []u21) void { 321fn canonicalSort(self: Self, cps: []u21) void {
318 var i: usize = 0; 322 var i: usize = 0;
319 while (i < cps.len) : (i += 1) { 323 while (i < cps.len) : (i += 1) {
320 const start: usize = i; 324 const start: usize = i;
321 while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} 325 while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
322 std.mem.sort(u21, cps[start..i], {}, cccLess); 326 std.mem.sort(u21, cps[start..i], self, cccLess);
323 } 327 }
324} 328}
325 329
@@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
349 try dcp_list.appendSlice(slice); 353 try dcp_list.appendSlice(slice);
350 } 354 }
351 355
352 canonicalSort(dcp_list.items); 356 self.canonicalSort(dcp_list.items);
353 357
354 var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); 358 var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4);
355 defer dstr_list.deinit(); 359 defer dstr_list.deinit();
@@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
365 369
366test "nfd ASCII / no-alloc" { 370test "nfd ASCII / no-alloc" {
367 const allocator = std.testing.allocator; 371 const allocator = std.testing.allocator;
368 var n = try init(allocator); 372 var data = try Data.init(allocator);
373 defer data.deinit();
374 var n = try init(allocator, &data);
369 defer n.deinit(); 375 defer n.deinit();
370 376
371 var result = try n.nfd(allocator, "Hello World!"); 377 var result = try n.nfd(allocator, "Hello World!");
@@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" {
376 382
377test "nfd !ASCII / alloc" { 383test "nfd !ASCII / alloc" {
378 const allocator = std.testing.allocator; 384 const allocator = std.testing.allocator;
379 var n = try init(allocator); 385 var data = try Data.init(allocator);
386 defer data.deinit();
387 var n = try init(allocator, &data);
380 defer n.deinit(); 388 defer n.deinit();
381 389
382 var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 390 var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" {
387 395
388test "nfkd ASCII / no-alloc" { 396test "nfkd ASCII / no-alloc" {
389 const allocator = std.testing.allocator; 397 const allocator = std.testing.allocator;
390 var n = try init(allocator); 398 var data = try Data.init(allocator);
399 defer data.deinit();
400 var n = try init(allocator, &data);
391 defer n.deinit(); 401 defer n.deinit();
392 402
393 var result = try n.nfkd(allocator, "Hello World!"); 403 var result = try n.nfkd(allocator, "Hello World!");
@@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" {
398 408
399test "nfkd !ASCII / alloc" { 409test "nfkd !ASCII / alloc" {
400 const allocator = std.testing.allocator; 410 const allocator = std.testing.allocator;
401 var n = try init(allocator); 411 var data = try Data.init(allocator);
412 defer data.deinit();
413 var n = try init(allocator, &data);
402 defer n.deinit(); 414 defer n.deinit();
403 415
404 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 416 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool {
413 return cp >= 0x1100 and hangul_map.syllableType(cp) != null; 425 return cp >= 0x1100 and hangul_map.syllableType(cp) != null;
414} 426}
415 427
416fn isStarter(cp: u21) bool { 428fn isNonHangulStarter(self: Self, cp: u21) bool {
417 return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; 429 return !isHangul(cp) and self.ccc_data.isStarter(cp);
418}
419
420fn isCombining(cp: u21) bool {
421 return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0;
422}
423
424fn isNonHangulStarter(cp: u21) bool {
425 return !isHangul(cp) and isStarter(cp);
426} 430}
427 431
428/// Normalizes `str` to NFC. 432/// Normalizes `str` to NFC.
@@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
464 468
465 block_check: while (i < d_list.items.len) : (i += 1) { 469 block_check: while (i < d_list.items.len) : (i += 1) {
466 const C = d_list.items[i]; 470 const C = d_list.items[i];
467 const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; 471 const cc_C = self.ccc_data.ccc(C);
468 var starter_index: ?usize = null; 472 var starter_index: ?usize = null;
469 var j: usize = i; 473 var j: usize = i;
470 474
@@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
472 j -= 1; 476 j -= 1;
473 477
474 // Check for starter. 478 // Check for starter.
475 if (isStarter(d_list.items[j])) { 479 if (self.ccc_data.isStarter(d_list.items[j])) {
476 if (i - j > 1) { // If there's distance between the starting point and the current position. 480 if (i - j > 1) { // If there's distance between the starting point and the current position.
477 for (d_list.items[(j + 1)..i]) |B| { 481 for (d_list.items[(j + 1)..i]) |B| {
482 const cc_B = self.ccc_data.ccc(B);
478 // Check for blocking conditions. 483 // Check for blocking conditions.
479 if (isHangul(C)) { 484 if (isHangul(C)) {
480 if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; 485 if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check;
481 } 486 }
482 const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)];
483 if (cc_B >= cc_C) continue :block_check; 487 if (cc_B >= cc_C) continue :block_check;
484 } 488 }
485 } 489 }
@@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
560 564
561test "nfc" { 565test "nfc" {
562 const allocator = std.testing.allocator; 566 const allocator = std.testing.allocator;
563 var n = try init(allocator); 567 var data = try Data.init(allocator);
568 defer data.deinit();
569 var n = try init(allocator, &data);
564 defer n.deinit(); 570 defer n.deinit();
565 571
566 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 572 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -571,7 +577,9 @@ test "nfc" {
571 577
572test "nfkc" { 578test "nfkc" {
573 const allocator = std.testing.allocator; 579 const allocator = std.testing.allocator;
574 var n = try init(allocator); 580 var data = try Data.init(allocator);
581 defer data.deinit();
582 var n = try init(allocator, &data);
575 defer n.deinit(); 583 defer n.deinit();
576 584
577 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 585 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u
630 638
631test "eql" { 639test "eql" {
632 const allocator = std.testing.allocator; 640 const allocator = std.testing.allocator;
633 var n = try init(allocator); 641 var data = try Data.init(allocator);
642 defer data.deinit();
643 var n = try init(allocator, &data);
634 defer n.deinit(); 644 defer n.deinit();
635 645
636 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 646 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [
697 707
698test "eqlCaseless" { 708test "eqlCaseless" {
699 const allocator = std.testing.allocator; 709 const allocator = std.testing.allocator;
700 var n = try init(allocator); 710 var data = try Data.init(allocator);
711 defer data.deinit();
712 var n = try init(allocator, &data);
701 defer n.deinit(); 713 defer n.deinit();
702 714
703 try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); 715 try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}"));
@@ -707,7 +719,7 @@ test "eqlCaseless" {
707// FCD 719// FCD
708fn getLeadCcc(self: Self, cp: u21) u8 { 720fn getLeadCcc(self: Self, cp: u21) u8 {
709 const dc = self.mapping(cp, .nfd); 721 const dc = self.mapping(cp, .nfd);
710 return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; 722 return self.ccc_data.ccc(dc.cps[0]);
711} 723}
712 724
713fn getTrailCcc(self: Self, cp: u21) u8 { 725fn getTrailCcc(self: Self, cp: u21) u8 {
@@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 {
715 const len = for (dc.cps, 0..) |dcp, i| { 727 const len = for (dc.cps, 0..) |dcp, i| {
716 if (dcp == 0) break i; 728 if (dcp == 0) break i;
717 } else dc.cps.len; 729 } else dc.cps.len;
718 const tcp = dc.cps[len -| 1]; 730 return self.ccc_data.ccc(dc.cps[len - 1]);
719 return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)];
720} 731}
721 732
722/// Fast check to detect if a string is already in NFC or NFD form. 733/// Fast check to detect if a string is already in NFC or NFD form.
@@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool {
733 744
734test "isFcd" { 745test "isFcd" {
735 const allocator = std.testing.allocator; 746 const allocator = std.testing.allocator;
736 var n = try init(allocator); 747 var data = try Data.init(allocator);
748 defer data.deinit();
749 var n = try init(allocator, &data);
737 defer n.deinit(); 750 defer n.deinit();
738 751
739 const is_nfc = "José \u{3D3}"; 752 const is_nfc = "José \u{3D3}";
@@ -751,7 +764,9 @@ test "Unicode normalization tests" {
751 defer arena.deinit(); 764 defer arena.deinit();
752 var allocator = arena.allocator(); 765 var allocator = arena.allocator();
753 766
754 var n = try init(allocator); 767 var data = try Data.init(allocator);
768 defer data.deinit();
769 var n = try init(allocator, &data);
755 defer n.deinit(); 770 defer n.deinit();
756 771
757 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 772 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});