diff options
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 97 |
1 files changed, 56 insertions, 41 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 1b4a2d5..6a19f47 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -8,16 +8,18 @@ const CodePointIterator = @import("code_point").Iterator; | |||
| 8 | const case_fold_map = @import("ziglyph").case_folding; | 8 | const case_fold_map = @import("ziglyph").case_folding; |
| 9 | const hangul_map = @import("ziglyph").hangul; | 9 | const hangul_map = @import("ziglyph").hangul; |
| 10 | const norm_props = @import("ziglyph").normalization_props; | 10 | const norm_props = @import("ziglyph").normalization_props; |
| 11 | const normp = @import("normp"); | 11 | pub const Data = @import("CombiningClassData"); |
| 12 | |||
| 13 | const Self = @This(); | ||
| 14 | 12 | ||
| 13 | ccc_data: *Data, | ||
| 15 | nfc_map: std.AutoHashMap([2]u21, u21), | 14 | nfc_map: std.AutoHashMap([2]u21, u21), |
| 16 | nfd_map: std.AutoHashMap(u21, [2]u21), | 15 | nfd_map: std.AutoHashMap(u21, [2]u21), |
| 17 | nfkd_map: std.AutoHashMap(u21, [18]u21), | 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), |
| 18 | 17 | ||
| 19 | pub fn init(allocator: std.mem.Allocator) !Self { | 18 | const Self = @This(); |
| 19 | |||
| 20 | pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | ||
| 20 | var self = Self{ | 21 | var self = Self{ |
| 22 | .ccc_data = data, | ||
| 21 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), |
| 22 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), | 24 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), |
| 23 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | 25 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), |
| @@ -95,7 +97,9 @@ pub fn deinit(self: *Self) void { | |||
| 95 | } | 97 | } |
| 96 | 98 | ||
| 97 | test "init / deinit" { | 99 | test "init / deinit" { |
| 98 | var n = try init(std.testing.allocator); | 100 | var data = try Data.init(std.testing.allocator); |
| 101 | defer data.deinit(); | ||
| 102 | var n = try init(std.testing.allocator, &data); | ||
| 99 | defer n.deinit(); | 103 | defer n.deinit(); |
| 100 | } | 104 | } |
| 101 | 105 | ||
| @@ -241,7 +245,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 241 | 245 | ||
| 242 | test "decompose" { | 246 | test "decompose" { |
| 243 | const allocator = std.testing.allocator; | 247 | const allocator = std.testing.allocator; |
| 244 | var n = try init(allocator); | 248 | var data = try Data.init(allocator); |
| 249 | defer data.deinit(); | ||
| 250 | var n = try init(allocator, &data); | ||
| 245 | defer n.deinit(); | 251 | defer n.deinit(); |
| 246 | 252 | ||
| 247 | var dc = n.decompose('é', .nfd); | 253 | var dc = n.decompose('é', .nfd); |
| @@ -307,19 +313,17 @@ pub const Result = struct { | |||
| 307 | }; | 313 | }; |
| 308 | 314 | ||
| 309 | // Compares code points by Canonical Combining Class order. | 315 | // Compares code points by Canonical Combining Class order. |
| 310 | fn cccLess(_: void, lhs: u21, rhs: u21) bool { | 316 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { |
| 311 | const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; | 317 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); |
| 312 | const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)]; | ||
| 313 | return lcc < rcc; | ||
| 314 | } | 318 | } |
| 315 | 319 | ||
| 316 | // Applies the Canonical Sorting Algorithm. | 320 | // Applies the Canonical Sorting Algorithm. |
| 317 | fn canonicalSort(cps: []u21) void { | 321 | fn canonicalSort(self: Self, cps: []u21) void { |
| 318 | var i: usize = 0; | 322 | var i: usize = 0; |
| 319 | while (i < cps.len) : (i += 1) { | 323 | while (i < cps.len) : (i += 1) { |
| 320 | const start: usize = i; | 324 | const start: usize = i; |
| 321 | while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} | 325 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 322 | std.mem.sort(u21, cps[start..i], {}, cccLess); | 326 | std.mem.sort(u21, cps[start..i], self, cccLess); |
| 323 | } | 327 | } |
| 324 | } | 328 | } |
| 325 | 329 | ||
| @@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 349 | try dcp_list.appendSlice(slice); | 353 | try dcp_list.appendSlice(slice); |
| 350 | } | 354 | } |
| 351 | 355 | ||
| 352 | canonicalSort(dcp_list.items); | 356 | self.canonicalSort(dcp_list.items); |
| 353 | 357 | ||
| 354 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); | 358 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); |
| 355 | defer dstr_list.deinit(); | 359 | defer dstr_list.deinit(); |
| @@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 365 | 369 | ||
| 366 | test "nfd ASCII / no-alloc" { | 370 | test "nfd ASCII / no-alloc" { |
| 367 | const allocator = std.testing.allocator; | 371 | const allocator = std.testing.allocator; |
| 368 | var n = try init(allocator); | 372 | var data = try Data.init(allocator); |
| 373 | defer data.deinit(); | ||
| 374 | var n = try init(allocator, &data); | ||
| 369 | defer n.deinit(); | 375 | defer n.deinit(); |
| 370 | 376 | ||
| 371 | var result = try n.nfd(allocator, "Hello World!"); | 377 | var result = try n.nfd(allocator, "Hello World!"); |
| @@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" { | |||
| 376 | 382 | ||
| 377 | test "nfd !ASCII / alloc" { | 383 | test "nfd !ASCII / alloc" { |
| 378 | const allocator = std.testing.allocator; | 384 | const allocator = std.testing.allocator; |
| 379 | var n = try init(allocator); | 385 | var data = try Data.init(allocator); |
| 386 | defer data.deinit(); | ||
| 387 | var n = try init(allocator, &data); | ||
| 380 | defer n.deinit(); | 388 | defer n.deinit(); |
| 381 | 389 | ||
| 382 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 390 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| @@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" { | |||
| 387 | 395 | ||
| 388 | test "nfkd ASCII / no-alloc" { | 396 | test "nfkd ASCII / no-alloc" { |
| 389 | const allocator = std.testing.allocator; | 397 | const allocator = std.testing.allocator; |
| 390 | var n = try init(allocator); | 398 | var data = try Data.init(allocator); |
| 399 | defer data.deinit(); | ||
| 400 | var n = try init(allocator, &data); | ||
| 391 | defer n.deinit(); | 401 | defer n.deinit(); |
| 392 | 402 | ||
| 393 | var result = try n.nfkd(allocator, "Hello World!"); | 403 | var result = try n.nfkd(allocator, "Hello World!"); |
| @@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" { | |||
| 398 | 408 | ||
| 399 | test "nfkd !ASCII / alloc" { | 409 | test "nfkd !ASCII / alloc" { |
| 400 | const allocator = std.testing.allocator; | 410 | const allocator = std.testing.allocator; |
| 401 | var n = try init(allocator); | 411 | var data = try Data.init(allocator); |
| 412 | defer data.deinit(); | ||
| 413 | var n = try init(allocator, &data); | ||
| 402 | defer n.deinit(); | 414 | defer n.deinit(); |
| 403 | 415 | ||
| 404 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 416 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| @@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool { | |||
| 413 | return cp >= 0x1100 and hangul_map.syllableType(cp) != null; | 425 | return cp >= 0x1100 and hangul_map.syllableType(cp) != null; |
| 414 | } | 426 | } |
| 415 | 427 | ||
| 416 | fn isStarter(cp: u21) bool { | 428 | fn isNonHangulStarter(self: Self, cp: u21) bool { |
| 417 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; | 429 | return !isHangul(cp) and self.ccc_data.isStarter(cp); |
| 418 | } | ||
| 419 | |||
| 420 | fn isCombining(cp: u21) bool { | ||
| 421 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0; | ||
| 422 | } | ||
| 423 | |||
| 424 | fn isNonHangulStarter(cp: u21) bool { | ||
| 425 | return !isHangul(cp) and isStarter(cp); | ||
| 426 | } | 430 | } |
| 427 | 431 | ||
| 428 | /// Normalizes `str` to NFC. | 432 | /// Normalizes `str` to NFC. |
| @@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 464 | 468 | ||
| 465 | block_check: while (i < d_list.items.len) : (i += 1) { | 469 | block_check: while (i < d_list.items.len) : (i += 1) { |
| 466 | const C = d_list.items[i]; | 470 | const C = d_list.items[i]; |
| 467 | const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; | 471 | const cc_C = self.ccc_data.ccc(C); |
| 468 | var starter_index: ?usize = null; | 472 | var starter_index: ?usize = null; |
| 469 | var j: usize = i; | 473 | var j: usize = i; |
| 470 | 474 | ||
| @@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 472 | j -= 1; | 476 | j -= 1; |
| 473 | 477 | ||
| 474 | // Check for starter. | 478 | // Check for starter. |
| 475 | if (isStarter(d_list.items[j])) { | 479 | if (self.ccc_data.isStarter(d_list.items[j])) { |
| 476 | if (i - j > 1) { // If there's distance between the starting point and the current position. | 480 | if (i - j > 1) { // If there's distance between the starting point and the current position. |
| 477 | for (d_list.items[(j + 1)..i]) |B| { | 481 | for (d_list.items[(j + 1)..i]) |B| { |
| 482 | const cc_B = self.ccc_data.ccc(B); | ||
| 478 | // Check for blocking conditions. | 483 | // Check for blocking conditions. |
| 479 | if (isHangul(C)) { | 484 | if (isHangul(C)) { |
| 480 | if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; | 485 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; |
| 481 | } | 486 | } |
| 482 | const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)]; | ||
| 483 | if (cc_B >= cc_C) continue :block_check; | 487 | if (cc_B >= cc_C) continue :block_check; |
| 484 | } | 488 | } |
| 485 | } | 489 | } |
| @@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 560 | 564 | ||
| 561 | test "nfc" { | 565 | test "nfc" { |
| 562 | const allocator = std.testing.allocator; | 566 | const allocator = std.testing.allocator; |
| 563 | var n = try init(allocator); | 567 | var data = try Data.init(allocator); |
| 568 | defer data.deinit(); | ||
| 569 | var n = try init(allocator, &data); | ||
| 564 | defer n.deinit(); | 570 | defer n.deinit(); |
| 565 | 571 | ||
| 566 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 572 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| @@ -571,7 +577,9 @@ test "nfc" { | |||
| 571 | 577 | ||
| 572 | test "nfkc" { | 578 | test "nfkc" { |
| 573 | const allocator = std.testing.allocator; | 579 | const allocator = std.testing.allocator; |
| 574 | var n = try init(allocator); | 580 | var data = try Data.init(allocator); |
| 581 | defer data.deinit(); | ||
| 582 | var n = try init(allocator, &data); | ||
| 575 | defer n.deinit(); | 583 | defer n.deinit(); |
| 576 | 584 | ||
| 577 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 585 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| @@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u | |||
| 630 | 638 | ||
| 631 | test "eql" { | 639 | test "eql" { |
| 632 | const allocator = std.testing.allocator; | 640 | const allocator = std.testing.allocator; |
| 633 | var n = try init(allocator); | 641 | var data = try Data.init(allocator); |
| 642 | defer data.deinit(); | ||
| 643 | var n = try init(allocator, &data); | ||
| 634 | defer n.deinit(); | 644 | defer n.deinit(); |
| 635 | 645 | ||
| 636 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 646 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| @@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ | |||
| 697 | 707 | ||
| 698 | test "eqlCaseless" { | 708 | test "eqlCaseless" { |
| 699 | const allocator = std.testing.allocator; | 709 | const allocator = std.testing.allocator; |
| 700 | var n = try init(allocator); | 710 | var data = try Data.init(allocator); |
| 711 | defer data.deinit(); | ||
| 712 | var n = try init(allocator, &data); | ||
| 701 | defer n.deinit(); | 713 | defer n.deinit(); |
| 702 | 714 | ||
| 703 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | 715 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -707,7 +719,7 @@ test "eqlCaseless" { | |||
| 707 | // FCD | 719 | // FCD |
| 708 | fn getLeadCcc(self: Self, cp: u21) u8 { | 720 | fn getLeadCcc(self: Self, cp: u21) u8 { |
| 709 | const dc = self.mapping(cp, .nfd); | 721 | const dc = self.mapping(cp, .nfd); |
| 710 | return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; | 722 | return self.ccc_data.ccc(dc.cps[0]); |
| 711 | } | 723 | } |
| 712 | 724 | ||
| 713 | fn getTrailCcc(self: Self, cp: u21) u8 { | 725 | fn getTrailCcc(self: Self, cp: u21) u8 { |
| @@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 { | |||
| 715 | const len = for (dc.cps, 0..) |dcp, i| { | 727 | const len = for (dc.cps, 0..) |dcp, i| { |
| 716 | if (dcp == 0) break i; | 728 | if (dcp == 0) break i; |
| 717 | } else dc.cps.len; | 729 | } else dc.cps.len; |
| 718 | const tcp = dc.cps[len -| 1]; | 730 | return self.ccc_data.ccc(dc.cps[len - 1]); |
| 719 | return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)]; | ||
| 720 | } | 731 | } |
| 721 | 732 | ||
| 722 | /// Fast check to detect if a string is already in NFC or NFD form. | 733 | /// Fast check to detect if a string is already in NFC or NFD form. |
| @@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool { | |||
| 733 | 744 | ||
| 734 | test "isFcd" { | 745 | test "isFcd" { |
| 735 | const allocator = std.testing.allocator; | 746 | const allocator = std.testing.allocator; |
| 736 | var n = try init(allocator); | 747 | var data = try Data.init(allocator); |
| 748 | defer data.deinit(); | ||
| 749 | var n = try init(allocator, &data); | ||
| 737 | defer n.deinit(); | 750 | defer n.deinit(); |
| 738 | 751 | ||
| 739 | const is_nfc = "José \u{3D3}"; | 752 | const is_nfc = "José \u{3D3}"; |
| @@ -751,7 +764,9 @@ test "Unicode normalization tests" { | |||
| 751 | defer arena.deinit(); | 764 | defer arena.deinit(); |
| 752 | var allocator = arena.allocator(); | 765 | var allocator = arena.allocator(); |
| 753 | 766 | ||
| 754 | var n = try init(allocator); | 767 | var data = try Data.init(allocator); |
| 768 | defer data.deinit(); | ||
| 769 | var n = try init(allocator, &data); | ||
| 755 | defer n.deinit(); | 770 | defer n.deinit(); |
| 756 | 771 | ||
| 757 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 772 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |