diff options
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 113 |
1 files changed, 3 insertions, 110 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 0670cae..d1d7cee 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -1,12 +1,11 @@ | |||
| 1 | //! Normalizer contains functions and methods that implement Unicode Normalization algorithms. You can normalize strings | 1 | //! Normalizer contains functions and methods that implement |
| 2 | //! into NFC, NFKC, NFD, and NFKD normalization forms (see `nfc`, `nfkc`, `nfd`, and `nfkd`). You can also test for | 2 | //! Unicode Normalization. You can normalize strings into NFC, |
| 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const testing = std.testing; | 6 | const testing = std.testing; |
| 7 | 7 | ||
| 8 | const CodePointIterator = @import("code_point").Iterator; | 8 | const CodePointIterator = @import("code_point").Iterator; |
| 9 | const case_fold_map = @import("ziglyph").case_folding; | ||
| 10 | const norm_props = @import("ziglyph").normalization_props; | 9 | const norm_props = @import("ziglyph").normalization_props; |
| 11 | 10 | ||
| 12 | pub const NormData = @import("NormData"); | 11 | pub const NormData = @import("NormData"); |
| @@ -499,44 +498,6 @@ test "nfkc" { | |||
| 499 | try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | 498 | try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); |
| 500 | } | 499 | } |
| 501 | 500 | ||
/// Tests `a` and `b` for equality as per Unicode rules for Identifiers.
///
/// Each code point of both inputs is passed through
/// `norm_props.toNfkcCaseFold`; when a mapping is returned its code points
/// are used, otherwise the code point stands for itself. The two resulting
/// code point sequences are then compared element by element.
///
/// `allocator` is used for two temporary lists that are released before
/// returning. Returns an error if allocation fails.
pub fn eqlIdentifiers(allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool {
    // The byte length is only a starting estimate: a single code point can map
    // to more code points than it has UTF-8 bytes, so appends below must be
    // allowed to grow the list.
    var list_a = try std.ArrayList(u21).initCapacity(allocator, a.len);
    defer list_a.deinit();
    var list_b = try std.ArrayList(u21).initCapacity(allocator, b.len);
    defer list_b.deinit();

    const Item = struct {
        str: []const u8,
        list: *std.ArrayList(u21),
    };

    const items = [_]Item{
        .{ .str = a, .list = &list_a },
        .{ .str = b, .list = &list_b },
    };

    for (items) |item| {
        var cp_iter = CodePointIterator{ .bytes = item.str };
        while (cp_iter.next()) |cp| {
            if (norm_props.toNfkcCaseFold(cp.code)) |nfkcf| {
                for (nfkcf) |c| {
                    // A zero entry ends the mapping (presumably padding of a
                    // fixed-size array; verify against `toNfkcCaseFold`).
                    if (c == 0) break;
                    try item.list.append(c);
                }
            } else {
                try item.list.append(cp.code); // maps to itself
            }
        }
    }

    return std.mem.eql(u21, list_a.items, list_b.items);
}
| 535 | |||
// "Foé" and "foé" differ only in the case of the first letter, so they must
// compare equal as identifiers.
test "eqlIdentifiers" {
    const same = try eqlIdentifiers(testing.allocator, "Foé", "foé");
    try testing.expect(same);
}
| 539 | |||
| 540 | /// Tests for equality of `a` and `b` after normalizing to NFD. | 501 | /// Tests for equality of `a` and `b` after normalizing to NFD. |
| 541 | pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | 502 | pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { |
| 542 | var norm_result_a = try self.nfd(allocator, a); | 503 | var norm_result_a = try self.nfd(allocator, a); |
| @@ -557,74 +518,6 @@ test "eql" { | |||
| 557 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 518 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| 558 | } | 519 | } |
| 559 | 520 | ||
/// Reports whether `cp` is one of the code points for which this module
/// normalizes to NFD before case folding: U+0345 plus a fixed selection of
/// code points in the range U+1F80..U+1FFC.
// NOTE(review): this looks like the set the Unicode Standard singles out for
// caseless matching (U+0345 and characters whose canonical decomposition
// contains it); confirm against the targeted Unicode version.
fn requiresNfdBeforeCaseFold(cp: u21) bool {
    return switch (cp) {
        0x0345,
        0x1F80...0x1FAF,
        0x1FB2...0x1FB4,
        0x1FB7,
        0x1FBC,
        0x1FC2...0x1FC4,
        0x1FC7,
        0x1FCC,
        0x1FF2...0x1FF4,
        0x1FF7,
        0x1FFC,
        => true,
        else => false,
    };
}
| 576 | |||
/// Returns true as soon as any code point of `str` satisfies
/// `requiresNfdBeforeCaseFold`; returns false when none does (including for
/// an empty string).
fn requiresPreNfd(str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };

    while (iter.next()) |cp| {
        if (requiresNfdBeforeCaseFold(cp.code)) return true;
    }

    return false;
}
| 584 | |||
/// `eqlCaseless` tests for equality of `a` and `b` after normalizing to NFD and ignoring letter case.
///
/// Each input is transformed as NFD(CaseFold(NFD(str))) when it contains a
/// code point flagged by `requiresPreNfd` and is not already FCD, and as
/// NFD(CaseFold(str)) otherwise. The transformed byte sequences are then
/// compared.
///
/// All intermediate buffers are allocated with `allocator` and released
/// before returning. Returns an error if any normalization or case folding
/// step fails.
pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool {
    // Every intermediate result lives in its own variable with exactly one
    // `defer`, so no result is released twice when a later step fails.
    // The case-folded buffers stay alive until the comparison is done because
    // the final NFD results may be views into them rather than copies
    // (NOTE(review): `Result` is constructed as a plain view below; confirm
    // whether `nfd` can also return one).

    // Left operand.
    var pre_a: Result = Result{ .slice = a };
    if (requiresPreNfd(a) and !self.isFcd(a)) {
        pre_a = try self.nfd(allocator, a);
    }
    defer pre_a.deinit();

    const cf_a = try case_fold_map.caseFoldStr(allocator, pre_a.slice);
    defer allocator.free(cf_a);

    var nfd_a = try self.nfd(allocator, cf_a);
    defer nfd_a.deinit();

    // Right operand.
    var pre_b: Result = Result{ .slice = b };
    if (requiresPreNfd(b) and !self.isFcd(b)) {
        pre_b = try self.nfd(allocator, b);
    }
    defer pre_b.deinit();

    const cf_b = try case_fold_map.caseFoldStr(allocator, pre_b.slice);
    defer allocator.free(cf_b);

    var nfd_b = try self.nfd(allocator, cf_b);
    defer nfd_b.deinit();

    return std.mem.eql(u8, nfd_a.slice, nfd_b.slice);
}
| 617 | |||
// Checks caseless equality between precomposed and decomposed spellings that
// also differ in letter case.
test "eqlCaseless" {
    const gpa = testing.allocator;

    var norm_data = try NormData.init(gpa);
    defer norm_data.deinit();

    const normalizer = Self{ .norm_data = &norm_data };

    const upsilon_match = try normalizer.eqlCaseless(gpa, "Foϓ", "fo\u{03D2}\u{0301}");
    try testing.expect(upsilon_match);

    // foÉ == foé
    const e_acute_match = try normalizer.eqlCaseless(gpa, "FOÉ", "foe\u{0301}");
    try testing.expect(e_acute_match);
}
| 627 | |||
| 628 | // FCD | 521 | // FCD |
| 629 | fn getLeadCcc(self: Self, cp: u21) u8 { | 522 | fn getLeadCcc(self: Self, cp: u21) u8 { |
| 630 | const dc = self.mapping(cp, .nfd); | 523 | const dc = self.mapping(cp, .nfd); |