diff options
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 155 |
1 files changed, 71 insertions, 84 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 6a19f47..848cf20 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -3,26 +3,26 @@ | |||
| 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). | 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const testing = std.testing; | ||
| 6 | 7 | ||
| 7 | const CodePointIterator = @import("code_point").Iterator; | 8 | const CodePointIterator = @import("code_point").Iterator; |
| 8 | const case_fold_map = @import("ziglyph").case_folding; | 9 | const case_fold_map = @import("ziglyph").case_folding; |
| 9 | const hangul_map = @import("ziglyph").hangul; | 10 | const hangul_map = @import("ziglyph").hangul; |
| 10 | const norm_props = @import("ziglyph").normalization_props; | 11 | const norm_props = @import("ziglyph").normalization_props; |
| 11 | pub const Data = @import("CombiningClassData"); | ||
| 12 | 12 | ||
| 13 | ccc_data: *Data, | 13 | pub const NormData = @import("NormData"); |
| 14 | |||
| 14 | nfc_map: std.AutoHashMap([2]u21, u21), | 15 | nfc_map: std.AutoHashMap([2]u21, u21), |
| 15 | nfd_map: std.AutoHashMap(u21, [2]u21), | ||
| 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), | 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), |
| 17 | norm_data: *NormData, | ||
| 17 | 18 | ||
| 18 | const Self = @This(); | 19 | const Self = @This(); |
| 19 | 20 | ||
| 20 | pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | 21 | pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { |
| 21 | var self = Self{ | 22 | var self = Self{ |
| 22 | .ccc_data = data, | ||
| 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), |
| 24 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), | ||
| 25 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | 24 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), |
| 25 | .norm_data = norm_data, | ||
| 26 | }; | 26 | }; |
| 27 | errdefer self.deinit(); | 27 | errdefer self.deinit(); |
| 28 | 28 | ||
| @@ -46,24 +46,6 @@ pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | |||
| 46 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); | 46 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); |
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | // Canonical decompositions | ||
| 50 | const decomp_file = @embedFile("autogen/canonical_decompositions.txt.deflate"); | ||
| 51 | var decomp_stream = std.io.fixedBufferStream(decomp_file); | ||
| 52 | var decomp_decomp = try decompressor(allocator, decomp_stream.reader(), null); | ||
| 53 | defer decomp_decomp.deinit(); | ||
| 54 | |||
| 55 | var decomp_buf = std.io.bufferedReader(decomp_decomp.reader()); | ||
| 56 | const decomp_reader = decomp_buf.reader(); | ||
| 57 | |||
| 58 | while (try decomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 59 | if (line.len == 0) continue; | ||
| 60 | var fields = std.mem.split(u8, line, ";"); | ||
| 61 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 62 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 63 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 64 | try self.nfd_map.put(cp_a, .{ cp_b, cp_c }); | ||
| 65 | } | ||
| 66 | |||
| 67 | // Compatibility decompositions | 49 | // Compatibility decompositions |
| 68 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); | 50 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); |
| 69 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); | 51 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); |
| @@ -92,14 +74,14 @@ pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | |||
| 92 | 74 | ||
| 93 | pub fn deinit(self: *Self) void { | 75 | pub fn deinit(self: *Self) void { |
| 94 | self.nfc_map.deinit(); | 76 | self.nfc_map.deinit(); |
| 95 | self.nfd_map.deinit(); | ||
| 96 | self.nfkd_map.deinit(); | 77 | self.nfkd_map.deinit(); |
| 97 | } | 78 | } |
| 98 | 79 | ||
| 99 | test "init / deinit" { | 80 | test "init / deinit" { |
| 100 | var data = try Data.init(std.testing.allocator); | 81 | const allocator = testing.allocator; |
| 101 | defer data.deinit(); | 82 | var norm_data = try NormData.init(allocator); |
| 102 | var n = try init(std.testing.allocator, &data); | 83 | defer norm_data.deinit(); |
| 84 | var n = try init(allocator, &norm_data); | ||
| 103 | defer n.deinit(); | 85 | defer n.deinit(); |
| 104 | } | 86 | } |
| 105 | 87 | ||
| @@ -169,17 +151,22 @@ const Decomp = struct { | |||
| 169 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { | 151 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { |
| 170 | std.debug.assert(form == .nfd or form == .nfkd); | 152 | std.debug.assert(form == .nfd or form == .nfkd); |
| 171 | 153 | ||
| 172 | var dc = Decomp{ .form = .same }; | 154 | var dc = Decomp{ .form = .nfd }; |
| 173 | dc.cps[0] = cp; | 155 | const canon_dc = self.norm_data.canon_data.toNfd(cp); |
| 156 | const len: usize = if (canon_dc[1] == 0) 1 else 2; | ||
| 157 | |||
| 158 | if (len == 1 and canon_dc[0] == cp) { | ||
| 159 | dc.form = .same; | ||
| 160 | dc.cps[0] = cp; | ||
| 161 | } else { | ||
| 162 | @memcpy(dc.cps[0..len], canon_dc[0..len]); | ||
| 163 | } | ||
| 174 | 164 | ||
| 175 | if (self.nfkd_map.get(cp)) |array| { | 165 | if (self.nfkd_map.get(cp)) |array| { |
| 176 | if (form != .nfd) { | 166 | if (form != .nfd) { |
| 177 | dc.form = .nfkd; | 167 | dc.form = .nfkd; |
| 178 | @memcpy(dc.cps[0..array.len], &array); | 168 | @memcpy(dc.cps[0..array.len], &array); |
| 179 | } | 169 | } |
| 180 | } else if (self.nfd_map.get(cp)) |array| { | ||
| 181 | dc.form = .nfd; | ||
| 182 | @memcpy(dc.cps[0..array.len], &array); | ||
| 183 | } | 170 | } |
| 184 | 171 | ||
| 185 | return dc; | 172 | return dc; |
| @@ -244,10 +231,10 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 244 | } | 231 | } |
| 245 | 232 | ||
| 246 | test "decompose" { | 233 | test "decompose" { |
| 247 | const allocator = std.testing.allocator; | 234 | const allocator = testing.allocator; |
| 248 | var data = try Data.init(allocator); | 235 | var norm_data = try NormData.init(allocator); |
| 249 | defer data.deinit(); | 236 | defer norm_data.deinit(); |
| 250 | var n = try init(allocator, &data); | 237 | var n = try init(allocator, &norm_data); |
| 251 | defer n.deinit(); | 238 | defer n.deinit(); |
| 252 | 239 | ||
| 253 | var dc = n.decompose('é', .nfd); | 240 | var dc = n.decompose('é', .nfd); |
| @@ -314,7 +301,7 @@ pub const Result = struct { | |||
| 314 | 301 | ||
| 315 | // Compares code points by Canonical Combining Class order. | 302 | // Compares code points by Canonical Combining Class order. |
| 316 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { | 303 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { |
| 317 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); | 304 | return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); |
| 318 | } | 305 | } |
| 319 | 306 | ||
| 320 | // Applies the Canonical Sorting Algorithm. | 307 | // Applies the Canonical Sorting Algorithm. |
| @@ -322,7 +309,7 @@ fn canonicalSort(self: Self, cps: []u21) void { | |||
| 322 | var i: usize = 0; | 309 | var i: usize = 0; |
| 323 | while (i < cps.len) : (i += 1) { | 310 | while (i < cps.len) : (i += 1) { |
| 324 | const start: usize = i; | 311 | const start: usize = i; |
| 325 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 312 | while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 326 | std.mem.sort(u21, cps[start..i], self, cccLess); | 313 | std.mem.sort(u21, cps[start..i], self, cccLess); |
| 327 | } | 314 | } |
| 328 | } | 315 | } |
| @@ -368,10 +355,10 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 368 | } | 355 | } |
| 369 | 356 | ||
| 370 | test "nfd ASCII / no-alloc" { | 357 | test "nfd ASCII / no-alloc" { |
| 371 | const allocator = std.testing.allocator; | 358 | const allocator = testing.allocator; |
| 372 | var data = try Data.init(allocator); | 359 | var norm_data = try NormData.init(allocator); |
| 373 | defer data.deinit(); | 360 | defer norm_data.deinit(); |
| 374 | var n = try init(allocator, &data); | 361 | var n = try init(allocator, &norm_data); |
| 375 | defer n.deinit(); | 362 | defer n.deinit(); |
| 376 | 363 | ||
| 377 | var result = try n.nfd(allocator, "Hello World!"); | 364 | var result = try n.nfd(allocator, "Hello World!"); |
| @@ -381,10 +368,10 @@ test "nfd ASCII / no-alloc" { | |||
| 381 | } | 368 | } |
| 382 | 369 | ||
| 383 | test "nfd !ASCII / alloc" { | 370 | test "nfd !ASCII / alloc" { |
| 384 | const allocator = std.testing.allocator; | 371 | const allocator = testing.allocator; |
| 385 | var data = try Data.init(allocator); | 372 | var norm_data = try NormData.init(allocator); |
| 386 | defer data.deinit(); | 373 | defer norm_data.deinit(); |
| 387 | var n = try init(allocator, &data); | 374 | var n = try init(allocator, &norm_data); |
| 388 | defer n.deinit(); | 375 | defer n.deinit(); |
| 389 | 376 | ||
| 390 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 377 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| @@ -394,10 +381,10 @@ test "nfd !ASCII / alloc" { | |||
| 394 | } | 381 | } |
| 395 | 382 | ||
| 396 | test "nfkd ASCII / no-alloc" { | 383 | test "nfkd ASCII / no-alloc" { |
| 397 | const allocator = std.testing.allocator; | 384 | const allocator = testing.allocator; |
| 398 | var data = try Data.init(allocator); | 385 | var norm_data = try NormData.init(allocator); |
| 399 | defer data.deinit(); | 386 | defer norm_data.deinit(); |
| 400 | var n = try init(allocator, &data); | 387 | var n = try init(allocator, &norm_data); |
| 401 | defer n.deinit(); | 388 | defer n.deinit(); |
| 402 | 389 | ||
| 403 | var result = try n.nfkd(allocator, "Hello World!"); | 390 | var result = try n.nfkd(allocator, "Hello World!"); |
| @@ -407,10 +394,10 @@ test "nfkd ASCII / no-alloc" { | |||
| 407 | } | 394 | } |
| 408 | 395 | ||
| 409 | test "nfkd !ASCII / alloc" { | 396 | test "nfkd !ASCII / alloc" { |
| 410 | const allocator = std.testing.allocator; | 397 | const allocator = testing.allocator; |
| 411 | var data = try Data.init(allocator); | 398 | var norm_data = try NormData.init(allocator); |
| 412 | defer data.deinit(); | 399 | defer norm_data.deinit(); |
| 413 | var n = try init(allocator, &data); | 400 | var n = try init(allocator, &norm_data); |
| 414 | defer n.deinit(); | 401 | defer n.deinit(); |
| 415 | 402 | ||
| 416 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 403 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| @@ -426,7 +413,7 @@ fn isHangul(cp: u21) bool { | |||
| 426 | } | 413 | } |
| 427 | 414 | ||
| 428 | fn isNonHangulStarter(self: Self, cp: u21) bool { | 415 | fn isNonHangulStarter(self: Self, cp: u21) bool { |
| 429 | return !isHangul(cp) and self.ccc_data.isStarter(cp); | 416 | return !isHangul(cp) and self.norm_data.ccc_data.isStarter(cp); |
| 430 | } | 417 | } |
| 431 | 418 | ||
| 432 | /// Normalizes `str` to NFC. | 419 | /// Normalizes `str` to NFC. |
| @@ -468,7 +455,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 468 | 455 | ||
| 469 | block_check: while (i < d_list.items.len) : (i += 1) { | 456 | block_check: while (i < d_list.items.len) : (i += 1) { |
| 470 | const C = d_list.items[i]; | 457 | const C = d_list.items[i]; |
| 471 | const cc_C = self.ccc_data.ccc(C); | 458 | const cc_C = self.norm_data.ccc_data.ccc(C); |
| 472 | var starter_index: ?usize = null; | 459 | var starter_index: ?usize = null; |
| 473 | var j: usize = i; | 460 | var j: usize = i; |
| 474 | 461 | ||
| @@ -476,10 +463,10 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 476 | j -= 1; | 463 | j -= 1; |
| 477 | 464 | ||
| 478 | // Check for starter. | 465 | // Check for starter. |
| 479 | if (self.ccc_data.isStarter(d_list.items[j])) { | 466 | if (self.norm_data.ccc_data.isStarter(d_list.items[j])) { |
| 480 | if (i - j > 1) { // If there's distance between the starting point and the current position. | 467 | if (i - j > 1) { // If there's distance between the starting point and the current position. |
| 481 | for (d_list.items[(j + 1)..i]) |B| { | 468 | for (d_list.items[(j + 1)..i]) |B| { |
| 482 | const cc_B = self.ccc_data.ccc(B); | 469 | const cc_B = self.norm_data.ccc_data.ccc(B); |
| 483 | // Check for blocking conditions. | 470 | // Check for blocking conditions. |
| 484 | if (isHangul(C)) { | 471 | if (isHangul(C)) { |
| 485 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; | 472 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; |
| @@ -563,10 +550,10 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 563 | } | 550 | } |
| 564 | 551 | ||
| 565 | test "nfc" { | 552 | test "nfc" { |
| 566 | const allocator = std.testing.allocator; | 553 | const allocator = testing.allocator; |
| 567 | var data = try Data.init(allocator); | 554 | var norm_data = try NormData.init(allocator); |
| 568 | defer data.deinit(); | 555 | defer norm_data.deinit(); |
| 569 | var n = try init(allocator, &data); | 556 | var n = try init(allocator, &norm_data); |
| 570 | defer n.deinit(); | 557 | defer n.deinit(); |
| 571 | 558 | ||
| 572 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 559 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| @@ -576,10 +563,10 @@ test "nfc" { | |||
| 576 | } | 563 | } |
| 577 | 564 | ||
| 578 | test "nfkc" { | 565 | test "nfkc" { |
| 579 | const allocator = std.testing.allocator; | 566 | const allocator = testing.allocator; |
| 580 | var data = try Data.init(allocator); | 567 | var norm_data = try NormData.init(allocator); |
| 581 | defer data.deinit(); | 568 | defer norm_data.deinit(); |
| 582 | var n = try init(allocator, &data); | 569 | var n = try init(allocator, &norm_data); |
| 583 | defer n.deinit(); | 570 | defer n.deinit(); |
| 584 | 571 | ||
| 585 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 572 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| @@ -637,10 +624,10 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u | |||
| 637 | } | 624 | } |
| 638 | 625 | ||
| 639 | test "eql" { | 626 | test "eql" { |
| 640 | const allocator = std.testing.allocator; | 627 | const allocator = testing.allocator; |
| 641 | var data = try Data.init(allocator); | 628 | var norm_data = try NormData.init(allocator); |
| 642 | defer data.deinit(); | 629 | defer norm_data.deinit(); |
| 643 | var n = try init(allocator, &data); | 630 | var n = try init(allocator, &norm_data); |
| 644 | defer n.deinit(); | 631 | defer n.deinit(); |
| 645 | 632 | ||
| 646 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 633 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| @@ -706,10 +693,10 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ | |||
| 706 | } | 693 | } |
| 707 | 694 | ||
| 708 | test "eqlCaseless" { | 695 | test "eqlCaseless" { |
| 709 | const allocator = std.testing.allocator; | 696 | const allocator = testing.allocator; |
| 710 | var data = try Data.init(allocator); | 697 | var norm_data = try NormData.init(allocator); |
| 711 | defer data.deinit(); | 698 | defer norm_data.deinit(); |
| 712 | var n = try init(allocator, &data); | 699 | var n = try init(allocator, &norm_data); |
| 713 | defer n.deinit(); | 700 | defer n.deinit(); |
| 714 | 701 | ||
| 715 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | 702 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -719,7 +706,7 @@ test "eqlCaseless" { | |||
| 719 | // FCD | 706 | // FCD |
| 720 | fn getLeadCcc(self: Self, cp: u21) u8 { | 707 | fn getLeadCcc(self: Self, cp: u21) u8 { |
| 721 | const dc = self.mapping(cp, .nfd); | 708 | const dc = self.mapping(cp, .nfd); |
| 722 | return self.ccc_data.ccc(dc.cps[0]); | 709 | return self.norm_data.ccc_data.ccc(dc.cps[0]); |
| 723 | } | 710 | } |
| 724 | 711 | ||
| 725 | fn getTrailCcc(self: Self, cp: u21) u8 { | 712 | fn getTrailCcc(self: Self, cp: u21) u8 { |
| @@ -727,7 +714,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 { | |||
| 727 | const len = for (dc.cps, 0..) |dcp, i| { | 714 | const len = for (dc.cps, 0..) |dcp, i| { |
| 728 | if (dcp == 0) break i; | 715 | if (dcp == 0) break i; |
| 729 | } else dc.cps.len; | 716 | } else dc.cps.len; |
| 730 | return self.ccc_data.ccc(dc.cps[len - 1]); | 717 | return self.norm_data.ccc_data.ccc(dc.cps[len - 1]); |
| 731 | } | 718 | } |
| 732 | 719 | ||
| 733 | /// Fast check to detect if a string is already in NFC or NFD form. | 720 | /// Fast check to detect if a string is already in NFC or NFD form. |
| @@ -743,10 +730,10 @@ pub fn isFcd(self: Self, str: []const u8) bool { | |||
| 743 | } | 730 | } |
| 744 | 731 | ||
| 745 | test "isFcd" { | 732 | test "isFcd" { |
| 746 | const allocator = std.testing.allocator; | 733 | const allocator = testing.allocator; |
| 747 | var data = try Data.init(allocator); | 734 | var norm_data = try NormData.init(allocator); |
| 748 | defer data.deinit(); | 735 | defer norm_data.deinit(); |
| 749 | var n = try init(allocator, &data); | 736 | var n = try init(allocator, &norm_data); |
| 750 | defer n.deinit(); | 737 | defer n.deinit(); |
| 751 | 738 | ||
| 752 | const is_nfc = "José \u{3D3}"; | 739 | const is_nfc = "José \u{3D3}"; |
| @@ -764,9 +751,9 @@ test "Unicode normalization tests" { | |||
| 764 | defer arena.deinit(); | 751 | defer arena.deinit(); |
| 765 | var allocator = arena.allocator(); | 752 | var allocator = arena.allocator(); |
| 766 | 753 | ||
| 767 | var data = try Data.init(allocator); | 754 | var norm_data = try NormData.init(allocator); |
| 768 | defer data.deinit(); | 755 | defer norm_data.deinit(); |
| 769 | var n = try init(allocator, &data); | 756 | var n = try init(allocator, &norm_data); |
| 770 | defer n.deinit(); | 757 | defer n.deinit(); |
| 771 | 758 | ||
| 772 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 759 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |