diff options
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 129 |
1 file changed, 36 insertions, 93 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 2e2e6e4..1434043 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
| @@ -12,57 +12,10 @@ const norm_props = @import("ziglyph").normalization_props; | |||
| 12 | 12 | ||
| 13 | pub const NormData = @import("NormData"); | 13 | pub const NormData = @import("NormData"); |
| 14 | 14 | ||
| 15 | nfkd_map: std.AutoHashMap(u21, [18]u21), | ||
| 16 | norm_data: *NormData, | 15 | norm_data: *NormData, |
| 17 | 16 | ||
| 18 | const Self = @This(); | 17 | const Self = @This(); |
| 19 | 18 | ||
| 20 | pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { | ||
| 21 | var self = Self{ | ||
| 22 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | ||
| 23 | .norm_data = norm_data, | ||
| 24 | }; | ||
| 25 | errdefer self.deinit(); | ||
| 26 | |||
| 27 | // Compatibility decompositions | ||
| 28 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); | ||
| 29 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); | ||
| 30 | var dekomp_decomp = try std.compress.deflate.decompressor(allocator, dekomp_stream.reader(), null); | ||
| 31 | defer dekomp_decomp.deinit(); | ||
| 32 | |||
| 33 | var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); | ||
| 34 | const dekomp_reader = dekomp_buf.reader(); | ||
| 35 | var buf: [4096]u8 = undefined; | ||
| 36 | |||
| 37 | while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 38 | if (line.len == 0) continue; | ||
| 39 | var fields = std.mem.split(u8, line, ";"); | ||
| 40 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 41 | var cps = [_]u21{0} ** 18; | ||
| 42 | var i: usize = 0; | ||
| 43 | |||
| 44 | while (fields.next()) |cp| : (i += 1) { | ||
| 45 | cps[i] = try std.fmt.parseInt(u21, cp, 16); | ||
| 46 | } | ||
| 47 | |||
| 48 | try self.nfkd_map.put(cp_a, cps); | ||
| 49 | } | ||
| 50 | |||
| 51 | return self; | ||
| 52 | } | ||
| 53 | |||
| 54 | pub fn deinit(self: *Self) void { | ||
| 55 | self.nfkd_map.deinit(); | ||
| 56 | } | ||
| 57 | |||
| 58 | test "init / deinit" { | ||
| 59 | const allocator = testing.allocator; | ||
| 60 | var norm_data = try NormData.init(allocator); | ||
| 61 | defer norm_data.deinit(); | ||
| 62 | var n = try init(allocator, &norm_data); | ||
| 63 | defer n.deinit(); | ||
| 64 | } | ||
| 65 | |||
| 66 | // Hangul processing utilities. | 19 | // Hangul processing utilities. |
| 67 | fn isHangulPrecomposed(cp: u21) bool { | 20 | fn isHangulPrecomposed(cp: u21) bool { |
| 68 | if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT; | 21 | if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT; |
| @@ -140,10 +93,11 @@ pub fn mapping(self: Self, cp: u21, form: Form) Decomp { | |||
| 140 | @memcpy(dc.cps[0..len], canon_dc[0..len]); | 93 | @memcpy(dc.cps[0..len], canon_dc[0..len]); |
| 141 | } | 94 | } |
| 142 | 95 | ||
| 143 | if (self.nfkd_map.get(cp)) |array| { | 96 | const compat_dc = self.norm_data.compat_data.toNfkd(cp); |
| 97 | if (compat_dc.len != 0) { | ||
| 144 | if (form != .nfd) { | 98 | if (form != .nfd) { |
| 145 | dc.form = .nfkd; | 99 | dc.form = .nfkd; |
| 146 | @memcpy(dc.cps[0..array.len], &array); | 100 | @memcpy(dc.cps[0..compat_dc.len], compat_dc); |
| 147 | } | 101 | } |
| 148 | } | 102 | } |
| 149 | 103 | ||
| @@ -210,10 +164,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 210 | 164 | ||
| 211 | test "decompose" { | 165 | test "decompose" { |
| 212 | const allocator = testing.allocator; | 166 | const allocator = testing.allocator; |
| 213 | var norm_data = try NormData.init(allocator); | 167 | var data = try NormData.init(allocator); |
| 214 | defer norm_data.deinit(); | 168 | defer data.deinit(); |
| 215 | var n = try init(allocator, &norm_data); | 169 | var n = Self{ .norm_data = &data }; |
| 216 | defer n.deinit(); | ||
| 217 | 170 | ||
| 218 | var dc = n.decompose('é', .nfd); | 171 | var dc = n.decompose('é', .nfd); |
| 219 | try std.testing.expect(dc.form == .nfd); | 172 | try std.testing.expect(dc.form == .nfd); |
| @@ -334,10 +287,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 334 | 287 | ||
| 335 | test "nfd ASCII / no-alloc" { | 288 | test "nfd ASCII / no-alloc" { |
| 336 | const allocator = testing.allocator; | 289 | const allocator = testing.allocator; |
| 337 | var norm_data = try NormData.init(allocator); | 290 | var data = try NormData.init(allocator); |
| 338 | defer norm_data.deinit(); | 291 | defer data.deinit(); |
| 339 | var n = try init(allocator, &norm_data); | 292 | var n = Self{ .norm_data = &data }; |
| 340 | defer n.deinit(); | ||
| 341 | 293 | ||
| 342 | var result = try n.nfd(allocator, "Hello World!"); | 294 | var result = try n.nfd(allocator, "Hello World!"); |
| 343 | defer result.deinit(); | 295 | defer result.deinit(); |
| @@ -347,10 +299,9 @@ test "nfd ASCII / no-alloc" { | |||
| 347 | 299 | ||
| 348 | test "nfd !ASCII / alloc" { | 300 | test "nfd !ASCII / alloc" { |
| 349 | const allocator = testing.allocator; | 301 | const allocator = testing.allocator; |
| 350 | var norm_data = try NormData.init(allocator); | 302 | var data = try NormData.init(allocator); |
| 351 | defer norm_data.deinit(); | 303 | defer data.deinit(); |
| 352 | var n = try init(allocator, &norm_data); | 304 | var n = Self{ .norm_data = &data }; |
| 353 | defer n.deinit(); | ||
| 354 | 305 | ||
| 355 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 306 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| 356 | defer result.deinit(); | 307 | defer result.deinit(); |
| @@ -360,10 +311,9 @@ test "nfd !ASCII / alloc" { | |||
| 360 | 311 | ||
| 361 | test "nfkd ASCII / no-alloc" { | 312 | test "nfkd ASCII / no-alloc" { |
| 362 | const allocator = testing.allocator; | 313 | const allocator = testing.allocator; |
| 363 | var norm_data = try NormData.init(allocator); | 314 | var data = try NormData.init(allocator); |
| 364 | defer norm_data.deinit(); | 315 | defer data.deinit(); |
| 365 | var n = try init(allocator, &norm_data); | 316 | var n = Self{ .norm_data = &data }; |
| 366 | defer n.deinit(); | ||
| 367 | 317 | ||
| 368 | var result = try n.nfkd(allocator, "Hello World!"); | 318 | var result = try n.nfkd(allocator, "Hello World!"); |
| 369 | defer result.deinit(); | 319 | defer result.deinit(); |
| @@ -373,10 +323,9 @@ test "nfkd ASCII / no-alloc" { | |||
| 373 | 323 | ||
| 374 | test "nfkd !ASCII / alloc" { | 324 | test "nfkd !ASCII / alloc" { |
| 375 | const allocator = testing.allocator; | 325 | const allocator = testing.allocator; |
| 376 | var norm_data = try NormData.init(allocator); | 326 | var data = try NormData.init(allocator); |
| 377 | defer norm_data.deinit(); | 327 | defer data.deinit(); |
| 378 | var n = try init(allocator, &norm_data); | 328 | var n = Self{ .norm_data = &data }; |
| 379 | defer n.deinit(); | ||
| 380 | 329 | ||
| 381 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 330 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| 382 | defer result.deinit(); | 331 | defer result.deinit(); |
| @@ -529,10 +478,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 529 | 478 | ||
| 530 | test "nfc" { | 479 | test "nfc" { |
| 531 | const allocator = testing.allocator; | 480 | const allocator = testing.allocator; |
| 532 | var norm_data = try NormData.init(allocator); | 481 | var data = try NormData.init(allocator); |
| 533 | defer norm_data.deinit(); | 482 | defer data.deinit(); |
| 534 | var n = try init(allocator, &norm_data); | 483 | var n = Self{ .norm_data = &data }; |
| 535 | defer n.deinit(); | ||
| 536 | 484 | ||
| 537 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 485 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| 538 | defer result.deinit(); | 486 | defer result.deinit(); |
| @@ -542,10 +490,9 @@ test "nfc" { | |||
| 542 | 490 | ||
| 543 | test "nfkc" { | 491 | test "nfkc" { |
| 544 | const allocator = testing.allocator; | 492 | const allocator = testing.allocator; |
| 545 | var norm_data = try NormData.init(allocator); | 493 | var data = try NormData.init(allocator); |
| 546 | defer norm_data.deinit(); | 494 | defer data.deinit(); |
| 547 | var n = try init(allocator, &norm_data); | 495 | var n = Self{ .norm_data = &data }; |
| 548 | defer n.deinit(); | ||
| 549 | 496 | ||
| 550 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 497 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| 551 | defer result.deinit(); | 498 | defer result.deinit(); |
| @@ -603,10 +550,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u | |||
| 603 | 550 | ||
| 604 | test "eql" { | 551 | test "eql" { |
| 605 | const allocator = testing.allocator; | 552 | const allocator = testing.allocator; |
| 606 | var norm_data = try NormData.init(allocator); | 553 | var data = try NormData.init(allocator); |
| 607 | defer norm_data.deinit(); | 554 | defer data.deinit(); |
| 608 | var n = try init(allocator, &norm_data); | 555 | var n = Self{ .norm_data = &data }; |
| 609 | defer n.deinit(); | ||
| 610 | 556 | ||
| 611 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 557 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| 612 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 558 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -672,10 +618,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ | |||
| 672 | 618 | ||
| 673 | test "eqlCaseless" { | 619 | test "eqlCaseless" { |
| 674 | const allocator = testing.allocator; | 620 | const allocator = testing.allocator; |
| 675 | var norm_data = try NormData.init(allocator); | 621 | var data = try NormData.init(allocator); |
| 676 | defer norm_data.deinit(); | 622 | defer data.deinit(); |
| 677 | var n = try init(allocator, &norm_data); | 623 | var n = Self{ .norm_data = &data }; |
| 678 | defer n.deinit(); | ||
| 679 | 624 | ||
| 680 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | 625 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); |
| 681 | try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // foÉ == foé | 626 | try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // foÉ == foé |
| @@ -709,10 +654,9 @@ pub fn isFcd(self: Self, str: []const u8) bool { | |||
| 709 | 654 | ||
| 710 | test "isFcd" { | 655 | test "isFcd" { |
| 711 | const allocator = testing.allocator; | 656 | const allocator = testing.allocator; |
| 712 | var norm_data = try NormData.init(allocator); | 657 | var data = try NormData.init(allocator); |
| 713 | defer norm_data.deinit(); | 658 | defer data.deinit(); |
| 714 | var n = try init(allocator, &norm_data); | 659 | var n = Self{ .norm_data = &data }; |
| 715 | defer n.deinit(); | ||
| 716 | 660 | ||
| 717 | const is_nfc = "José \u{3D3}"; | 661 | const is_nfc = "José \u{3D3}"; |
| 718 | try std.testing.expect(n.isFcd(is_nfc)); | 662 | try std.testing.expect(n.isFcd(is_nfc)); |
| @@ -729,10 +673,9 @@ test "Unicode normalization tests" { | |||
| 729 | defer arena.deinit(); | 673 | defer arena.deinit(); |
| 730 | var allocator = arena.allocator(); | 674 | var allocator = arena.allocator(); |
| 731 | 675 | ||
| 732 | var norm_data = try NormData.init(allocator); | 676 | var data = try NormData.init(allocator); |
| 733 | defer norm_data.deinit(); | 677 | defer data.deinit(); |
| 734 | var n = try init(allocator, &norm_data); | 678 | var n = Self{ .norm_data = &data }; |
| 735 | defer n.deinit(); | ||
| 736 | 679 | ||
| 737 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 680 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
| 738 | defer file.close(); | 681 | defer file.close(); |