summary | refs | log | tree | commit | diff
path: root/src/Normalizer.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r-- src/Normalizer.zig 129
1 files changed, 36 insertions, 93 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 2e2e6e4..1434043 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -12,57 +12,10 @@ const norm_props = @import("ziglyph").normalization_props;
12 12
13pub const NormData = @import("NormData"); 13pub const NormData = @import("NormData");
14 14
15nfkd_map: std.AutoHashMap(u21, [18]u21),
16norm_data: *NormData, 15norm_data: *NormData,
17 16
18const Self = @This(); 17const Self = @This();
19 18
/// Builds a normalizer whose compatibility-decomposition table (`nfkd_map`)
/// is loaded from the deflate-compressed data file embedded at compile time.
///
/// Every non-empty line of the decompressed data is a `;`-separated list of
/// hexadecimal code points: the first field becomes the map key, the
/// remaining fields (at most 18) are stored, zero-padded, as its value.
///
/// `allocator` backs both the temporary decompressor and the returned map;
/// release the map with `deinit`. The `norm_data` pointer is stored as given.
///
/// Errors: allocation, decompression, read and hex-parse failures are
/// propagated; `error.DecompositionTooLong` is returned when a line carries
/// more mapped code points than the fixed-size value array can hold.
pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self {
    var self = Self{
        .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
        .norm_data = norm_data,
    };
    // Free any entries already inserted if a later step fails.
    errdefer self.deinit();

    // Compatibility decompositions
    const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate");
    var dekomp_stream = std.io.fixedBufferStream(dekomp_file);
    var dekomp_decomp = try std.compress.deflate.decompressor(allocator, dekomp_stream.reader(), null);
    defer dekomp_decomp.deinit();

    var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader());
    const dekomp_reader = dekomp_buf.reader();
    var buf: [4096]u8 = undefined;

    while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        if (line.len == 0) continue;
        var fields = std.mem.split(u8, line, ";");
        // A split iterator always yields at least one field, so this unwrap is safe.
        const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16);
        var cps = [_]u21{0} ** 18;
        var i: usize = 0;

        while (fields.next()) |cp| : (i += 1) {
            // Reject over-long lines instead of indexing past the end of `cps`.
            if (i == cps.len) return error.DecompositionTooLong;
            cps[i] = try std.fmt.parseInt(u21, cp, 16);
        }

        try self.nfkd_map.put(cp_a, cps);
    }

    return self;
}
53
/// Releases the memory held by the `nfkd_map` hash map.
pub fn deinit(self: *Self) void {
    const map = &self.nfkd_map;
    map.deinit();
}
57
test "init / deinit" {
    // Smoke test: build the shared normalization data and a normalizer on
    // top of it, then tear both down in reverse order via `defer`.
    var data = try NormData.init(testing.allocator);
    defer data.deinit();

    var normalizer = try Self.init(testing.allocator, &data);
    defer normalizer.deinit();
}
65
66// Hangul processing utilities. 19// Hangul processing utilities.
67fn isHangulPrecomposed(cp: u21) bool { 20fn isHangulPrecomposed(cp: u21) bool {
68 if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT; 21 if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT;
@@ -140,10 +93,11 @@ pub fn mapping(self: Self, cp: u21, form: Form) Decomp {
140 @memcpy(dc.cps[0..len], canon_dc[0..len]); 93 @memcpy(dc.cps[0..len], canon_dc[0..len]);
141 } 94 }
142 95
143 if (self.nfkd_map.get(cp)) |array| { 96 const compat_dc = self.norm_data.compat_data.toNfkd(cp);
97 if (compat_dc.len != 0) {
144 if (form != .nfd) { 98 if (form != .nfd) {
145 dc.form = .nfkd; 99 dc.form = .nfkd;
146 @memcpy(dc.cps[0..array.len], &array); 100 @memcpy(dc.cps[0..compat_dc.len], compat_dc);
147 } 101 }
148 } 102 }
149 103
@@ -210,10 +164,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
210 164
211test "decompose" { 165test "decompose" {
212 const allocator = testing.allocator; 166 const allocator = testing.allocator;
213 var norm_data = try NormData.init(allocator); 167 var data = try NormData.init(allocator);
214 defer norm_data.deinit(); 168 defer data.deinit();
215 var n = try init(allocator, &norm_data); 169 var n = Self{ .norm_data = &data };
216 defer n.deinit();
217 170
218 var dc = n.decompose('é', .nfd); 171 var dc = n.decompose('é', .nfd);
219 try std.testing.expect(dc.form == .nfd); 172 try std.testing.expect(dc.form == .nfd);
@@ -334,10 +287,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
334 287
335test "nfd ASCII / no-alloc" { 288test "nfd ASCII / no-alloc" {
336 const allocator = testing.allocator; 289 const allocator = testing.allocator;
337 var norm_data = try NormData.init(allocator); 290 var data = try NormData.init(allocator);
338 defer norm_data.deinit(); 291 defer data.deinit();
339 var n = try init(allocator, &norm_data); 292 var n = Self{ .norm_data = &data };
340 defer n.deinit();
341 293
342 var result = try n.nfd(allocator, "Hello World!"); 294 var result = try n.nfd(allocator, "Hello World!");
343 defer result.deinit(); 295 defer result.deinit();
@@ -347,10 +299,9 @@ test "nfd ASCII / no-alloc" {
347 299
348test "nfd !ASCII / alloc" { 300test "nfd !ASCII / alloc" {
349 const allocator = testing.allocator; 301 const allocator = testing.allocator;
350 var norm_data = try NormData.init(allocator); 302 var data = try NormData.init(allocator);
351 defer norm_data.deinit(); 303 defer data.deinit();
352 var n = try init(allocator, &norm_data); 304 var n = Self{ .norm_data = &data };
353 defer n.deinit();
354 305
355 var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 306 var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
356 defer result.deinit(); 307 defer result.deinit();
@@ -360,10 +311,9 @@ test "nfd !ASCII / alloc" {
360 311
361test "nfkd ASCII / no-alloc" { 312test "nfkd ASCII / no-alloc" {
362 const allocator = testing.allocator; 313 const allocator = testing.allocator;
363 var norm_data = try NormData.init(allocator); 314 var data = try NormData.init(allocator);
364 defer norm_data.deinit(); 315 defer data.deinit();
365 var n = try init(allocator, &norm_data); 316 var n = Self{ .norm_data = &data };
366 defer n.deinit();
367 317
368 var result = try n.nfkd(allocator, "Hello World!"); 318 var result = try n.nfkd(allocator, "Hello World!");
369 defer result.deinit(); 319 defer result.deinit();
@@ -373,10 +323,9 @@ test "nfkd ASCII / no-alloc" {
373 323
374test "nfkd !ASCII / alloc" { 324test "nfkd !ASCII / alloc" {
375 const allocator = testing.allocator; 325 const allocator = testing.allocator;
376 var norm_data = try NormData.init(allocator); 326 var data = try NormData.init(allocator);
377 defer norm_data.deinit(); 327 defer data.deinit();
378 var n = try init(allocator, &norm_data); 328 var n = Self{ .norm_data = &data };
379 defer n.deinit();
380 329
381 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 330 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
382 defer result.deinit(); 331 defer result.deinit();
@@ -529,10 +478,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
529 478
530test "nfc" { 479test "nfc" {
531 const allocator = testing.allocator; 480 const allocator = testing.allocator;
532 var norm_data = try NormData.init(allocator); 481 var data = try NormData.init(allocator);
533 defer norm_data.deinit(); 482 defer data.deinit();
534 var n = try init(allocator, &norm_data); 483 var n = Self{ .norm_data = &data };
535 defer n.deinit();
536 484
537 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 485 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
538 defer result.deinit(); 486 defer result.deinit();
@@ -542,10 +490,9 @@ test "nfc" {
542 490
543test "nfkc" { 491test "nfkc" {
544 const allocator = testing.allocator; 492 const allocator = testing.allocator;
545 var norm_data = try NormData.init(allocator); 493 var data = try NormData.init(allocator);
546 defer norm_data.deinit(); 494 defer data.deinit();
547 var n = try init(allocator, &norm_data); 495 var n = Self{ .norm_data = &data };
548 defer n.deinit();
549 496
550 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 497 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
551 defer result.deinit(); 498 defer result.deinit();
@@ -603,10 +550,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u
603 550
604test "eql" { 551test "eql" {
605 const allocator = testing.allocator; 552 const allocator = testing.allocator;
606 var norm_data = try NormData.init(allocator); 553 var data = try NormData.init(allocator);
607 defer norm_data.deinit(); 554 defer data.deinit();
608 var n = try init(allocator, &norm_data); 555 var n = Self{ .norm_data = &data };
609 defer n.deinit();
610 556
611 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 557 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
612 try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); 558 try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
@@ -672,10 +618,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [
672 618
673test "eqlCaseless" { 619test "eqlCaseless" {
674 const allocator = testing.allocator; 620 const allocator = testing.allocator;
675 var norm_data = try NormData.init(allocator); 621 var data = try NormData.init(allocator);
676 defer norm_data.deinit(); 622 defer data.deinit();
677 var n = try init(allocator, &norm_data); 623 var n = Self{ .norm_data = &data };
678 defer n.deinit();
679 624
680 try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); 625 try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}"));
681 try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // foÉ == foé 626 try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // foÉ == foé
@@ -709,10 +654,9 @@ pub fn isFcd(self: Self, str: []const u8) bool {
709 654
710test "isFcd" { 655test "isFcd" {
711 const allocator = testing.allocator; 656 const allocator = testing.allocator;
712 var norm_data = try NormData.init(allocator); 657 var data = try NormData.init(allocator);
713 defer norm_data.deinit(); 658 defer data.deinit();
714 var n = try init(allocator, &norm_data); 659 var n = Self{ .norm_data = &data };
715 defer n.deinit();
716 660
717 const is_nfc = "José \u{3D3}"; 661 const is_nfc = "José \u{3D3}";
718 try std.testing.expect(n.isFcd(is_nfc)); 662 try std.testing.expect(n.isFcd(is_nfc));
@@ -729,10 +673,9 @@ test "Unicode normalization tests" {
729 defer arena.deinit(); 673 defer arena.deinit();
730 var allocator = arena.allocator(); 674 var allocator = arena.allocator();
731 675
732 var norm_data = try NormData.init(allocator); 676 var data = try NormData.init(allocator);
733 defer norm_data.deinit(); 677 defer data.deinit();
734 var n = try init(allocator, &norm_data); 678 var n = Self{ .norm_data = &data };
735 defer n.deinit();
736 679
737 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 680 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
738 defer file.close(); 681 defer file.close();