summary | refs | log | tree | commit | diff
path: root/src/Normalizer.zig
diff options
context:
space:
mode:
author: Jose Colon Rodriguez, 2024-02-26 18:54:11 -0400
committer: Jose Colon Rodriguez, 2024-02-26 18:54:11 -0400
commit: 7d8f330db2bfb625a054eb7e21d397ff696c0b3f (patch)
tree: ce62411836c8b5b8bd5addff4c0a66422a9922cc /src/Normalizer.zig
parent: Using NormData and CanonData in Normalizer (diff)
download: zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.tar.gz
zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.tar.xz
zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.zip
Using NormData nfc and nfd
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r-- src/Normalizer.zig 28
1 files changed, 3 insertions, 25 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 848cf20..2e2e6e4 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -12,7 +12,6 @@ const norm_props = @import("ziglyph").normalization_props;
12 12
13pub const NormData = @import("NormData"); 13pub const NormData = @import("NormData");
14 14
15nfc_map: std.AutoHashMap([2]u21, u21),
16nfkd_map: std.AutoHashMap(u21, [18]u21), 15nfkd_map: std.AutoHashMap(u21, [18]u21),
17norm_data: *NormData, 16norm_data: *NormData,
18 17
@@ -20,40 +19,20 @@ const Self = @This();
20 19
21pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { 20pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self {
22 var self = Self{ 21 var self = Self{
23 .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator),
24 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), 22 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
25 .norm_data = norm_data, 23 .norm_data = norm_data,
26 }; 24 };
27 errdefer self.deinit(); 25 errdefer self.deinit();
28 26
29 // Canonical compositions
30 const decompressor = std.compress.deflate.decompressor;
31 const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate");
32 var comp_stream = std.io.fixedBufferStream(comp_file);
33 var comp_decomp = try decompressor(allocator, comp_stream.reader(), null);
34 defer comp_decomp.deinit();
35
36 var comp_buf = std.io.bufferedReader(comp_decomp.reader());
37 const comp_reader = comp_buf.reader();
38 var buf: [4096]u8 = undefined;
39
40 while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
41 if (line.len == 0) continue;
42 var fields = std.mem.split(u8, line, ";");
43 const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16);
44 const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16);
45 const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16);
46 try self.nfc_map.put(.{ cp_a, cp_b }, cp_c);
47 }
48
49 // Compatibility decompositions 27 // Compatibility decompositions
50 const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); 28 const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate");
51 var dekomp_stream = std.io.fixedBufferStream(dekomp_file); 29 var dekomp_stream = std.io.fixedBufferStream(dekomp_file);
52 var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null); 30 var dekomp_decomp = try std.compress.deflate.decompressor(allocator, dekomp_stream.reader(), null);
53 defer dekomp_decomp.deinit(); 31 defer dekomp_decomp.deinit();
54 32
55 var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); 33 var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader());
56 const dekomp_reader = dekomp_buf.reader(); 34 const dekomp_reader = dekomp_buf.reader();
35 var buf: [4096]u8 = undefined;
57 36
58 while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { 37 while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
59 if (line.len == 0) continue; 38 if (line.len == 0) continue;
@@ -73,7 +52,6 @@ pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self {
73} 52}
74 53
75pub fn deinit(self: *Self) void { 54pub fn deinit(self: *Self) void {
76 self.nfc_map.deinit();
77 self.nfkd_map.deinit(); 55 self.nfkd_map.deinit();
78} 56}
79 57
@@ -510,7 +488,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
510 488
511 if (!processed_hangul) { 489 if (!processed_hangul) {
512 // L -> C not Hangul. 490 // L -> C not Hangul.
513 if (self.nfc_map.get(.{ L, C })) |P| { 491 if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
514 if (!norm_props.isFcx(P)) { 492 if (!norm_props.isFcx(P)) {
515 d_list.items[sidx] = P; 493 d_list.items[sidx] = P;
516 d_list.items[i] = tombstone; // Mark for deletion. 494 d_list.items[i] = tombstone; // Mark for deletion.