summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-26 18:54:11 -0400
committerGravatar Jose Colon Rodriguez2024-02-26 18:54:11 -0400
commit 7d8f330db2bfb625a054eb7e21d397ff696c0b3f (patch)
tree ce62411836c8b5b8bd5addff4c0a66422a9922cc
parent Using NormData and CanonData in Normalizer (diff)
download zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.tar.gz
zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.tar.xz
zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.zip
Using NormData nfc and nfd
-rw-r--r-- src/Canonical.zig 13
-rw-r--r-- src/Normalizer.zig 28
-rw-r--r-- src/autogen/canonical_compositions.txt.deflate bin 4410 -> 0 bytes
-rw-r--r-- src/autogen/canonical_decompositions.txt.deflate bin 9541 -> 0 bytes
-rw-r--r-- src/main.zig 2
5 files changed, 16 insertions, 27 deletions
diff --git a/src/Canonical.zig b/src/Canonical.zig
index d54e828..81d3eec 100644
--- a/src/Canonical.zig
+++ b/src/Canonical.zig
@@ -4,6 +4,7 @@ const compress = std.compress;
4const mem = std.mem; 4const mem = std.mem;
5 5
6allocator: mem.Allocator, 6allocator: mem.Allocator,
7nfc: std.AutoHashMap([2]u21, u21),
7nfd: [][2]u21 = undefined, 8nfd: [][2]u21 = undefined,
8 9
9const Self = @This(); 10const Self = @This();
@@ -19,6 +20,7 @@ pub fn init(allocator: mem.Allocator) !Self {
19 const endian = builtin.cpu.arch.endian(); 20 const endian = builtin.cpu.arch.endian();
20 var self = Self{ 21 var self = Self{
21 .allocator = allocator, 22 .allocator = allocator,
23 .nfc = std.AutoHashMap([2]u21, u21).init(allocator),
22 .nfd = try allocator.alloc([2]u21, 0x110000), 24 .nfd = try allocator.alloc([2]u21, 0x110000),
23 }; 25 };
24 26
@@ -29,13 +31,17 @@ pub fn init(allocator: mem.Allocator) !Self {
29 if (len == 0) break; 31 if (len == 0) break;
30 const cp = try reader.readInt(u24, endian); 32 const cp = try reader.readInt(u24, endian);
31 self.nfd[cp][0] = @intCast(try reader.readInt(u24, endian)); 33 self.nfd[cp][0] = @intCast(try reader.readInt(u24, endian));
32 if (len == 3) self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian)); 34 if (len == 3) {
35 self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian));
36 try self.nfc.put(self.nfd[cp], @intCast(cp));
37 }
33 } 38 }
34 39
35 return self; 40 return self;
36} 41}
37 42
38pub fn deinit(self: *Self) void { 43pub fn deinit(self: *Self) void {
44 self.nfc.deinit();
39 self.allocator.free(self.nfd); 45 self.allocator.free(self.nfd);
40} 46}
41 47
@@ -43,3 +49,8 @@ pub fn deinit(self: *Self) void {
43pub inline fn toNfd(self: Self, cp: u21) [2]u21 { 49pub inline fn toNfd(self: Self, cp: u21) [2]u21 {
44 return self.nfd[cp]; 50 return self.nfd[cp];
45} 51}
52
53// Returns the primary composite for the codepoints in `cps`.
54pub inline fn toNfc(self: Self, cps: [2]u21) ?u21 {
55 return self.nfc.get(cps);
56}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 848cf20..2e2e6e4 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -12,7 +12,6 @@ const norm_props = @import("ziglyph").normalization_props;
12 12
13pub const NormData = @import("NormData"); 13pub const NormData = @import("NormData");
14 14
15nfc_map: std.AutoHashMap([2]u21, u21),
16nfkd_map: std.AutoHashMap(u21, [18]u21), 15nfkd_map: std.AutoHashMap(u21, [18]u21),
17norm_data: *NormData, 16norm_data: *NormData,
18 17
@@ -20,40 +19,20 @@ const Self = @This();
20 19
21pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { 20pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self {
22 var self = Self{ 21 var self = Self{
23 .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator),
24 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), 22 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
25 .norm_data = norm_data, 23 .norm_data = norm_data,
26 }; 24 };
27 errdefer self.deinit(); 25 errdefer self.deinit();
28 26
29 // Canonical compositions
30 const decompressor = std.compress.deflate.decompressor;
31 const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate");
32 var comp_stream = std.io.fixedBufferStream(comp_file);
33 var comp_decomp = try decompressor(allocator, comp_stream.reader(), null);
34 defer comp_decomp.deinit();
35
36 var comp_buf = std.io.bufferedReader(comp_decomp.reader());
37 const comp_reader = comp_buf.reader();
38 var buf: [4096]u8 = undefined;
39
40 while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
41 if (line.len == 0) continue;
42 var fields = std.mem.split(u8, line, ";");
43 const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16);
44 const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16);
45 const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16);
46 try self.nfc_map.put(.{ cp_a, cp_b }, cp_c);
47 }
48
49 // Compatibility decompositions 27 // Compatibility decompositions
50 const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); 28 const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate");
51 var dekomp_stream = std.io.fixedBufferStream(dekomp_file); 29 var dekomp_stream = std.io.fixedBufferStream(dekomp_file);
52 var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null); 30 var dekomp_decomp = try std.compress.deflate.decompressor(allocator, dekomp_stream.reader(), null);
53 defer dekomp_decomp.deinit(); 31 defer dekomp_decomp.deinit();
54 32
55 var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); 33 var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader());
56 const dekomp_reader = dekomp_buf.reader(); 34 const dekomp_reader = dekomp_buf.reader();
35 var buf: [4096]u8 = undefined;
57 36
58 while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { 37 while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| {
59 if (line.len == 0) continue; 38 if (line.len == 0) continue;
@@ -73,7 +52,6 @@ pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self {
73} 52}
74 53
75pub fn deinit(self: *Self) void { 54pub fn deinit(self: *Self) void {
76 self.nfc_map.deinit();
77 self.nfkd_map.deinit(); 55 self.nfkd_map.deinit();
78} 56}
79 57
@@ -510,7 +488,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
510 488
511 if (!processed_hangul) { 489 if (!processed_hangul) {
512 // L -> C not Hangul. 490 // L -> C not Hangul.
513 if (self.nfc_map.get(.{ L, C })) |P| { 491 if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
514 if (!norm_props.isFcx(P)) { 492 if (!norm_props.isFcx(P)) {
515 d_list.items[sidx] = P; 493 d_list.items[sidx] = P;
516 d_list.items[i] = tombstone; // Mark for deletion. 494 d_list.items[i] = tombstone; // Mark for deletion.
diff --git a/src/autogen/canonical_compositions.txt.deflate b/src/autogen/canonical_compositions.txt.deflate
deleted file mode 100644
index 4ca2593..0000000
--- a/src/autogen/canonical_compositions.txt.deflate
+++ /dev/null
Binary files differ
diff --git a/src/autogen/canonical_decompositions.txt.deflate b/src/autogen/canonical_decompositions.txt.deflate
deleted file mode 100644
index 5169e34..0000000
--- a/src/autogen/canonical_decompositions.txt.deflate
+++ /dev/null
Binary files differ
diff --git a/src/main.zig b/src/main.zig
index d1a0bb3..05c2ea4 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -52,7 +52,7 @@ pub fn main() !void {
52 // while (iter.next()) |_| result += 1; 52 // while (iter.next()) |_| result += 1;
53 // while (iter.next()) |line| result += strWidth(line, &data); 53 // while (iter.next()) |line| result += strWidth(line, &data);
54 while (iter.next()) |line| { 54 while (iter.next()) |line| {
55 var nfc = try n.nfd(allocator, line); 55 var nfc = try n.nfc(allocator, line);
56 result += nfc.slice.len; 56 result += nfc.slice.len;
57 nfc.deinit(); 57 nfc.deinit();
58 } 58 }