diff options
| author | 2024-02-26 18:54:11 -0400 | |
|---|---|---|
| committer | 2024-02-26 18:54:11 -0400 | |
| commit | 7d8f330db2bfb625a054eb7e21d397ff696c0b3f (patch) | |
| tree | ce62411836c8b5b8bd5addff4c0a66422a9922cc | |
| parent | Using NormData and CanonData in Normalizer (diff) | |
| download | zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.tar.gz zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.tar.xz zg-7d8f330db2bfb625a054eb7e21d397ff696c0b3f.zip | |
Using NormData nfc and nfd
| -rw-r--r-- | src/Canonical.zig | 13 | ||||
| -rw-r--r-- | src/Normalizer.zig | 28 | ||||
| -rw-r--r-- | src/autogen/canonical_compositions.txt.deflate | bin | 4410 -> 0 bytes | |||
| -rw-r--r-- | src/autogen/canonical_decompositions.txt.deflate | bin | 9541 -> 0 bytes | |||
| -rw-r--r-- | src/main.zig | 2 |
5 files changed, 16 insertions, 27 deletions
diff --git a/src/Canonical.zig b/src/Canonical.zig index d54e828..81d3eec 100644 --- a/src/Canonical.zig +++ b/src/Canonical.zig | |||
| @@ -4,6 +4,7 @@ const compress = std.compress; | |||
| 4 | const mem = std.mem; | 4 | const mem = std.mem; |
| 5 | 5 | ||
| 6 | allocator: mem.Allocator, | 6 | allocator: mem.Allocator, |
| 7 | nfc: std.AutoHashMap([2]u21, u21), | ||
| 7 | nfd: [][2]u21 = undefined, | 8 | nfd: [][2]u21 = undefined, |
| 8 | 9 | ||
| 9 | const Self = @This(); | 10 | const Self = @This(); |
| @@ -19,6 +20,7 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 19 | const endian = builtin.cpu.arch.endian(); | 20 | const endian = builtin.cpu.arch.endian(); |
| 20 | var self = Self{ | 21 | var self = Self{ |
| 21 | .allocator = allocator, | 22 | .allocator = allocator, |
| 23 | .nfc = std.AutoHashMap([2]u21, u21).init(allocator), | ||
| 22 | .nfd = try allocator.alloc([2]u21, 0x110000), | 24 | .nfd = try allocator.alloc([2]u21, 0x110000), |
| 23 | }; | 25 | }; |
| 24 | 26 | ||
| @@ -29,13 +31,17 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 29 | if (len == 0) break; | 31 | if (len == 0) break; |
| 30 | const cp = try reader.readInt(u24, endian); | 32 | const cp = try reader.readInt(u24, endian); |
| 31 | self.nfd[cp][0] = @intCast(try reader.readInt(u24, endian)); | 33 | self.nfd[cp][0] = @intCast(try reader.readInt(u24, endian)); |
| 32 | if (len == 3) self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian)); | 34 | if (len == 3) { |
| 35 | self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian)); | ||
| 36 | try self.nfc.put(self.nfd[cp], @intCast(cp)); | ||
| 37 | } | ||
| 33 | } | 38 | } |
| 34 | 39 | ||
| 35 | return self; | 40 | return self; |
| 36 | } | 41 | } |
| 37 | 42 | ||
| 38 | pub fn deinit(self: *Self) void { | 43 | pub fn deinit(self: *Self) void { |
| 44 | self.nfc.deinit(); | ||
| 39 | self.allocator.free(self.nfd); | 45 | self.allocator.free(self.nfd); |
| 40 | } | 46 | } |
| 41 | 47 | ||
| @@ -43,3 +49,8 @@ pub fn deinit(self: *Self) void { | |||
| 43 | pub inline fn toNfd(self: Self, cp: u21) [2]u21 { | 49 | pub inline fn toNfd(self: Self, cp: u21) [2]u21 { |
| 44 | return self.nfd[cp]; | 50 | return self.nfd[cp]; |
| 45 | } | 51 | } |
| 52 | |||
| 53 | // Returns the primary composite for the codepoints in `cps`. | ||
| 54 | pub inline fn toNfc(self: Self, cps: [2]u21) ?u21 { | ||
| 55 | return self.nfc.get(cps); | ||
| 56 | } | ||
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 848cf20..2e2e6e4 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -12,7 +12,6 @@ const norm_props = @import("ziglyph").normalization_props; | |||
| 12 | 12 | ||
| 13 | pub const NormData = @import("NormData"); | 13 | pub const NormData = @import("NormData"); |
| 14 | 14 | ||
| 15 | nfc_map: std.AutoHashMap([2]u21, u21), | ||
| 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), | 15 | nfkd_map: std.AutoHashMap(u21, [18]u21), |
| 17 | norm_data: *NormData, | 16 | norm_data: *NormData, |
| 18 | 17 | ||
| @@ -20,40 +19,20 @@ const Self = @This(); | |||
| 20 | 19 | ||
| 21 | pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { | 20 | pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { |
| 22 | var self = Self{ | 21 | var self = Self{ |
| 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | ||
| 24 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | 22 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), |
| 25 | .norm_data = norm_data, | 23 | .norm_data = norm_data, |
| 26 | }; | 24 | }; |
| 27 | errdefer self.deinit(); | 25 | errdefer self.deinit(); |
| 28 | 26 | ||
| 29 | // Canonical compositions | ||
| 30 | const decompressor = std.compress.deflate.decompressor; | ||
| 31 | const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate"); | ||
| 32 | var comp_stream = std.io.fixedBufferStream(comp_file); | ||
| 33 | var comp_decomp = try decompressor(allocator, comp_stream.reader(), null); | ||
| 34 | defer comp_decomp.deinit(); | ||
| 35 | |||
| 36 | var comp_buf = std.io.bufferedReader(comp_decomp.reader()); | ||
| 37 | const comp_reader = comp_buf.reader(); | ||
| 38 | var buf: [4096]u8 = undefined; | ||
| 39 | |||
| 40 | while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 41 | if (line.len == 0) continue; | ||
| 42 | var fields = std.mem.split(u8, line, ";"); | ||
| 43 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 44 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 45 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 46 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); | ||
| 47 | } | ||
| 48 | |||
| 49 | // Compatibility decompositions | 27 | // Compatibility decompositions |
| 50 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); | 28 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); |
| 51 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); | 29 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); |
| 52 | var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null); | 30 | var dekomp_decomp = try std.compress.deflate.decompressor(allocator, dekomp_stream.reader(), null); |
| 53 | defer dekomp_decomp.deinit(); | 31 | defer dekomp_decomp.deinit(); |
| 54 | 32 | ||
| 55 | var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); | 33 | var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); |
| 56 | const dekomp_reader = dekomp_buf.reader(); | 34 | const dekomp_reader = dekomp_buf.reader(); |
| 35 | var buf: [4096]u8 = undefined; | ||
| 57 | 36 | ||
| 58 | while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | 37 | while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { |
| 59 | if (line.len == 0) continue; | 38 | if (line.len == 0) continue; |
| @@ -73,7 +52,6 @@ pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { | |||
| 73 | } | 52 | } |
| 74 | 53 | ||
| 75 | pub fn deinit(self: *Self) void { | 54 | pub fn deinit(self: *Self) void { |
| 76 | self.nfc_map.deinit(); | ||
| 77 | self.nfkd_map.deinit(); | 55 | self.nfkd_map.deinit(); |
| 78 | } | 56 | } |
| 79 | 57 | ||
| @@ -510,7 +488,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 510 | 488 | ||
| 511 | if (!processed_hangul) { | 489 | if (!processed_hangul) { |
| 512 | // L -> C not Hangul. | 490 | // L -> C not Hangul. |
| 513 | if (self.nfc_map.get(.{ L, C })) |P| { | 491 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { |
| 514 | if (!norm_props.isFcx(P)) { | 492 | if (!norm_props.isFcx(P)) { |
| 515 | d_list.items[sidx] = P; | 493 | d_list.items[sidx] = P; |
| 516 | d_list.items[i] = tombstone; // Mark for deletion. | 494 | d_list.items[i] = tombstone; // Mark for deletion. |
diff --git a/src/autogen/canonical_compositions.txt.deflate b/src/autogen/canonical_compositions.txt.deflate deleted file mode 100644 index 4ca2593..0000000 --- a/src/autogen/canonical_compositions.txt.deflate +++ /dev/null | |||
| Binary files differ | |||
diff --git a/src/autogen/canonical_decompositions.txt.deflate b/src/autogen/canonical_decompositions.txt.deflate deleted file mode 100644 index 5169e34..0000000 --- a/src/autogen/canonical_decompositions.txt.deflate +++ /dev/null | |||
| Binary files differ | |||
diff --git a/src/main.zig b/src/main.zig index d1a0bb3..05c2ea4 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -52,7 +52,7 @@ pub fn main() !void { | |||
| 52 | // while (iter.next()) |_| result += 1; | 52 | // while (iter.next()) |_| result += 1; |
| 53 | // while (iter.next()) |line| result += strWidth(line, &data); | 53 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 54 | while (iter.next()) |line| { | 54 | while (iter.next()) |line| { |
| 55 | var nfc = try n.nfd(allocator, line); | 55 | var nfc = try n.nfc(allocator, line); |
| 56 | result += nfc.slice.len; | 56 | result += nfc.slice.len; |
| 57 | nfc.deinit(); | 57 | nfc.deinit(); |
| 58 | } | 58 | } |