From 7d8f330db2bfb625a054eb7e21d397ff696c0b3f Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Mon, 26 Feb 2024 18:54:11 -0400 Subject: Using NormData nfc and nfd --- src/Canonical.zig | 13 ++++++++++- src/Normalizer.zig | 28 +++-------------------- src/autogen/canonical_compositions.txt.deflate | Bin 4410 -> 0 bytes src/autogen/canonical_decompositions.txt.deflate | Bin 9541 -> 0 bytes src/main.zig | 2 +- 5 files changed, 16 insertions(+), 27 deletions(-) delete mode 100644 src/autogen/canonical_compositions.txt.deflate delete mode 100644 src/autogen/canonical_decompositions.txt.deflate (limited to 'src') diff --git a/src/Canonical.zig b/src/Canonical.zig index d54e828..81d3eec 100644 --- a/src/Canonical.zig +++ b/src/Canonical.zig @@ -4,6 +4,7 @@ const compress = std.compress; const mem = std.mem; allocator: mem.Allocator, +nfc: std.AutoHashMap([2]u21, u21), nfd: [][2]u21 = undefined, const Self = @This(); @@ -19,6 +20,7 @@ pub fn init(allocator: mem.Allocator) !Self { const endian = builtin.cpu.arch.endian(); var self = Self{ .allocator = allocator, + .nfc = std.AutoHashMap([2]u21, u21).init(allocator), .nfd = try allocator.alloc([2]u21, 0x110000), }; @@ -29,13 +31,17 @@ pub fn init(allocator: mem.Allocator) !Self { if (len == 0) break; const cp = try reader.readInt(u24, endian); self.nfd[cp][0] = @intCast(try reader.readInt(u24, endian)); - if (len == 3) self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian)); + if (len == 3) { + self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian)); + try self.nfc.put(self.nfd[cp], @intCast(cp)); + } } return self; } pub fn deinit(self: *Self) void { + self.nfc.deinit(); self.allocator.free(self.nfd); } @@ -43,3 +49,8 @@ pub fn deinit(self: *Self) void { pub inline fn toNfd(self: Self, cp: u21) [2]u21 { return self.nfd[cp]; } + +// Returns the primary composite for the codepoints in `cps`. 
+pub inline fn toNfc(self: Self, cps: [2]u21) ?u21 { + return self.nfc.get(cps); +} diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 848cf20..2e2e6e4 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig @@ -12,7 +12,6 @@ const norm_props = @import("ziglyph").normalization_props; pub const NormData = @import("NormData"); -nfc_map: std.AutoHashMap([2]u21, u21), nfkd_map: std.AutoHashMap(u21, [18]u21), norm_data: *NormData, @@ -20,40 +19,20 @@ const Self = @This(); pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { var self = Self{ - .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), .norm_data = norm_data, }; errdefer self.deinit(); - // Canonical compositions - const decompressor = std.compress.deflate.decompressor; - const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate"); - var comp_stream = std.io.fixedBufferStream(comp_file); - var comp_decomp = try decompressor(allocator, comp_stream.reader(), null); - defer comp_decomp.deinit(); - - var comp_buf = std.io.bufferedReader(comp_decomp.reader()); - const comp_reader = comp_buf.reader(); - var buf: [4096]u8 = undefined; - - while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { - if (line.len == 0) continue; - var fields = std.mem.split(u8, line, ";"); - const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); - const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); - const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); - try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); - } - // Compatibility decompositions const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); var dekomp_stream = std.io.fixedBufferStream(dekomp_file); - var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null); + var dekomp_decomp = try std.compress.deflate.decompressor(allocator, dekomp_stream.reader(), null); defer dekomp_decomp.deinit(); var dekomp_buf 
= std.io.bufferedReader(dekomp_decomp.reader()); const dekomp_reader = dekomp_buf.reader(); + var buf: [4096]u8 = undefined; while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { if (line.len == 0) continue; @@ -73,7 +52,6 @@ pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { } pub fn deinit(self: *Self) void { - self.nfc_map.deinit(); self.nfkd_map.deinit(); } @@ -510,7 +488,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! if (!processed_hangul) { // L -> C not Hangul. - if (self.nfc_map.get(.{ L, C })) |P| { + if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { if (!norm_props.isFcx(P)) { d_list.items[sidx] = P; d_list.items[i] = tombstone; // Mark for deletion. diff --git a/src/autogen/canonical_compositions.txt.deflate b/src/autogen/canonical_compositions.txt.deflate deleted file mode 100644 index 4ca2593..0000000 Binary files a/src/autogen/canonical_compositions.txt.deflate and /dev/null differ diff --git a/src/autogen/canonical_decompositions.txt.deflate b/src/autogen/canonical_decompositions.txt.deflate deleted file mode 100644 index 5169e34..0000000 Binary files a/src/autogen/canonical_decompositions.txt.deflate and /dev/null differ diff --git a/src/main.zig b/src/main.zig index d1a0bb3..05c2ea4 100644 --- a/src/main.zig +++ b/src/main.zig @@ -52,7 +52,7 @@ pub fn main() !void { // while (iter.next()) |_| result += 1; // while (iter.next()) |line| result += strWidth(line, &data); while (iter.next()) |line| { - var nfc = try n.nfd(allocator, line); + var nfc = try n.nfc(allocator, line); result += nfc.slice.len; nfc.deinit(); } -- cgit v1.2.3