diff options
| author | 2024-02-27 11:16:41 -0400 | |
|---|---|---|
| committer | 2024-02-27 11:16:41 -0400 | |
| commit | f6418d582fc2294983bfa647c7148a263af13db5 (patch) | |
| tree | 8f05af609928553bd91c61c31580d78a0de68736 | |
| parent | Removed equality functions from Normalizer (diff) | |
| download | zg-f6418d582fc2294983bfa647c7148a263af13db5.tar.gz zg-f6418d582fc2294983bfa647c7148a263af13db5.tar.xz zg-f6418d582fc2294983bfa647c7148a263af13db5.zip | |
Using NormPropsData in NormData; No Ziglyph deps in Normalizer
| -rw-r--r-- | build.zig | 19 | ||||
| -rw-r--r-- | codegen/normp.zig | 135 | ||||
| -rw-r--r-- | src/NormData.zig | 4 | ||||
| -rw-r--r-- | src/NormPropsData.zig | 53 | ||||
| -rw-r--r-- | src/Normalizer.zig | 9 | ||||
| -rw-r--r-- | src/main.zig | 10 |
6 files changed, 220 insertions, 10 deletions
| @@ -61,6 +61,15 @@ pub fn build(b: *std.Build) void { | |||
| 61 | const run_hangul_gen_exe = b.addRunArtifact(hangul_gen_exe); | 61 | const run_hangul_gen_exe = b.addRunArtifact(hangul_gen_exe); |
| 62 | const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.bin.z"); | 62 | const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.bin.z"); |
| 63 | 63 | ||
| 64 | const normp_gen_exe = b.addExecutable(.{ | ||
| 65 | .name = "normp", | ||
| 66 | .root_source_file = .{ .path = "codegen/normp.zig" }, | ||
| 67 | .target = b.host, | ||
| 68 | .optimize = .Debug, | ||
| 69 | }); | ||
| 70 | const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); | ||
| 71 | const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.bin.z"); | ||
| 72 | |||
| 64 | const ccc_gen_exe = b.addExecutable(.{ | 73 | const ccc_gen_exe = b.addExecutable(.{ |
| 65 | .name = "ccc", | 74 | .name = "ccc", |
| 66 | .root_source_file = .{ .path = "codegen/ccc.zig" }, | 75 | .root_source_file = .{ .path = "codegen/ccc.zig" }, |
| @@ -149,6 +158,13 @@ pub fn build(b: *std.Build) void { | |||
| 149 | }); | 158 | }); |
| 150 | hangul_data.addAnonymousImport("hangul", .{ .root_source_file = hangul_gen_out }); | 159 | hangul_data.addAnonymousImport("hangul", .{ .root_source_file = hangul_gen_out }); |
| 151 | 160 | ||
| 161 | const normp_data = b.createModule(.{ | ||
| 162 | .root_source_file = .{ .path = "src/NormPropsData.zig" }, | ||
| 163 | .target = target, | ||
| 164 | .optimize = optimize, | ||
| 165 | }); | ||
| 166 | normp_data.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); | ||
| 167 | |||
| 152 | const norm_data = b.createModule(.{ | 168 | const norm_data = b.createModule(.{ |
| 153 | .root_source_file = .{ .path = "src/NormData.zig" }, | 169 | .root_source_file = .{ .path = "src/NormData.zig" }, |
| 154 | .target = target, | 170 | .target = target, |
| @@ -158,6 +174,7 @@ pub fn build(b: *std.Build) void { | |||
| 158 | norm_data.addImport("CombiningData", ccc_data); | 174 | norm_data.addImport("CombiningData", ccc_data); |
| 159 | norm_data.addImport("CompatData", compat_data); | 175 | norm_data.addImport("CompatData", compat_data); |
| 160 | norm_data.addImport("HangulData", hangul_data); | 176 | norm_data.addImport("HangulData", hangul_data); |
| 177 | norm_data.addImport("NormPropsData", normp_data); | ||
| 161 | 178 | ||
| 162 | const norm = b.addModule("Normalizer", .{ | 179 | const norm = b.addModule("Normalizer", .{ |
| 163 | .root_source_file = .{ .path = "src/Normalizer.zig" }, | 180 | .root_source_file = .{ .path = "src/Normalizer.zig" }, |
| @@ -200,7 +217,7 @@ pub fn build(b: *std.Build) void { | |||
| 200 | exe_unit_tests.root_module.addImport("code_point", code_point); | 217 | exe_unit_tests.root_module.addImport("code_point", code_point); |
| 201 | // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); | 218 | // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); |
| 202 | // exe_unit_tests.root_module.addImport("grapheme", grapheme); | 219 | // exe_unit_tests.root_module.addImport("grapheme", grapheme); |
| 203 | exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); | 220 | // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 204 | // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); | 221 | // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); |
| 205 | // exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); | 222 | // exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); |
| 206 | exe_unit_tests.root_module.addImport("NormData", norm_data); | 223 | exe_unit_tests.root_module.addImport("NormData", norm_data); |
diff --git a/codegen/normp.zig b/codegen/normp.zig new file mode 100644 index 0000000..a332e73 --- /dev/null +++ b/codegen/normp.zig | |||
| @@ -0,0 +1,135 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | |||
| 4 | const block_size = 256; | ||
| 5 | const Block = [block_size]u3; | ||
| 6 | |||
| 7 | const BlockMap = std.HashMap( | ||
| 8 | Block, | ||
| 9 | u16, | ||
| 10 | struct { | ||
| 11 | pub fn hash(_: @This(), k: Block) u64 { | ||
| 12 | var hasher = std.hash.Wyhash.init(0); | ||
| 13 | std.hash.autoHashStrat(&hasher, k, .DeepRecursive); | ||
| 14 | return hasher.final(); | ||
| 15 | } | ||
| 16 | |||
| 17 | pub fn eql(_: @This(), a: Block, b: Block) bool { | ||
| 18 | return std.mem.eql(u3, &a, &b); | ||
| 19 | } | ||
| 20 | }, | ||
| 21 | std.hash_map.default_max_load_percentage, | ||
| 22 | ); | ||
| 23 | |||
| 24 | pub fn main() !void { | ||
| 25 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | ||
| 26 | defer arena.deinit(); | ||
| 27 | const allocator = arena.allocator(); | ||
| 28 | |||
| 29 | var flat_map = std.AutoHashMap(u21, u3).init(allocator); | ||
| 30 | defer flat_map.deinit(); | ||
| 31 | |||
| 32 | var line_buf: [4096]u8 = undefined; | ||
| 33 | |||
| 34 | // Process DerivedNormalizationProps.txt | ||
| 35 | var in_file = try std.fs.cwd().openFile("data/unicode/DerivedNormalizationProps.txt", .{}); | ||
| 36 | defer in_file.close(); | ||
| 37 | var in_buf = std.io.bufferedReader(in_file.reader()); | ||
| 38 | const in_reader = in_buf.reader(); | ||
| 39 | |||
| 40 | while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { | ||
| 41 | if (line.len == 0 or line[0] == '#') continue; | ||
| 42 | |||
| 43 | const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; | ||
| 44 | |||
| 45 | var field_iter = std.mem.tokenizeAny(u8, no_comment, "; "); | ||
| 46 | var current_code: [2]u21 = undefined; | ||
| 47 | |||
| 48 | var i: usize = 0; | ||
| 49 | while (field_iter.next()) |field| : (i += 1) { | ||
| 50 | switch (i) { | ||
| 51 | 0 => { | ||
| 52 | // Code point(s) | ||
| 53 | if (std.mem.indexOf(u8, field, "..")) |dots| { | ||
| 54 | current_code = .{ | ||
| 55 | try std.fmt.parseInt(u21, field[0..dots], 16), | ||
| 56 | try std.fmt.parseInt(u21, field[dots + 2 ..], 16), | ||
| 57 | }; | ||
| 58 | } else { | ||
| 59 | const code = try std.fmt.parseInt(u21, field, 16); | ||
| 60 | current_code = .{ code, code }; | ||
| 61 | } | ||
| 62 | }, | ||
| 63 | 1 => { | ||
| 64 | // Norm props | ||
| 65 | for (current_code[0]..current_code[1] + 1) |cp| { | ||
| 66 | const gop = try flat_map.getOrPut(@intCast(cp)); | ||
| 67 | if (!gop.found_existing) gop.value_ptr.* = 0; | ||
| 68 | |||
| 69 | if (std.mem.eql(u8, field, "NFD_QC")) { | ||
| 70 | gop.value_ptr.* |= 1; | ||
| 71 | } else if (std.mem.eql(u8, field, "NFKD_QC")) { | ||
| 72 | gop.value_ptr.* |= 2; | ||
| 73 | } else if (std.mem.eql(u8, field, "Full_Composition_Exclusion")) { | ||
| 74 | gop.value_ptr.* |= 4; | ||
| 75 | } | ||
| 76 | } | ||
| 77 | }, | ||
| 78 | else => {}, | ||
| 79 | } | ||
| 80 | } | ||
| 81 | } | ||
| 82 | |||
| 83 | var blocks_map = BlockMap.init(allocator); | ||
| 84 | defer blocks_map.deinit(); | ||
| 85 | |||
| 86 | var stage1 = std.ArrayList(u16).init(allocator); | ||
| 87 | defer stage1.deinit(); | ||
| 88 | |||
| 89 | var stage2 = std.ArrayList(u3).init(allocator); | ||
| 90 | defer stage2.deinit(); | ||
| 91 | |||
| 92 | var block: Block = [_]u3{0} ** block_size; | ||
| 93 | var block_len: u16 = 0; | ||
| 94 | |||
| 95 | for (0..0x110000) |i| { | ||
| 96 | const cp: u21 = @intCast(i); | ||
| 97 | const props = flat_map.get(cp) orelse 0; | ||
| 98 | |||
| 99 | // Process block | ||
| 100 | block[block_len] = props; | ||
| 101 | block_len += 1; | ||
| 102 | |||
| 103 | if (block_len < block_size and cp != 0x10ffff) continue; | ||
| 104 | |||
| 105 | const gop = try blocks_map.getOrPut(block); | ||
| 106 | if (!gop.found_existing) { | ||
| 107 | gop.value_ptr.* = @intCast(stage2.items.len); | ||
| 108 | try stage2.appendSlice(&block); | ||
| 109 | } | ||
| 110 | |||
| 111 | try stage1.append(gop.value_ptr.*); | ||
| 112 | block_len = 0; | ||
| 113 | } | ||
| 114 | |||
| 115 | var args_iter = try std.process.argsWithAllocator(allocator); | ||
| 116 | defer args_iter.deinit(); | ||
| 117 | _ = args_iter.skip(); | ||
| 118 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | ||
| 119 | |||
| 120 | const compressor = std.compress.deflate.compressor; | ||
| 121 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | ||
| 122 | defer out_file.close(); | ||
| 123 | var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); | ||
| 124 | defer out_comp.deinit(); | ||
| 125 | const writer = out_comp.writer(); | ||
| 126 | |||
| 127 | const endian = builtin.cpu.arch.endian(); | ||
| 128 | try writer.writeInt(u16, @intCast(stage1.items.len), endian); | ||
| 129 | for (stage1.items) |i| try writer.writeInt(u16, i, endian); | ||
| 130 | |||
| 131 | try writer.writeInt(u16, @intCast(stage2.items.len), endian); | ||
| 132 | for (stage2.items) |i| try writer.writeInt(u8, i, endian); | ||
| 133 | |||
| 134 | try out_comp.flush(); | ||
| 135 | } | ||
diff --git a/src/NormData.zig b/src/NormData.zig index 8923382..7c2a09b 100644 --- a/src/NormData.zig +++ b/src/NormData.zig | |||
| @@ -5,11 +5,13 @@ const CanonData = @import("CanonData"); | |||
| 5 | const CccData = @import("CombiningData"); | 5 | const CccData = @import("CombiningData"); |
| 6 | const CompatData = @import("CompatData"); | 6 | const CompatData = @import("CompatData"); |
| 7 | const HangulData = @import("HangulData"); | 7 | const HangulData = @import("HangulData"); |
| 8 | const NormPropsData = @import("NormPropsData"); | ||
| 8 | 9 | ||
| 9 | canon_data: CanonData, | 10 | canon_data: CanonData, |
| 10 | ccc_data: CccData, | 11 | ccc_data: CccData, |
| 11 | compat_data: CompatData, | 12 | compat_data: CompatData, |
| 12 | hangul_data: HangulData, | 13 | hangul_data: HangulData, |
| 14 | normp_data: NormPropsData, | ||
| 13 | 15 | ||
| 14 | const Self = @This(); | 16 | const Self = @This(); |
| 15 | 17 | ||
| @@ -19,6 +21,7 @@ pub fn init(allocator: std.mem.Allocator) !Self { | |||
| 19 | .ccc_data = try CccData.init(allocator), | 21 | .ccc_data = try CccData.init(allocator), |
| 20 | .compat_data = try CompatData.init(allocator), | 22 | .compat_data = try CompatData.init(allocator), |
| 21 | .hangul_data = try HangulData.init(allocator), | 23 | .hangul_data = try HangulData.init(allocator), |
| 24 | .normp_data = try NormPropsData.init(allocator), | ||
| 22 | }; | 25 | }; |
| 23 | } | 26 | } |
| 24 | 27 | ||
| @@ -27,4 +30,5 @@ pub fn deinit(self: *Self) void { | |||
| 27 | self.ccc_data.deinit(); | 30 | self.ccc_data.deinit(); |
| 28 | self.compat_data.deinit(); | 31 | self.compat_data.deinit(); |
| 29 | self.hangul_data.deinit(); | 32 | self.hangul_data.deinit(); |
| 33 | self.normp_data.deinit(); | ||
| 30 | } | 34 | } |
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig new file mode 100644 index 0000000..3c49712 --- /dev/null +++ b/src/NormPropsData.zig | |||
| @@ -0,0 +1,53 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | const testing = std.testing; | ||
| 6 | |||
| 7 | allocator: mem.Allocator, | ||
| 8 | s1: []u16 = undefined, | ||
| 9 | s2: []u4 = undefined, | ||
| 10 | |||
| 11 | const Self = @This(); | ||
| 12 | |||
| 13 | pub fn init(allocator: mem.Allocator) !Self { | ||
| 14 | const decompressor = compress.deflate.decompressor; | ||
| 15 | const in_bytes = @embedFile("normp"); | ||
| 16 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 17 | var in_decomp = try decompressor(allocator, in_fbs.reader(), null); | ||
| 18 | defer in_decomp.deinit(); | ||
| 19 | var reader = in_decomp.reader(); | ||
| 20 | |||
| 21 | const endian = builtin.cpu.arch.endian(); | ||
| 22 | var self = Self{ .allocator = allocator }; | ||
| 23 | |||
| 24 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 25 | self.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 26 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | ||
| 27 | |||
| 28 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 29 | self.s2 = try allocator.alloc(u4, stage_2_len); | ||
| 30 | for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 31 | |||
| 32 | return self; | ||
| 33 | } | ||
| 34 | |||
| 35 | pub fn deinit(self: *Self) void { | ||
| 36 | self.allocator.free(self.s1); | ||
| 37 | self.allocator.free(self.s2); | ||
| 38 | } | ||
| 39 | |||
| 40 | /// Returns true if `cp` is already in NFD form. | ||
| 41 | pub inline fn isNfd(self: Self, cp: u21) bool { | ||
| 42 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; | ||
| 43 | } | ||
| 44 | |||
| 45 | /// Returns true if `cp` is already in NFKD form. | ||
| 46 | pub inline fn isNfkd(self: Self, cp: u21) bool { | ||
| 47 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; | ||
| 48 | } | ||
| 49 | |||
| 50 | /// Returns true if `cp` is not allowed in any normalized form. | ||
| 51 | pub inline fn isFcx(self: Self, cp: u21) bool { | ||
| 52 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | ||
| 53 | } | ||
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index d1d7cee..26177ac 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -6,8 +6,6 @@ const std = @import("std"); | |||
| 6 | const testing = std.testing; | 6 | const testing = std.testing; |
| 7 | 7 | ||
| 8 | const CodePointIterator = @import("code_point").Iterator; | 8 | const CodePointIterator = @import("code_point").Iterator; |
| 9 | const norm_props = @import("ziglyph").normalization_props; | ||
| 10 | |||
| 11 | pub const NormData = @import("NormData"); | 9 | pub const NormData = @import("NormData"); |
| 12 | 10 | ||
| 13 | norm_data: *NormData, | 11 | norm_data: *NormData, |
| @@ -109,7 +107,10 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 109 | var dc = Decomp{ .form = form }; | 107 | var dc = Decomp{ .form = form }; |
| 110 | 108 | ||
| 111 | // ASCII or NFD / NFKD quick checks. | 109 | // ASCII or NFD / NFKD quick checks. |
| 112 | if (cp <= 127 or (form == .nfd and norm_props.isNfd(cp)) or (form == .nfkd and norm_props.isNfkd(cp))) { | 110 | if (cp <= 127 or |
| 111 | (form == .nfd and self.norm_data.normp_data.isNfd(cp)) or | ||
| 112 | (form == .nfkd and self.norm_data.normp_data.isNfkd(cp))) | ||
| 113 | { | ||
| 113 | dc.cps[0] = cp; | 114 | dc.cps[0] = cp; |
| 114 | return dc; | 115 | return dc; |
| 115 | } | 116 | } |
| @@ -436,7 +437,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 436 | if (!processed_hangul) { | 437 | if (!processed_hangul) { |
| 437 | // L -> C not Hangul. | 438 | // L -> C not Hangul. |
| 438 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { | 439 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { |
| 439 | if (!norm_props.isFcx(P)) { | 440 | if (!self.norm_data.normp_data.isFcx(P)) { |
| 440 | d_list.items[sidx] = P; | 441 | d_list.items[sidx] = P; |
| 441 | d_list.items[i] = tombstone; // Mark for deletion. | 442 | d_list.items[i] = tombstone; // Mark for deletion. |
| 442 | deleted += 1; | 443 | deleted += 1; |
diff --git a/src/main.zig b/src/main.zig index 2c2cf8c..15dca16 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -25,9 +25,9 @@ pub fn main() !void { | |||
| 25 | _ = args_iter.skip(); | 25 | _ = args_iter.skip(); |
| 26 | const in_path = args_iter.next() orelse return error.MissingArg; | 26 | const in_path = args_iter.next() orelse return error.MissingArg; |
| 27 | 27 | ||
| 28 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; | 28 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); |
| 29 | defer _ = gpa.deinit(); | 29 | defer arena.deinit(); |
| 30 | const allocator = gpa.allocator(); | 30 | const allocator = arena.allocator(); |
| 31 | 31 | ||
| 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); | 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); |
| 33 | defer allocator.free(input); | 33 | defer allocator.free(input); |
| @@ -51,9 +51,9 @@ pub fn main() !void { | |||
| 51 | // while (iter.next()) |_| result += 1; | 51 | // while (iter.next()) |_| result += 1; |
| 52 | // while (iter.next()) |line| result += strWidth(line, &data); | 52 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 53 | while (iter.next()) |line| { | 53 | while (iter.next()) |line| { |
| 54 | var nfc = try n.nfc(allocator, line); | 54 | const nfc = try n.nfc(allocator, line); |
| 55 | result += nfc.slice.len; | 55 | result += nfc.slice.len; |
| 56 | nfc.deinit(); | 56 | // nfc.deinit(); |
| 57 | } | 57 | } |
| 58 | 58 | ||
| 59 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); | 59 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); |