diff options
| author | 2024-02-27 18:04:32 -0400 | |
|---|---|---|
| committer | 2024-02-27 18:04:32 -0400 | |
| commit | ecd9c2277de17e24fa26aefee955caa10b5b990c (patch) | |
| tree | b76f3e2b9886456f61b0001f4dcae15d00a76856 | |
| parent | Normalizer 2x faster than Ziglyph; Uses 2x memory (diff) | |
| download | zg-ecd9c2277de17e24fa26aefee955caa10b5b990c.tar.gz zg-ecd9c2277de17e24fa26aefee955caa10b5b990c.tar.xz zg-ecd9c2277de17e24fa26aefee955caa10b5b990c.zip | |
General Category with GenCatData
| -rw-r--r-- | build.zig | 22 | ||||
| -rw-r--r-- | codegen/gencat.zig | 172 | ||||
| -rw-r--r-- | src/GenCatData.zig | 83 | ||||
| -rw-r--r-- | src/HangulData.zig | 8 | ||||
| -rw-r--r-- | src/main.zig | 33 |
5 files changed, 300 insertions, 18 deletions
| @@ -79,6 +79,15 @@ pub fn build(b: *std.Build) void { | |||
| 79 | const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); | 79 | const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); |
| 80 | const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); | 80 | const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); |
| 81 | 81 | ||
| 82 | const gencat_gen_exe = b.addExecutable(.{ | ||
| 83 | .name = "gencat", | ||
| 84 | .root_source_file = .{ .path = "codegen/gencat.zig" }, | ||
| 85 | .target = b.host, | ||
| 86 | .optimize = .Debug, | ||
| 87 | }); | ||
| 88 | const run_gencat_gen_exe = b.addRunArtifact(gencat_gen_exe); | ||
| 89 | const gencat_gen_out = run_gencat_gen_exe.addOutputFileArg("gencat.bin.z"); | ||
| 90 | |||
| 82 | // Modules we provide | 91 | // Modules we provide |
| 83 | // Code points | 92 | // Code points |
| 84 | const code_point = b.addModule("code_point", .{ | 93 | const code_point = b.addModule("code_point", .{ |
| @@ -185,6 +194,14 @@ pub fn build(b: *std.Build) void { | |||
| 185 | norm.addImport("ziglyph", ziglyph.module("ziglyph")); | 194 | norm.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 186 | norm.addImport("NormData", norm_data); | 195 | norm.addImport("NormData", norm_data); |
| 187 | 196 | ||
| 197 | // General Category | ||
| 198 | const gencat_data = b.createModule(.{ | ||
| 199 | .root_source_file = .{ .path = "src/GenCatData.zig" }, | ||
| 200 | .target = target, | ||
| 201 | .optimize = optimize, | ||
| 202 | }); | ||
| 203 | gencat_data.addAnonymousImport("gencat", .{ .root_source_file = gencat_gen_out }); | ||
| 204 | |||
| 188 | // Benchmark rig | 205 | // Benchmark rig |
| 189 | const exe = b.addExecutable(.{ | 206 | const exe = b.addExecutable(.{ |
| 190 | .name = "zg", | 207 | .name = "zg", |
| @@ -194,10 +211,11 @@ pub fn build(b: *std.Build) void { | |||
| 194 | }); | 211 | }); |
| 195 | // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); | 212 | // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 196 | // exe.root_module.addImport("ascii", ascii); | 213 | // exe.root_module.addImport("ascii", ascii); |
| 197 | // exe.root_module.addImport("code_point", code_point); | 214 | exe.root_module.addImport("code_point", code_point); |
| 198 | // exe.root_module.addImport("grapheme", grapheme); | 215 | // exe.root_module.addImport("grapheme", grapheme); |
| 199 | // exe.root_module.addImport("DisplayWidth", display_width); | 216 | // exe.root_module.addImport("DisplayWidth", display_width); |
| 200 | exe.root_module.addImport("Normalizer", norm); | 217 | // exe.root_module.addImport("Normalizer", norm); |
| 218 | exe.root_module.addImport("GenCatData", gencat_data); | ||
| 201 | b.installArtifact(exe); | 219 | b.installArtifact(exe); |
| 202 | 220 | ||
| 203 | const run_cmd = b.addRunArtifact(exe); | 221 | const run_cmd = b.addRunArtifact(exe); |
diff --git a/codegen/gencat.zig b/codegen/gencat.zig new file mode 100644 index 0000000..5407040 --- /dev/null +++ b/codegen/gencat.zig | |||
| @@ -0,0 +1,172 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | |||
/// Unicode General Category values.
///
/// The declaration order determines the integer produced by `@intFromEnum`,
/// so tags must not be reordered.
const Gc = enum(u5) {
    // Other
    Cc, // Control
    Cf, // Format
    Cn, // Unassigned
    Co, // Private use
    Cs, // Surrogate

    // Letter
    Ll, // Lowercase letter
    Lm, // Modifier letter
    Lo, // Other letter
    Lt, // Titlecase letter
    Lu, // Uppercase letter

    // Mark
    Mc, // Spacing mark
    Me, // Enclosing mark
    Mn, // Nonspacing mark

    // Number
    Nd, // Decimal number
    Nl, // Letter number
    No, // Other number

    // Punctuation
    Pc, // Connector punctuation
    Pd, // Dash punctuation
    Pe, // Close punctuation
    Pf, // Final punctuation
    Pi, // Initial punctuation
    Po, // Other punctuation
    Ps, // Open punctuation

    // Symbol
    Sc, // Currency symbol
    Sk, // Modifier symbol
    Sm, // Math symbol
    So, // Other symbol

    // Separator
    Zl, // Line separator
    Zp, // Paragraph separator
    Zs, // Space separator
};
| 36 | |||
/// Number of consecutive code points grouped into one block.
const block_size = 256;

/// One block of per-code-point values; whole blocks are used as map keys so
/// that identical blocks can be detected and stored only once.
const Block = [block_size]u5;

/// Hashing and equality so that a `Block` can be used as a hash-map key.
const BlockContext = struct {
    /// Two blocks are equal when every element matches.
    pub fn eql(_: BlockContext, lhs: Block, rhs: Block) bool {
        return std.mem.eql(u5, lhs[0..], rhs[0..]);
    }

    /// Hashes the full contents of the block with Wyhash (seed 0).
    pub fn hash(_: BlockContext, key: Block) u64 {
        var state = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&state, key, .DeepRecursive);
        return state.final();
    }
};

/// Maps a block's contents to a `u16` value associated with that block.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
| 56 | |||
/// Generates the compressed three-stage General Category lookup tables.
///
/// Reads `data/unicode/extracted/DerivedGeneralCategory.txt` relative to the
/// current working directory, builds the stage tables, and writes them
/// deflate-compressed to the path given as the first command-line argument.
///
/// Output layout (all integers in the native byte order of the machine that
/// runs this program):
///   u16 stage-1 length, then that many u16 entries;
///   u16 stage-2 length, then that many u8 entries;
///   u8  stage-3 length, then that many u8 entries.
///
/// Fails with `error.UnknownGenCat` for an unrecognized category name,
/// `error.MissingCodePoint` when the data file does not cover every code
/// point, and `error.TableTooLarge` when a table length or offset does not
/// fit the integer width used in the output format.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var flat_map = std.AutoHashMap(u21, u5).init(allocator);
    defer flat_map.deinit();

    var line_buf: [4096]u8 = undefined;

    // Process DerivedGeneralCategory.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedGeneralCategory.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0 or line[0] == '#') continue;

        // Strip any trailing `# ...` comment before splitting into fields.
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
        var current_code: [2]u21 = undefined;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    // Code point(s): either `XXXX` or an inclusive `XXXX..YYYY` range.
                    if (std.mem.indexOf(u8, field, "..")) |dots| {
                        current_code = .{
                            try std.fmt.parseInt(u21, field[0..dots], 16),
                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
                        };
                    } else {
                        const code = try std.fmt.parseInt(u21, field, 16);
                        current_code = .{ code, code };
                    }
                },
                1 => {
                    // General category
                    const gc = std.meta.stringToEnum(Gc, field) orelse return error.UnknownGenCat;
                    for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), @intFromEnum(gc));
                },
                else => {},
            }
        }
    }

    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    var stage1 = std.ArrayList(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.ArrayList(u5).init(allocator);
    defer stage2.deinit();

    var stage3 = std.ArrayList(u5).init(allocator);
    defer stage3.deinit();

    var block: Block = [_]u5{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        // Report incomplete input data as an error instead of crashing on a null unwrap.
        const gc = flat_map.get(cp) orelse return error.MissingCodePoint;

        // Stage 3 holds each distinct category value once; blocks store its index.
        const stage3_idx = blk: {
            for (stage3.items, 0..) |gci, j| {
                if (gc == gci) break :blk j;
            }
            try stage3.append(gc);
            break :blk stage3.items.len - 1;
        };

        // Process block
        block[block_len] = @intCast(stage3_idx);
        block_len += 1;

        if (block_len < block_size and cp != 0x10ffff) continue;

        // A block is complete: store it in stage 2 only if it has not been seen
        // before, and record its stage-2 offset in stage 1.
        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            gop.value_ptr.* = std.math.cast(u16, stage2.items.len) orelse return error.TableTooLarge;
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    // Validate the lengths before creating the output file so a failure does
    // not leave a partially written file behind.
    const stage1_len = std.math.cast(u16, stage1.items.len) orelse return error.TableTooLarge;
    const stage2_len = std.math.cast(u16, stage2.items.len) orelse return error.TableTooLarge;
    const stage3_len = std.math.cast(u8, stage3.items.len) orelse return error.TableTooLarge;

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    const endian = builtin.cpu.arch.endian();
    try writer.writeInt(u16, stage1_len, endian);
    for (stage1.items) |i| try writer.writeInt(u16, i, endian);

    try writer.writeInt(u16, stage2_len, endian);
    for (stage2.items) |i| try writer.writeInt(u8, i, endian);

    try writer.writeInt(u8, stage3_len, endian);
    for (stage3.items) |i| try writer.writeInt(u8, i, endian);

    try out_comp.flush();
}
diff --git a/src/GenCatData.zig b/src/GenCatData.zig new file mode 100644 index 0000000..5496e4e --- /dev/null +++ b/src/GenCatData.zig | |||
| @@ -0,0 +1,83 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
/// General Category
///
/// Tags are converted from stored integers with `@enumFromInt`, so the
/// declaration order must match the order used when the data was generated.
pub const Gc = enum {
    Cc, // Control
    Cf, // Format
    Cn, // Unassigned
    Co, // Private use
    Cs, // Surrogate
    Ll, // Lowercase letter
    Lm, // Modifier letter
    Lo, // Other letter
    Lt, // Titlecase letter
    Lu, // Uppercase letter
    Mc, // Spacing mark
    Me, // Enclosing mark
    Mn, // Nonspacing mark
    Nd, // Decimal number
    Nl, // Letter number
    No, // Other number
    Pc, // Connector punctuation
    Pd, // Dash punctuation
    Pe, // Close punctuation
    Pf, // Final punctuation
    Pi, // Initial punctuation
    Po, // Other punctuation
    Ps, // Open punctuation
    Sc, // Currency symbol
    Sk, // Modifier symbol
    Sm, // Math symbol
    So, // Other symbol
    Zl, // Line separator
    Zp, // Paragraph separator
    Zs, // Space separator
};

/// Allocator that owns `s1`, `s2` and `s3`.
allocator: mem.Allocator,
/// Stage 1: indexed by `cp >> 8`; each entry is a base offset into `s2`.
s1: []u16 = undefined,
/// Stage 2: indexed by `s1[cp >> 8] + (cp & 0xff)`; each entry is an index into `s3`.
s2: []u5 = undefined,
/// Stage 3: integer values of `Gc` tags.
s3: []u5 = undefined,

const Self = @This();

/// Decompresses the embedded `gencat` data and loads the three lookup stages.
///
/// The caller owns the returned value and must call `deinit` on it.
/// On failure, every table allocated so far is freed before the error is
/// returned.
///
/// NOTE(review): integers are read in the target's native byte order, so the
/// embedded data must have been written with the same byte order — confirm
/// for cross-endian builds.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("gencat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    // Release stage 1 if any later read or allocation fails.
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u5, s2_len);
    errdefer allocator.free(self.s2);
    for (self.s2) |*entry| entry.* = @intCast(try reader.readInt(u8, endian));

    const s3_len: u16 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u5, s3_len);
    errdefer allocator.free(self.s3);
    for (self.s3) |*entry| entry.* = @intCast(try reader.readInt(u8, endian));

    return self;
}

/// Frees the three stage tables using the allocator passed to `init`.
pub fn deinit(self: *Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
    self.allocator.free(self.s3);
}

/// Lookup the General Category for `cp`.
pub inline fn gc(self: Self, cp: u21) Gc {
    return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]);
}
diff --git a/src/HangulData.zig b/src/HangulData.zig index 4d80c99..b97424c 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig | |||
| @@ -15,7 +15,7 @@ pub const Syllable = enum { | |||
| 15 | 15 | ||
| 16 | allocator: mem.Allocator, | 16 | allocator: mem.Allocator, |
| 17 | s1: []u16 = undefined, | 17 | s1: []u16 = undefined, |
| 18 | s2: []Syllable = undefined, | 18 | s2: []u3 = undefined, |
| 19 | 19 | ||
| 20 | const Self = @This(); | 20 | const Self = @This(); |
| 21 | 21 | ||
| @@ -35,8 +35,8 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 35 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | 35 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); |
| 36 | 36 | ||
| 37 | const stage_2_len: u16 = try reader.readInt(u16, endian); | 37 | const stage_2_len: u16 = try reader.readInt(u16, endian); |
| 38 | self.s2 = try allocator.alloc(Syllable, stage_2_len); | 38 | self.s2 = try allocator.alloc(u3, stage_2_len); |
| 39 | for (0..stage_2_len) |i| self.s2[i] = @enumFromInt(try reader.readInt(u8, endian)); | 39 | for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); |
| 40 | 40 | ||
| 41 | return self; | 41 | return self; |
| 42 | } | 42 | } |
| @@ -48,5 +48,5 @@ pub fn deinit(self: *Self) void { | |||
| 48 | 48 | ||
| 49 | /// Returns the Hangul syllable type for `cp`. | 49 | /// Returns the Hangul syllable type for `cp`. |
| 50 | pub inline fn syllable(self: Self, cp: u21) Syllable { | 50 | pub inline fn syllable(self: Self, cp: u21) Syllable { |
| 51 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; | 51 | return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]); |
| 52 | } | 52 | } |
diff --git a/src/main.zig b/src/main.zig index 0f1aab5..c521c4f 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -11,14 +11,16 @@ const std = @import("std"); | |||
| 11 | // const strWidth = @import("display_width").strWidth; | 11 | // const strWidth = @import("display_width").strWidth; |
| 12 | 12 | ||
| 13 | // const CodePointIterator = @import("ziglyph").CodePointIterator; | 13 | // const CodePointIterator = @import("ziglyph").CodePointIterator; |
| 14 | // const CodePointIterator = @import("code_point").Iterator; | 14 | const CodePointIterator = @import("code_point").Iterator; |
| 15 | 15 | ||
| 16 | // const ascii = @import("ascii"); | 16 | // const ascii = @import("ascii"); |
| 17 | // const ascii = std.ascii; | 17 | // const ascii = std.ascii; |
| 18 | 18 | ||
| 19 | // const Normalizer = @import("ziglyph").Normalizer; | 19 | // const Normalizer = @import("ziglyph").Normalizer; |
| 20 | const NormData = @import("Normalizer").NormData; | 20 | // const NormData = @import("Normalizer").NormData; |
| 21 | const Normalizer = @import("Normalizer"); | 21 | // const Normalizer = @import("Normalizer"); |
| 22 | |||
| 23 | const GenCatData = @import("GenCatData"); | ||
| 22 | 24 | ||
| 23 | pub fn main() !void { | 25 | pub fn main() !void { |
| 24 | var args_iter = std.process.args(); | 26 | var args_iter = std.process.args(); |
| @@ -32,16 +34,19 @@ pub fn main() !void { | |||
| 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); | 34 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); |
| 33 | defer allocator.free(input); | 35 | defer allocator.free(input); |
| 34 | 36 | ||
| 35 | var data = try NormData.init(allocator); | 37 | // var data = try NormData.init(allocator); |
| 36 | defer data.deinit(); | 38 | // defer data.deinit(); |
| 37 | var n = Normalizer{ .norm_data = &data }; | 39 | // var n = Normalizer{ .norm_data = &data }; |
| 38 | // var n = try Normalizer.init(allocator); | 40 | // var n = try Normalizer.init(allocator); |
| 39 | // defer n.deinit(); | 41 | // defer n.deinit(); |
| 40 | 42 | ||
| 43 | var gencat_data = try GenCatData.init(allocator); | ||
| 44 | defer gencat_data.deinit(); | ||
| 45 | |||
| 41 | // var iter = GraphemeIterator.init(input, &data); | 46 | // var iter = GraphemeIterator.init(input, &data); |
| 42 | // defer iter.deinit(); | 47 | // defer iter.deinit(); |
| 43 | // var iter = CodePointIterator{ .bytes = input }; | 48 | var iter = CodePointIterator{ .bytes = input }; |
| 44 | var iter = std.mem.splitScalar(u8, input, '\n'); | 49 | // var iter = std.mem.splitScalar(u8, input, '\n'); |
| 45 | 50 | ||
| 46 | var result: usize = 0; | 51 | var result: usize = 0; |
| 47 | // var result: isize = 0; | 52 | // var result: isize = 0; |
| @@ -50,10 +55,14 @@ pub fn main() !void { | |||
| 50 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); | 55 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); |
| 51 | // while (iter.next()) |_| result += 1; | 56 | // while (iter.next()) |_| result += 1; |
| 52 | // while (iter.next()) |line| result += strWidth(line, &data); | 57 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 53 | while (iter.next()) |line| { | 58 | // while (iter.next()) |line| { |
| 54 | const nfc = try n.nfc(allocator, line); | 59 | // const nfc = try n.nfc(allocator, line); |
| 55 | result += nfc.slice.len; | 60 | // result += nfc.slice.len; |
| 56 | // nfc.deinit(); | 61 | // // nfc.deinit(); |
| 62 | // } | ||
| 63 | while (iter.next()) |cp| { | ||
| 64 | if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) }); | ||
| 65 | result += 1; | ||
| 57 | } | 66 | } |
| 58 | 67 | ||
| 59 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); | 68 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); |