summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build.zig22
-rw-r--r--codegen/gencat.zig172
-rw-r--r--src/GenCatData.zig83
-rw-r--r--src/HangulData.zig8
-rw-r--r--src/main.zig33
5 files changed, 300 insertions, 18 deletions
diff --git a/build.zig b/build.zig
index 7e41a9a..b10b0d3 100644
--- a/build.zig
+++ b/build.zig
@@ -79,6 +79,15 @@ pub fn build(b: *std.Build) void {
79 const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); 79 const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe);
80 const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); 80 const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z");
81 81
82 const gencat_gen_exe = b.addExecutable(.{
83 .name = "gencat",
84 .root_source_file = .{ .path = "codegen/gencat.zig" },
85 .target = b.host,
86 .optimize = .Debug,
87 });
88 const run_gencat_gen_exe = b.addRunArtifact(gencat_gen_exe);
89 const gencat_gen_out = run_gencat_gen_exe.addOutputFileArg("gencat.bin.z");
90
82 // Modules we provide 91 // Modules we provide
83 // Code points 92 // Code points
84 const code_point = b.addModule("code_point", .{ 93 const code_point = b.addModule("code_point", .{
@@ -185,6 +194,14 @@ pub fn build(b: *std.Build) void {
185 norm.addImport("ziglyph", ziglyph.module("ziglyph")); 194 norm.addImport("ziglyph", ziglyph.module("ziglyph"));
186 norm.addImport("NormData", norm_data); 195 norm.addImport("NormData", norm_data);
187 196
197 // General Category
198 const gencat_data = b.createModule(.{
199 .root_source_file = .{ .path = "src/GenCatData.zig" },
200 .target = target,
201 .optimize = optimize,
202 });
203 gencat_data.addAnonymousImport("gencat", .{ .root_source_file = gencat_gen_out });
204
188 // Benchmark rig 205 // Benchmark rig
189 const exe = b.addExecutable(.{ 206 const exe = b.addExecutable(.{
190 .name = "zg", 207 .name = "zg",
@@ -194,10 +211,11 @@ pub fn build(b: *std.Build) void {
194 }); 211 });
195 // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 212 // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));
196 // exe.root_module.addImport("ascii", ascii); 213 // exe.root_module.addImport("ascii", ascii);
197 // exe.root_module.addImport("code_point", code_point); 214 exe.root_module.addImport("code_point", code_point);
198 // exe.root_module.addImport("grapheme", grapheme); 215 // exe.root_module.addImport("grapheme", grapheme);
199 // exe.root_module.addImport("DisplayWidth", display_width); 216 // exe.root_module.addImport("DisplayWidth", display_width);
200 exe.root_module.addImport("Normalizer", norm); 217 // exe.root_module.addImport("Normalizer", norm);
218 exe.root_module.addImport("GenCatData", gencat_data);
201 b.installArtifact(exe); 219 b.installArtifact(exe);
202 220
203 const run_cmd = b.addRunArtifact(exe); 221 const run_cmd = b.addRunArtifact(exe);
diff --git a/codegen/gencat.zig b/codegen/gencat.zig
new file mode 100644
index 0000000..5407040
--- /dev/null
+++ b/codegen/gencat.zig
@@ -0,0 +1,172 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
/// Unicode General Category abbreviations, spelled exactly as they are parsed
/// from the data file by `std.meta.stringToEnum` in `main`.
///
/// The integer value of each tag is what ends up in the generated table, so
/// the declaration order is significant. The thirty tags fit in a `u5`.
const Gc = enum(u5) {
    // C*: other
    Cc,
    Cf,
    Cn,
    Co,
    Cs,
    // L*: letters
    Ll,
    Lm,
    Lo,
    Lt,
    Lu,
    // M*: marks
    Mc,
    Me,
    Mn,
    // N*: numbers
    Nd,
    Nl,
    No,
    // P*: punctuation
    Pc,
    Pd,
    Pe,
    Pf,
    Pi,
    Po,
    Ps,
    // S*: symbols
    Sc,
    Sk,
    Sm,
    So,
    // Z*: separators
    Zl,
    Zp,
    Zs,
};
36
/// Number of consecutive code points grouped into one block.
const block_size = 256;

/// One block of `u5` entries, `block_size` long.
const Block = [block_size]u5;

/// Hash-map context that keys on the complete contents of a `Block`.
const BlockContext = struct {
    /// Hashes every element of `key` with Wyhash (seed 0).
    pub fn hash(_: BlockContext, key: Block) u64 {
        var wyhash = std.hash.Wyhash.init(0);
        std.hash.autoHashStrat(&wyhash, key, .DeepRecursive);
        return wyhash.final();
    }

    /// Two blocks are equal when all of their elements are equal.
    pub fn eql(_: BlockContext, lhs: Block, rhs: Block) bool {
        return std.mem.eql(u5, &lhs, &rhs);
    }
};

/// Maps a block's contents to a `u16` value, used to detect blocks that have
/// already been seen.
const BlockMap = std.HashMap(
    Block,
    u16,
    BlockContext,
    std.hash_map.default_max_load_percentage,
);
56
/// Generates the compressed three-stage General Category lookup table.
///
/// Reads `data/unicode/extracted/DerivedGeneralCategory.txt` relative to the
/// current working directory, builds the three stages, and writes them
/// deflate-compressed to the path given as the first command-line argument.
///
/// Output layout (all integers in the byte order of the machine running this
/// generator):
///   u16 stage1 length, then that many u16 entries
///   u16 stage2 length, then that many u8 entries
///   u8  stage3 length, then that many u8 entries
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Code point -> General Category (as the integer value of `Gc`).
    var flat_map = std.AutoHashMap(u21, u5).init(allocator);
    defer flat_map.deinit();

    var line_buf: [4096]u8 = undefined;

    // Process DerivedGeneralCategory.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedGeneralCategory.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        // Skip blank lines and full-line comments.
        if (line.len == 0 or line[0] == '#') continue;

        // Drop any trailing `# ...` comment.
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
        // Inclusive [first, last] range taken from field 0 of the current line.
        var current_code: [2]u21 = undefined;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    // Code point(s)
                    if (std.mem.indexOf(u8, field, "..")) |dots| {
                        current_code = .{
                            try std.fmt.parseInt(u21, field[0..dots], 16),
                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
                        };
                    } else {
                        const code = try std.fmt.parseInt(u21, field, 16);
                        current_code = .{ code, code };
                    }
                },
                1 => {
                    // General category
                    const gc = std.meta.stringToEnum(Gc, field) orelse return error.UnknownGenCat;
                    for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), @intFromEnum(gc));
                },
                else => {},
            }
        }
    }

    // Deduplicates stage-2 blocks: block contents -> offset of the block in stage2.
    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    // stage1[cp >> 8] is the offset in stage2 of the block containing `cp`.
    var stage1 = std.ArrayList(u16).init(allocator);
    defer stage1.deinit();

    // stage2 holds the unique blocks back to back; each entry indexes stage3.
    var stage2 = std.ArrayList(u5).init(allocator);
    defer stage2.deinit();

    // stage3 holds the distinct category values in order of first appearance.
    var stage3 = std.ArrayList(u5).init(allocator);
    defer stage3.deinit();

    var block: Block = [_]u5{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        // Code points the data file does not list are Unassigned (Cn), which
        // is the file's documented `@missing` default. Falling back here
        // avoids a context-free null-unwrap panic on an incomplete file.
        const gc = flat_map.get(cp) orelse @intFromEnum(Gc.Cn);

        // Find this category in stage3, appending it on first sight.
        const stage3_idx = blk: {
            for (stage3.items, 0..) |gci, j| {
                if (gc == gci) break :blk j;
            }
            try stage3.append(gc);
            break :blk stage3.items.len - 1;
        };

        // Process block
        block[block_len] = @intCast(stage3_idx);
        block_len += 1;

        // Keep filling until the block is complete (or the last code point is reached).
        if (block_len < block_size and cp != 0x10ffff) continue;

        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            gop.value_ptr.* = @intCast(stage2.items.len);
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    // NOTE(review): this is the byte order of the machine running the
    // generator; whatever reads the file must use the same byte order --
    // confirm this holds when cross-compiling.
    const endian = builtin.cpu.arch.endian();
    try writer.writeInt(u16, @intCast(stage1.items.len), endian);
    for (stage1.items) |i| try writer.writeInt(u16, i, endian);

    try writer.writeInt(u16, @intCast(stage2.items.len), endian);
    for (stage2.items) |i| try writer.writeInt(u8, i, endian);

    try writer.writeInt(u8, @intCast(stage3.items.len), endian);
    for (stage3.items) |i| try writer.writeInt(u8, i, endian);

    try out_comp.flush();
}
diff --git a/src/GenCatData.zig b/src/GenCatData.zig
new file mode 100644
index 0000000..5496e4e
--- /dev/null
+++ b/src/GenCatData.zig
@@ -0,0 +1,83 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
/// Unicode General Category of a code point, named by its two-letter
/// abbreviation.
///
/// The integer value of each tag is significant: `gc` converts a stored
/// integer back with `@enumFromInt`, so the declaration order must not change
/// independently of the data embedded by `init`.
pub const Gc = enum(u5) {
    /// Control
    Cc,
    /// Format
    Cf,
    /// Unassigned
    Cn,
    /// Private Use
    Co,
    /// Surrogate
    Cs,
    /// Lowercase Letter
    Ll,
    /// Modifier Letter
    Lm,
    /// Other Letter
    Lo,
    /// Titlecase Letter
    Lt,
    /// Uppercase Letter
    Lu,
    /// Spacing Mark
    Mc,
    /// Enclosing Mark
    Me,
    /// Nonspacing Mark
    Mn,
    /// Decimal Number
    Nd,
    /// Letter Number
    Nl,
    /// Other Number
    No,
    /// Connector Punctuation
    Pc,
    /// Dash Punctuation
    Pd,
    /// Close Punctuation
    Pe,
    /// Final Punctuation
    Pf,
    /// Initial Punctuation
    Pi,
    /// Other Punctuation
    Po,
    /// Open Punctuation
    Ps,
    /// Currency Symbol
    Sc,
    /// Modifier Symbol
    Sk,
    /// Math Symbol
    Sm,
    /// Other Symbol
    So,
    /// Line Separator
    Zl,
    /// Paragraph Separator
    Zp,
    /// Space Separator
    Zs,
};
39
/// Allocator that owns `s1`, `s2` and `s3`; used again by `deinit`.
allocator: mem.Allocator,
/// Stage 1: indexed by `cp >> 8`; each entry is a base offset into `s2`.
s1: []u16 = undefined,
/// Stage 2: indexed by the stage-1 offset plus `cp & 0xff`; each entry is an index into `s3`.
s2: []u5 = undefined,
/// Stage 3: each entry is the integer value of a `Gc` tag.
s3: []u5 = undefined,

const Self = @This();

/// Decompresses the embedded `gencat` data and loads the three lookup stages.
///
/// The returned value owns three slices allocated with `allocator`; release
/// them with `deinit`. If decompression, a read, or an allocation fails, the
/// error is returned and every slice allocated so far is freed.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("gencat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    // Each stage is stored as a length prefix followed by its entries.
    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    // Without this, s1 would leak if any later read or allocation fails.
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u5, s2_len);
    errdefer allocator.free(self.s2);
    // Stage 2 and stage 3 entries are stored one byte each and narrowed to u5.
    for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));

    const s3_len: u16 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u5, s3_len);
    errdefer allocator.free(self.s3);
    for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian));

    return self;
}
73
/// Frees the three stage slices using the allocator stored in `self`.
pub fn deinit(self: *Self) void {
    const owner = self.allocator;
    owner.free(self.s1);
    owner.free(self.s2);
    owner.free(self.s3);
}
79
/// Returns the General Category of `cp` by walking the three stages: the high
/// bits of `cp` pick a block offset from `s1`, the low 8 bits pick an entry of
/// that block in `s2`, and that entry picks the category value from `s3`.
pub inline fn gc(self: Self, cp: u21) Gc {
    const block_offset = self.s1[cp >> 8];
    const within_block = cp & 0xff;
    const category_index = self.s2[block_offset + within_block];
    return @enumFromInt(self.s3[category_index]);
}
diff --git a/src/HangulData.zig b/src/HangulData.zig
index 4d80c99..b97424c 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -15,7 +15,7 @@ pub const Syllable = enum {
15 15
16allocator: mem.Allocator, 16allocator: mem.Allocator,
17s1: []u16 = undefined, 17s1: []u16 = undefined,
18s2: []Syllable = undefined, 18s2: []u3 = undefined,
19 19
20const Self = @This(); 20const Self = @This();
21 21
@@ -35,8 +35,8 @@ pub fn init(allocator: mem.Allocator) !Self {
35 for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); 35 for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
36 36
37 const stage_2_len: u16 = try reader.readInt(u16, endian); 37 const stage_2_len: u16 = try reader.readInt(u16, endian);
38 self.s2 = try allocator.alloc(Syllable, stage_2_len); 38 self.s2 = try allocator.alloc(u3, stage_2_len);
39 for (0..stage_2_len) |i| self.s2[i] = @enumFromInt(try reader.readInt(u8, endian)); 39 for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));
40 40
41 return self; 41 return self;
42} 42}
@@ -48,5 +48,5 @@ pub fn deinit(self: *Self) void {
48 48
49/// Returns the Hangul syllable type for `cp`. 49/// Returns the Hangul syllable type for `cp`.
50pub inline fn syllable(self: Self, cp: u21) Syllable { 50pub inline fn syllable(self: Self, cp: u21) Syllable {
51 return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; 51 return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]);
52} 52}
diff --git a/src/main.zig b/src/main.zig
index 0f1aab5..c521c4f 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -11,14 +11,16 @@ const std = @import("std");
11// const strWidth = @import("display_width").strWidth; 11// const strWidth = @import("display_width").strWidth;
12 12
13// const CodePointIterator = @import("ziglyph").CodePointIterator; 13// const CodePointIterator = @import("ziglyph").CodePointIterator;
14// const CodePointIterator = @import("code_point").Iterator; 14const CodePointIterator = @import("code_point").Iterator;
15 15
16// const ascii = @import("ascii"); 16// const ascii = @import("ascii");
17// const ascii = std.ascii; 17// const ascii = std.ascii;
18 18
19// const Normalizer = @import("ziglyph").Normalizer; 19// const Normalizer = @import("ziglyph").Normalizer;
20const NormData = @import("Normalizer").NormData; 20// const NormData = @import("Normalizer").NormData;
21const Normalizer = @import("Normalizer"); 21// const Normalizer = @import("Normalizer");
22
23const GenCatData = @import("GenCatData");
22 24
23pub fn main() !void { 25pub fn main() !void {
24 var args_iter = std.process.args(); 26 var args_iter = std.process.args();
@@ -32,16 +34,19 @@ pub fn main() !void {
32 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); 34 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32));
33 defer allocator.free(input); 35 defer allocator.free(input);
34 36
35 var data = try NormData.init(allocator); 37 // var data = try NormData.init(allocator);
36 defer data.deinit(); 38 // defer data.deinit();
37 var n = Normalizer{ .norm_data = &data }; 39 // var n = Normalizer{ .norm_data = &data };
38 // var n = try Normalizer.init(allocator); 40 // var n = try Normalizer.init(allocator);
39 // defer n.deinit(); 41 // defer n.deinit();
40 42
43 var gencat_data = try GenCatData.init(allocator);
44 defer gencat_data.deinit();
45
41 // var iter = GraphemeIterator.init(input, &data); 46 // var iter = GraphemeIterator.init(input, &data);
42 // defer iter.deinit(); 47 // defer iter.deinit();
43 // var iter = CodePointIterator{ .bytes = input }; 48 var iter = CodePointIterator{ .bytes = input };
44 var iter = std.mem.splitScalar(u8, input, '\n'); 49 // var iter = std.mem.splitScalar(u8, input, '\n');
45 50
46 var result: usize = 0; 51 var result: usize = 0;
47 // var result: isize = 0; 52 // var result: isize = 0;
@@ -50,10 +55,14 @@ pub fn main() !void {
50 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 55 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
51 // while (iter.next()) |_| result += 1; 56 // while (iter.next()) |_| result += 1;
52 // while (iter.next()) |line| result += strWidth(line, &data); 57 // while (iter.next()) |line| result += strWidth(line, &data);
53 while (iter.next()) |line| { 58 // while (iter.next()) |line| {
54 const nfc = try n.nfc(allocator, line); 59 // const nfc = try n.nfc(allocator, line);
55 result += nfc.slice.len; 60 // result += nfc.slice.len;
56 // nfc.deinit(); 61 // // nfc.deinit();
62 // }
63 while (iter.next()) |cp| {
64 if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) });
65 result += 1;
57 } 66 }
58 67
59 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 68 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });