summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build.zig25
-rw-r--r--codegen/dwp.zig243
-rw-r--r--src/display_width.zig111
-rw-r--r--src/main.zig14
4 files changed, 386 insertions, 7 deletions
diff --git a/build.zig b/build.zig
index 6353874..2a39549 100644
--- a/build.zig
+++ b/build.zig
@@ -17,6 +17,15 @@ pub fn build(b: *std.Build) void {
17 const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); 17 const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe);
18 const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig"); 18 const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig");
19 19
20 const dwp_gen_exe = b.addExecutable(.{
21 .name = "dwp",
22 .root_source_file = .{ .path = "codegen/dwp.zig" },
23 .target = b.host,
24 .optimize = .Debug,
25 });
26 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe);
27 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig");
28
20 // Modules we provide 29 // Modules we provide
21 const code_point = b.addModule("CodePoint", .{ 30 const code_point = b.addModule("CodePoint", .{
22 .root_source_file = .{ .path = "src/CodePoint.zig" }, 31 .root_source_file = .{ .path = "src/CodePoint.zig" },
@@ -32,6 +41,15 @@ pub fn build(b: *std.Build) void {
32 grapheme.addImport("CodePoint", code_point); 41 grapheme.addImport("CodePoint", code_point);
33 grapheme.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); 42 grapheme.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out });
34 43
44 const display_width = b.addModule("display_width", .{
45 .root_source_file = .{ .path = "src/display_width.zig" },
46 .target = target,
47 .optimize = optimize,
48 });
49 display_width.addImport("CodePoint", code_point);
50 display_width.addImport("Grapheme", grapheme);
51 display_width.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
52
35 // Benchmark rig 53 // Benchmark rig
36 const exe = b.addExecutable(.{ 54 const exe = b.addExecutable(.{
37 .name = "zgbench", 55 .name = "zgbench",
@@ -40,7 +58,9 @@ pub fn build(b: *std.Build) void {
40 .optimize = optimize, 58 .optimize = optimize,
41 }); 59 });
42 exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 60 exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));
61 exe.root_module.addImport("CodePoint", code_point);
43 exe.root_module.addImport("Grapheme", grapheme); 62 exe.root_module.addImport("Grapheme", grapheme);
63 exe.root_module.addImport("display_width", display_width);
44 b.installArtifact(exe); 64 b.installArtifact(exe);
45 65
46 const run_cmd = b.addRunArtifact(exe); 66 const run_cmd = b.addRunArtifact(exe);
@@ -52,12 +72,13 @@ pub fn build(b: *std.Build) void {
52 72
53 // Tests 73 // Tests
54 const exe_unit_tests = b.addTest(.{ 74 const exe_unit_tests = b.addTest(.{
55 .root_source_file = .{ .path = "src/main.zig" }, 75 .root_source_file = .{ .path = "src/display_width.zig" },
56 .target = target, 76 .target = target,
57 .optimize = optimize, 77 .optimize = optimize,
58 }); 78 });
59 exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 79 exe_unit_tests.root_module.addImport("CodePoint", code_point);
60 exe_unit_tests.root_module.addImport("Grapheme", grapheme); 80 exe_unit_tests.root_module.addImport("Grapheme", grapheme);
81 exe_unit_tests.root_module.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
61 82
62 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); 83 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
63 84
diff --git a/codegen/dwp.zig b/codegen/dwp.zig
new file mode 100644
index 0000000..a8cef57
--- /dev/null
+++ b/codegen/dwp.zig
@@ -0,0 +1,243 @@
1const std = @import("std");
2
3const block_size = 256;
4const Block = [block_size]i3;
5
6const BlockMap = std.HashMap(
7 Block,
8 u16,
9 struct {
10 pub fn hash(_: @This(), k: Block) u64 {
11 var hasher = std.hash.Wyhash.init(0);
12 std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
13 return hasher.final();
14 }
15
16 pub fn eql(_: @This(), a: Block, b: Block) bool {
17 return std.mem.eql(i3, &a, &b);
18 }
19 },
20 std.hash_map.default_max_load_percentage,
21);
22
23pub fn main() !void {
24 var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
25 defer arena.deinit();
26 const allocator = arena.allocator();
27
28 var flat_map = std.AutoHashMap(u21, i3).init(allocator);
29 defer flat_map.deinit();
30
31 var line_buf: [4096]u8 = undefined;
32
33 // Process DerivedEastAsianWidth.txt
34 var deaw_file = try std.fs.cwd().openFile("unicode/extracted/DerivedEastAsianWidth.txt", .{});
35 defer deaw_file.close();
36 var deaw_buf = std.io.bufferedReader(deaw_file.reader());
37 const deaw_reader = deaw_buf.reader();
38
39 while (try deaw_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
40 if (line.len == 0) continue;
41
42 // @missing ranges
43 if (std.mem.startsWith(u8, line, "# @missing: ")) {
44 const semi = std.mem.indexOfScalar(u8, line, ';').?;
45 const field = line[12..semi];
46 const dots = std.mem.indexOf(u8, field, "..").?;
47 const from = try std.fmt.parseInt(u21, field[0..dots], 16);
48 const to = try std.fmt.parseInt(u21, field[dots + 2 ..], 16);
49 if (from == 0 and to == 0x10ffff) continue;
50 for (from..to + 1) |cp| try flat_map.put(@intCast(cp), 2);
51 continue;
52 }
53
54 if (line[0] == '#') continue;
55
56 const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
57
58 var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
59 var current_code: [2]u21 = undefined;
60
61 var i: usize = 0;
62 while (field_iter.next()) |field| : (i += 1) {
63 switch (i) {
64 0 => {
65 // Code point(s)
66 if (std.mem.indexOf(u8, field, "..")) |dots| {
67 current_code = .{
68 try std.fmt.parseInt(u21, field[0..dots], 16),
69 try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
70 };
71 } else {
72 const code = try std.fmt.parseInt(u21, field, 16);
73 current_code = .{ code, code };
74 }
75 },
76 1 => {
77 // Width
78 if (std.mem.eql(u8, field, "W") or std.mem.eql(u8, field, "F")) {
79 for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 2);
80 }
81 },
82 else => {},
83 }
84 }
85 }
86
87 // Process DerivedGeneralCategory.txt
88 var dgc_file = try std.fs.cwd().openFile("unicode/extracted/DerivedGeneralCategory.txt", .{});
89 defer dgc_file.close();
90 var dgc_buf = std.io.bufferedReader(dgc_file.reader());
91 const dgc_reader = dgc_buf.reader();
92
93 while (try dgc_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
94 if (line.len == 0 or line[0] == '#') continue;
95 const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
96
97 var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
98 var current_code: [2]u21 = undefined;
99
100 var i: usize = 0;
101 while (field_iter.next()) |field| : (i += 1) {
102 switch (i) {
103 0 => {
104 // Code point(s)
105 if (std.mem.indexOf(u8, field, "..")) |dots| {
106 current_code = .{
107 try std.fmt.parseInt(u21, field[0..dots], 16),
108 try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
109 };
110 } else {
111 const code = try std.fmt.parseInt(u21, field, 16);
112 current_code = .{ code, code };
113 }
114 },
115 1 => {
116 // General category
117 if (std.mem.eql(u8, field, "Mn")) {
118 // Nonspacing_Mark
119 for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
120 } else if (std.mem.eql(u8, field, "Me")) {
121 // Enclosing_Mark
122 for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
123 } else if (std.mem.eql(u8, field, "Mc")) {
124 // Spacing_Mark
125 for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
126 } else if (std.mem.eql(u8, field, "Cf")) {
127 if (std.mem.indexOf(u8, line, "ARABIC") == null) {
128 // Format except Arabic
129 for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
130 }
131 }
132 },
133 else => {},
134 }
135 }
136 }
137
138 var blocks_map = BlockMap.init(allocator);
139 defer blocks_map.deinit();
140
141 var stage1 = std.ArrayList(u16).init(allocator);
142 defer stage1.deinit();
143
144 var stage2 = std.ArrayList(i3).init(allocator);
145 defer stage2.deinit();
146
147 var block: Block = [_]i3{0} ** block_size;
148 var block_len: u16 = 0;
149
150 for (0..0x110000) |i| {
151 const cp: u21 = @intCast(i);
152 var width = flat_map.get(cp) orelse 1;
153
154 // Specific overrides
155 switch (cp) {
156 // Three-em dash
157 0x2e3b => width = 3,
158
159 // C0/C1 control codes
160 0...0x20,
161 0x80...0xa0,
162
163 // Line separator
164 0x2028,
165
166 // Paragraph separator
167 0x2029,
168
169 // Hangul syllable and ignorable.
170 0x1160...0x11ff,
171 0xd7b0...0xd7ff,
172 0x2060...0x206f,
173 0xfff0...0xfff8,
174 0xe0000...0xE0fff,
175
176 // Sk with EMOJI MODIFIER comment
177 0x1f3fb...0x1f3ff,
178 => width = 0,
179
180 // Two-em dash
181 0x2e3a,
182
183 // Regional indicators
184 0x1f1e6...0x1f200,
185
186 // CJK Blocks
187 0x3400...0x4dbf, // CJK Unified Ideographs Extension A
188 0x4e00...0x9fff, // CJK Unified Ideographs
189 0xf900...0xfaff, // CJK Compatibility Ideographs
190 0x20000...0x2fffd, // Plane 2
191 0x30000...0x3fffd, // Plane 3
192 => width = 2,
193
194 else => {},
195 }
196
197 // ASCII
198 if (0x20 <= cp and cp < 0x7f) width = 1;
199
200 // Soft hyphen
201 if (cp == 0xad) width = 1;
202
203 // Backspace and delete
204 if (cp == 0x8 or cp == 0x7f) width = -1;
205
206 // Process block
207 block[block_len] = width;
208 block_len += 1;
209
210 if (block_len < block_size and cp != 0x10ffff) continue;
211
212 const gop = try blocks_map.getOrPut(block);
213 if (!gop.found_existing) {
214 gop.value_ptr.* = @intCast(stage2.items.len);
215 try stage2.appendSlice(&block);
216 }
217
218 try stage1.append(gop.value_ptr.*);
219 block_len = 0;
220 }
221
222 var args_iter = std.process.args();
223 _ = args_iter.skip();
224 const output_path = args_iter.next() orelse @panic("No output file arg!");
225
226 var out_file = try std.fs.cwd().createFile(output_path, .{});
227 defer out_file.close();
228 var out_buf = std.io.bufferedWriter(out_file.writer());
229 const writer = out_buf.writer();
230
231 try writer.writeAll("const std = @import(\"std\");\n");
232
233 try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len});
234 try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len});
235 for (stage1.items) |v| try writer.print("{},", .{v});
236 try writer.writeAll("};\n");
237
238 try writer.print("pub const stage_2 = [{}]i3{{", .{stage2.items.len});
239 for (stage2.items) |v| try writer.print("{},", .{v});
240 try writer.writeAll("};\n");
241
242 try out_buf.flush();
243}
diff --git a/src/display_width.zig b/src/display_width.zig
new file mode 100644
index 0000000..e06aa8f
--- /dev/null
+++ b/src/display_width.zig
@@ -0,0 +1,111 @@
1const std = @import("std");
2const testing = std.testing;
3
4const CodePointIterator = @import("CodePoint").CodePointIterator;
5const GraphemeIterator = @import("Grapheme").GraphemeIterator;
6const dwp = @import("dwp");
7
8/// codePointWidth returns the number of cells `cp` requires when rendered
9/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
10/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
11/// control codes return 0. If `cjk` is true, ambiguous code points return 2,
12/// otherwise they return 1.
13pub fn codePointWidth(cp: u21) i3 {
14 return dwp.stage_2[dwp.stage_1[cp >> 8] + (cp & 0xff)];
15}
16
17fn strWidth(str: []const u8) usize {
18 var total: isize = 0;
19 var giter = GraphemeIterator.init(str);
20
21 while (giter.next()) |gc| {
22 var cp_iter = CodePointIterator{ .bytes = str[gc.offset..][0..gc.len] };
23 var gc_total: isize = 0;
24
25 while (cp_iter.next()) |cp| {
26 var w = codePointWidth(cp.code);
27
28 if (w != 0) {
29 // Handle text emoji sequence.
30 if (cp_iter.next()) |ncp| {
31 // emoji text sequence.
32 if (ncp.code == 0xFE0E) w = 1;
33 }
34
35 // Only adding width of first non-zero-width code point.
36 if (gc_total == 0) gc_total = w;
37 }
38 }
39
40 total += gc_total;
41 }
42
43 return if (total > 0) @intCast(total) else 0;
44}
45
46test "display_width Width" {
47 try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null
48 try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b
49 try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL
50 try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf
51 try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL
52 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF
53 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT
54 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF
55 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR
56 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ
57 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI
58
59 try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf
60 try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic
61
62 try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen
63 try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash
64 try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash
65
66 try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth
67
68 try testing.expectEqual(@as(i3, 1), codePointWidth('é'));
69 try testing.expectEqual(@as(i3, 2), codePointWidth('😊'));
70 try testing.expectEqual(@as(i3, 2), codePointWidth('统'));
71
72 try testing.expectEqual(@as(usize, 5), strWidth("Hello\r\n"));
73 try testing.expectEqual(@as(usize, 1), strWidth("\u{0065}\u{0301}"));
74 try testing.expectEqual(@as(usize, 2), strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}"));
75 try testing.expectEqual(@as(usize, 8), strWidth("Hello 😊"));
76 try testing.expectEqual(@as(usize, 8), strWidth("Héllo 😊"));
77 try testing.expectEqual(@as(usize, 8), strWidth("Héllo :)"));
78 try testing.expectEqual(@as(usize, 8), strWidth("Héllo 🇪🇸"));
79 try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}")); // Lone emoji
80 try testing.expectEqual(@as(usize, 1), strWidth("\u{26A1}\u{FE0E}")); // Text sequence
81 try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence
82 try testing.expectEqual(@as(usize, 0), strWidth("A\x08")); // Backspace
83 try testing.expectEqual(@as(usize, 0), strWidth("\x7FA")); // DEL
84 try testing.expectEqual(@as(usize, 0), strWidth("\x7FA\x08\x08")); // never less than o
85
86 // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py
87 const empty = "";
88 try testing.expectEqual(@as(usize, 0), strWidth(empty));
89 const with_null = "hello\x00world";
90 try testing.expectEqual(@as(usize, 10), strWidth(with_null));
91 const hello_jp = "コンニチハ, セカイ!";
92 try testing.expectEqual(@as(usize, 19), strWidth(hello_jp));
93 const control = "\x1b[0m";
94 try testing.expectEqual(@as(usize, 3), strWidth(control));
95 const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}";
96 try testing.expectEqual(@as(usize, 3), strWidth(balinese));
97
98 // These commented out tests require a new specification for complex scripts.
99 // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
100 // const jamo = "\u{1100}\u{1160}";
101 // try testing.expectEqual(@as(usize, 3), strWidth(jamo));
102 // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}";
103 // try testing.expectEqual(@as(usize, 3), strWidth(devengari));
104 // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}";
105 // try testing.expectEqual(@as(usize, 5), strWidth(tamal));
106 // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}";
107 // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1));
108 // The following passes but as a mere coincidence.
109 const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
110 try testing.expectEqual(@as(usize, 2), strWidth(kannada_2));
111}
diff --git a/src/main.zig b/src/main.zig
index fe49300..3e65c7b 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,7 +1,10 @@
1const std = @import("std"); 1const std = @import("std");
2 2
3// const GraphemeIterator = @import("ziglyph").GraphemeIterator; 3// const GraphemeIterator = @import("ziglyph").GraphemeIterator;
4const GraphemeIterator = @import("Grapheme").GraphemeIterator; 4// const GraphemeIterator = @import("Grapheme").GraphemeIterator;
5// const codePointWidth = @import("ziglyph").display_width.codePointWidth;
6const codePointWidth = @import("display_width").codePointWidth;
7const CodePointIterator = @import("CodePoint").CodePointIterator;
5 8
6pub fn main() !void { 9pub fn main() !void {
7 var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 10 var gpa = std.heap.GeneralPurposeAllocator(.{}){};
@@ -11,14 +14,15 @@ pub fn main() !void {
11 const input = try std.fs.cwd().readFileAlloc(allocator, "lang_mix.txt", std.math.maxInt(u32)); 14 const input = try std.fs.cwd().readFileAlloc(allocator, "lang_mix.txt", std.math.maxInt(u32));
12 defer allocator.free(input); 15 defer allocator.free(input);
13 16
14 var result: usize = 0; 17 var result: isize = 0;
15 var iter = GraphemeIterator.init(input); 18 // var iter = GraphemeIterator.init(input);
19 var iter = CodePointIterator{ .bytes = input };
16 20
17 var timer = try std.time.Timer.start(); 21 var timer = try std.time.Timer.start();
18 22
19 // for (0..50) |_| { 23 // for (0..50) |_| {
20 while (iter.next()) |_| result += 1; 24 while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
21 iter.cp_iter.i = 0; 25 // iter.cp_iter.i = 0;
22 // } 26 // }
23 27
24 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 28 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });