diff options
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | build.zig | 17 | ||||
| -rw-r--r-- | codegen/dwp.zig | 16 | ||||
| -rw-r--r-- | src/DisplayWidth.zig | 14 | ||||
| -rw-r--r-- | src/WidthData.zig | 54 |
5 files changed, 63 insertions, 42 deletions
| @@ -441,7 +441,7 @@ test "Display width" { | |||
| 441 | } | 441 | } |
| 442 | ``` | 442 | ``` |
| 443 | 443 | ||
| 444 | This has a build option, `"cjk"`, which will consider [ambiguous characters](https://www.unicode.org/reports/tr11/tr11-6.html) as double-width. | 444 | This module has build options. The first is `cjk`, which will consider [ambiguous characters](https://www.unicode.org/reports/tr11/tr11-6.html) as double-width. |
| 445 | 445 | ||
| 446 | To choose this option, add it to the dependency like so: | 446 | To choose this option, add it to the dependency like so: |
| 447 | 447 | ||
| @@ -451,6 +451,8 @@ const zg = b.dependency("zg", .{ | |||
| 451 | }); | 451 | }); |
| 452 | ``` | 452 | ``` |
| 453 | 453 | ||
| 454 | The other options are `c0_width` and `c1_width`. The standard behavior is to treat C0 and C1 control codes as zero-width, except for delete and backspace, which are -1 (the logic ensures that a `strWidth` is always at least 0). When control codes are printed using replacement characters, they must be assigned a width, hence the options. When provided, these values must fit in an `i4`; this allows for C1s to be printed as `\u{80}` if desired. | ||
| 455 | |||
| 454 | ## Scripts | 456 | ## Scripts |
| 455 | 457 | ||
| 456 | Unicode categorizes code points by the Script in which they belong. A Script | 458 | Unicode categorizes code points by the Script in which they belong. A Script |
| @@ -16,10 +16,24 @@ pub fn build(b: *std.Build) void { | |||
| 16 | const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); | 16 | const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); |
| 17 | 17 | ||
| 18 | // Display width | 18 | // Display width |
| 19 | const cjk = b.option(bool, "cjk", "Ambiguouse code points are wide (display width: 2).") orelse false; | 19 | const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; |
| 20 | const options = b.addOptions(); | 20 | const options = b.addOptions(); |
| 21 | options.addOption(bool, "cjk", cjk); | 21 | options.addOption(bool, "cjk", cjk); |
| 22 | 22 | ||
| 23 | // Visible Controls | ||
| 24 | const c0_width = b.option( | ||
| 25 | i4, | ||
| 26 | "c0_width", | ||
| 27 | "C0 controls have this width (default: 0, <BS> <Del> default -1)", | ||
| 28 | ); | ||
| 29 | options.addOption(?i4, "c0_width", c0_width); | ||
| 30 | const c1_width = b.option( | ||
| 31 | i4, | ||
| 32 | "c1_width", | ||
| 33 | "C1 controls have this width (default: 0)", | ||
| 34 | ); | ||
| 35 | options.addOption(?i4, "c1_width", c1_width); | ||
| 36 | |||
| 23 | const dwp_gen_exe = b.addExecutable(.{ | 37 | const dwp_gen_exe = b.addExecutable(.{ |
| 24 | .name = "dwp", | 38 | .name = "dwp", |
| 25 | .root_source_file = b.path("codegen/dwp.zig"), | 39 | .root_source_file = b.path("codegen/dwp.zig"), |
| @@ -210,6 +224,7 @@ pub fn build(b: *std.Build) void { | |||
| 210 | display_width.addImport("code_point", code_point); | 224 | display_width.addImport("code_point", code_point); |
| 211 | display_width.addImport("grapheme", grapheme); | 225 | display_width.addImport("grapheme", grapheme); |
| 212 | display_width.addImport("DisplayWidthData", width_data); | 226 | display_width.addImport("DisplayWidthData", width_data); |
| 227 | display_width.addOptions("options", options); // For testing | ||
| 213 | 228 | ||
| 214 | // Normalization | 229 | // Normalization |
| 215 | const ccc_data = b.createModule(.{ | 230 | const ccc_data = b.createModule(.{ |
diff --git a/codegen/dwp.zig b/codegen/dwp.zig index c581eb6..5e5bf6a 100644 --- a/codegen/dwp.zig +++ b/codegen/dwp.zig | |||
| @@ -4,7 +4,7 @@ const builtin = @import("builtin"); | |||
| 4 | const options = @import("options"); | 4 | const options = @import("options"); |
| 5 | 5 | ||
| 6 | const block_size = 256; | 6 | const block_size = 256; |
| 7 | const Block = [block_size]i3; | 7 | const Block = [block_size]i4; |
| 8 | 8 | ||
| 9 | const BlockMap = std.HashMap( | 9 | const BlockMap = std.HashMap( |
| 10 | Block, | 10 | Block, |
| @@ -17,7 +17,7 @@ const BlockMap = std.HashMap( | |||
| 17 | } | 17 | } |
| 18 | 18 | ||
| 19 | pub fn eql(_: @This(), a: Block, b: Block) bool { | 19 | pub fn eql(_: @This(), a: Block, b: Block) bool { |
| 20 | return std.mem.eql(i3, &a, &b); | 20 | return std.mem.eql(i4, &a, &b); |
| 21 | } | 21 | } |
| 22 | }, | 22 | }, |
| 23 | std.hash_map.default_max_load_percentage, | 23 | std.hash_map.default_max_load_percentage, |
| @@ -28,7 +28,7 @@ pub fn main() !void { | |||
| 28 | defer arena.deinit(); | 28 | defer arena.deinit(); |
| 29 | const allocator = arena.allocator(); | 29 | const allocator = arena.allocator(); |
| 30 | 30 | ||
| 31 | var flat_map = std.AutoHashMap(u21, i3).init(allocator); | 31 | var flat_map = std.AutoHashMap(u21, i4).init(allocator); |
| 32 | defer flat_map.deinit(); | 32 | defer flat_map.deinit(); |
| 33 | 33 | ||
| 34 | var line_buf: [4096]u8 = undefined; | 34 | var line_buf: [4096]u8 = undefined; |
| @@ -147,10 +147,10 @@ pub fn main() !void { | |||
| 147 | var stage1 = std.ArrayList(u16).init(allocator); | 147 | var stage1 = std.ArrayList(u16).init(allocator); |
| 148 | defer stage1.deinit(); | 148 | defer stage1.deinit(); |
| 149 | 149 | ||
| 150 | var stage2 = std.ArrayList(i3).init(allocator); | 150 | var stage2 = std.ArrayList(i4).init(allocator); |
| 151 | defer stage2.deinit(); | 151 | defer stage2.deinit(); |
| 152 | 152 | ||
| 153 | var block: Block = [_]i3{0} ** block_size; | 153 | var block: Block = [_]i4{0} ** block_size; |
| 154 | var block_len: u16 = 0; | 154 | var block_len: u16 = 0; |
| 155 | 155 | ||
| 156 | for (0..0x110000) |i| { | 156 | for (0..0x110000) |i| { |
| @@ -163,8 +163,8 @@ pub fn main() !void { | |||
| 163 | 0x2e3b => width = 3, | 163 | 0x2e3b => width = 3, |
| 164 | 164 | ||
| 165 | // C0/C1 control codes | 165 | // C0/C1 control codes |
| 166 | 0...0x20, | 166 | 0...0x20 => width = if (options.c0_width) |c0| c0 else 0, |
| 167 | 0x80...0xa0, | 167 | 0x80...0x9f => width = if (options.c1_width) |c1| c1 else 0, |
| 168 | 168 | ||
| 169 | // Line separator | 169 | // Line separator |
| 170 | 0x2028, | 170 | 0x2028, |
| @@ -204,7 +204,7 @@ pub fn main() !void { | |||
| 204 | if (cp == 0xad) width = 1; | 204 | if (cp == 0xad) width = 1; |
| 205 | 205 | ||
| 206 | // Backspace and delete | 206 | // Backspace and delete |
| 207 | if (cp == 0x8 or cp == 0x7f) width = -1; | 207 | if (cp == 0x8 or cp == 0x7f) width = if (options.c0_width) |c0| c0 else -1; |
| 208 | 208 | ||
| 209 | // Process block | 209 | // Process block |
| 210 | block[block_len] = width; | 210 | block[block_len] = width; |
diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig index 621b8c1..04e6b0c 100644 --- a/src/DisplayWidth.zig +++ b/src/DisplayWidth.zig | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | 2 | const builtin = @import("builtin"); |
| 3 | const options = @import("options"); | ||
| 3 | const ArrayList = std.ArrayList; | 4 | const ArrayList = std.ArrayList; |
| 4 | const mem = std.mem; | 5 | const mem = std.mem; |
| 5 | const simd = std.simd; | 6 | const simd = std.simd; |
| @@ -60,6 +61,7 @@ test "strWidth" { | |||
| 60 | const data = try DisplayWidthData.init(testing.allocator); | 61 | const data = try DisplayWidthData.init(testing.allocator); |
| 61 | defer data.deinit(); | 62 | defer data.deinit(); |
| 62 | const self = Self{ .data = &data }; | 63 | const self = Self{ .data = &data }; |
| 64 | const c0 = options.c0_width orelse 0; | ||
| 63 | 65 | ||
| 64 | try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n")); | 66 | try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n")); |
| 65 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}")); | 67 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}")); |
| @@ -74,19 +76,21 @@ test "strWidth" { | |||
| 74 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}")); // Default text presentation | 76 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}")); // Default text presentation |
| 75 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}\u{FE0E}")); // Default text presentation with VS15 selector | 77 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}\u{FE0E}")); // Default text presentation with VS15 selector |
| 76 | try testing.expectEqual(@as(usize, 2), self.strWidth("\u{2764}\u{FE0F}")); // Default text presentation with VS16 selector | 78 | try testing.expectEqual(@as(usize, 2), self.strWidth("\u{2764}\u{FE0F}")); // Default text presentation with VS16 selector |
| 77 | try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace | 79 | const expect_bs: usize = if (c0 == 0) 0 else 1 + c0; |
| 78 | try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL | 80 | try testing.expectEqual(expect_bs, self.strWidth("A\x08")); // Backspace |
| 79 | try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than o | 81 | try testing.expectEqual(expect_bs, self.strWidth("\x7FA")); // DEL |
| 82 | const expect_long_del: usize = if (c0 == 0) 0 else 1 + (c0 * 3); | ||
| 83 | try testing.expectEqual(expect_long_del, self.strWidth("\x7FA\x08\x08")); // never less than 0 | ||
| 80 | 84 | ||
| 81 | // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py | 85 | // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py |
| 82 | const empty = ""; | 86 | const empty = ""; |
| 83 | try testing.expectEqual(@as(usize, 0), self.strWidth(empty)); | 87 | try testing.expectEqual(@as(usize, 0), self.strWidth(empty)); |
| 84 | const with_null = "hello\x00world"; | 88 | const with_null = "hello\x00world"; |
| 85 | try testing.expectEqual(@as(usize, 10), self.strWidth(with_null)); | 89 | try testing.expectEqual(@as(usize, 10 + c0), self.strWidth(with_null)); |
| 86 | const hello_jp = "コンニチハ, セカイ!"; | 90 | const hello_jp = "コンニチハ, セカイ!"; |
| 87 | try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp)); | 91 | try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp)); |
| 88 | const control = "\x1b[0m"; | 92 | const control = "\x1b[0m"; |
| 89 | try testing.expectEqual(@as(usize, 3), self.strWidth(control)); | 93 | try testing.expectEqual(@as(usize, 3 + c0), self.strWidth(control)); |
| 90 | const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; | 94 | const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; |
| 91 | try testing.expectEqual(@as(usize, 3), self.strWidth(balinese)); | 95 | try testing.expectEqual(@as(usize, 3), self.strWidth(balinese)); |
| 92 | 96 | ||
diff --git a/src/WidthData.zig b/src/WidthData.zig index 1b7fb2e..d77879e 100644 --- a/src/WidthData.zig +++ b/src/WidthData.zig | |||
| @@ -9,7 +9,7 @@ const GraphemeData = @import("GraphemeData"); | |||
| 9 | allocator: mem.Allocator, | 9 | allocator: mem.Allocator, |
| 10 | g_data: GraphemeData, | 10 | g_data: GraphemeData, |
| 11 | s1: []u16 = undefined, | 11 | s1: []u16 = undefined, |
| 12 | s2: []i3 = undefined, | 12 | s2: []i4 = undefined, |
| 13 | 13 | ||
| 14 | const Self = @This(); | 14 | const Self = @This(); |
| 15 | 15 | ||
| @@ -34,7 +34,7 @@ pub fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { | |||
| 34 | for (0..stage_1_len) |i| self.s1[i] = reader.readInt(u16, endian) catch unreachable; | 34 | for (0..stage_1_len) |i| self.s1[i] = reader.readInt(u16, endian) catch unreachable; |
| 35 | 35 | ||
| 36 | const stage_2_len: u16 = reader.readInt(u16, endian) catch unreachable; | 36 | const stage_2_len: u16 = reader.readInt(u16, endian) catch unreachable; |
| 37 | self.s2 = try allocator.alloc(i3, stage_2_len); | 37 | self.s2 = try allocator.alloc(i4, stage_2_len); |
| 38 | errdefer allocator.free(self.s2); | 38 | errdefer allocator.free(self.s2); |
| 39 | for (0..stage_2_len) |i| self.s2[i] = @intCast(reader.readInt(i8, endian) catch unreachable); | 39 | for (0..stage_2_len) |i| self.s2[i] = @intCast(reader.readInt(i8, endian) catch unreachable); |
| 40 | 40 | ||
| @@ -52,33 +52,33 @@ pub fn deinit(self: *const Self) void { | |||
| 52 | /// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 | 52 | /// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 |
| 53 | /// control codes return 0. If `cjk` is true, ambiguous code points return 2, | 53 | /// control codes return 0. If `cjk` is true, ambiguous code points return 2, |
| 54 | /// otherwise they return 1. | 54 | /// otherwise they return 1. |
| 55 | pub fn codePointWidth(self: Self, cp: u21) i3 { | 55 | pub fn codePointWidth(self: Self, cp: u21) i4 { |
| 56 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; | 56 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; |
| 57 | } | 57 | } |
| 58 | 58 | ||
| 59 | test "codePointWidth" { | 59 | test "codePointWidth" { |
| 60 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null | 60 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x0000)); // null |
| 61 | try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b | 61 | try testing.expectEqual(@as(i4, -1), codePointWidth(0x8)); // \b |
| 62 | try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL | 62 | try testing.expectEqual(@as(i4, -1), codePointWidth(0x7f)); // DEL |
| 63 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf | 63 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x0005)); // Cf |
| 64 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL | 64 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x0007)); // \a BEL |
| 65 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF | 65 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x000A)); // \n LF |
| 66 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT | 66 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x000B)); // \v VT |
| 67 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF | 67 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x000C)); // \f FF |
| 68 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR | 68 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x000D)); // \r CR |
| 69 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ | 69 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x000E)); // SQ |
| 70 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI | 70 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x000F)); // SI |
| 71 | 71 | ||
| 72 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf | 72 | try testing.expectEqual(@as(i4, 0), codePointWidth(0x070F)); // Cf |
| 73 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic | 73 | try testing.expectEqual(@as(i4, 1), codePointWidth(0x0603)); // Cf Arabic |
| 74 | 74 | ||
| 75 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen | 75 | try testing.expectEqual(@as(i4, 1), codePointWidth(0x00AD)); // soft-hyphen |
| 76 | try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash | 76 | try testing.expectEqual(@as(i4, 2), codePointWidth(0x2E3A)); // two-em dash |
| 77 | try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash | 77 | try testing.expectEqual(@as(i4, 3), codePointWidth(0x2E3B)); // three-em dash |
| 78 | 78 | ||
| 79 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth | 79 | try testing.expectEqual(@as(i4, 1), codePointWidth(0x00BD)); // ambiguous halfwidth |
| 80 | 80 | ||
| 81 | try testing.expectEqual(@as(i3, 1), codePointWidth('é')); | 81 | try testing.expectEqual(@as(i4, 1), codePointWidth('é')); |
| 82 | try testing.expectEqual(@as(i3, 2), codePointWidth('😊')); | 82 | try testing.expectEqual(@as(i4, 2), codePointWidth('😊')); |
| 83 | try testing.expectEqual(@as(i3, 2), codePointWidth('统')); | 83 | try testing.expectEqual(@as(i4, 2), codePointWidth('统')); |
| 84 | } | 84 | } |