summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar owl2024-08-30 12:23:33 +0200
committerGravatar Komari Spaghetti2024-08-30 16:46:05 +0200
commit70bc70375f8e82843830d93631ab005302057a15 (patch)
tree0553ae38d11a3d52696c898fceafd312e7ff284a
parentfeat: Allow for the assignment separator to be configured (diff)
downloadzig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.gz
zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.xz
zig-clap-70bc70375f8e82843830d93631ab005302057a15.zip
count codepoints instead of bytes, to determine width
A complete solution would be to count grapheme clusters, but that would require adding a dependency on something like zg. Counting codepoints will ensure that typical non-ASCII text is supported, but you can still throw it off with more complex Unicode constructions, which might not be so useful in help text. Fixes #75
-rw-r--r--clap.zig76
-rw-r--r--clap/codepoint_counting_writer.zig102
2 files changed, 162 insertions, 16 deletions
diff --git a/clap.zig b/clap.zig
index 6054b24..d03de72 100644
--- a/clap.zig
+++ b/clap.zig
@@ -13,6 +13,7 @@ const testing = std.testing;
13pub const args = @import("clap/args.zig"); 13pub const args = @import("clap/args.zig");
14pub const parsers = @import("clap/parsers.zig"); 14pub const parsers = @import("clap/parsers.zig");
15pub const streaming = @import("clap/streaming.zig"); 15pub const streaming = @import("clap/streaming.zig");
16pub const ccw = @import("clap/codepoint_counting_writer.zig");
16 17
17test "clap" { 18test "clap" {
18 testing.refAllDecls(@This()); 19 testing.refAllDecls(@This());
@@ -1153,10 +1154,10 @@ pub fn help(
1153 const max_spacing = blk: { 1154 const max_spacing = blk: {
1154 var res: usize = 0; 1155 var res: usize = 0;
1155 for (params) |param| { 1156 for (params) |param| {
1156 var cs = io.countingWriter(io.null_writer); 1157 var cs = ccw.codepointCountingWriter(io.null_writer);
1157 try printParam(cs.writer(), Id, param); 1158 try printParam(cs.writer(), Id, param);
1158 if (res < cs.bytes_written) 1159 if (res < cs.codepoints_written)
1159 res = @intCast(cs.bytes_written); 1160 res = @intCast(cs.codepoints_written);
1160 } 1161 }
1161 1162
1162 break :blk res; 1163 break :blk res;
@@ -1166,22 +1167,22 @@ pub fn help(
1166 opt.description_indent + 1167 opt.description_indent +
1167 max_spacing * @intFromBool(!opt.description_on_new_line); 1168 max_spacing * @intFromBool(!opt.description_on_new_line);
1168 1169
1169 var first_paramter: bool = true; 1170 var first_parameter: bool = true;
1170 for (params) |param| { 1171 for (params) |param| {
1171 if (!first_paramter) 1172 if (!first_parameter)
1172 try writer.writeByteNTimes('\n', opt.spacing_between_parameters); 1173 try writer.writeByteNTimes('\n', opt.spacing_between_parameters);
1173 1174
1174 first_paramter = false; 1175 first_parameter = false;
1175 try writer.writeByteNTimes(' ', opt.indent); 1176 try writer.writeByteNTimes(' ', opt.indent);
1176 1177
1177 var cw = io.countingWriter(writer); 1178 var cw = ccw.codepointCountingWriter(writer);
1178 try printParam(cw.writer(), Id, param); 1179 try printParam(cw.writer(), Id, param);
1179 1180
1180 const Writer = DescriptionWriter(@TypeOf(writer)); 1181 const Writer = DescriptionWriter(@TypeOf(writer));
1181 var description_writer = Writer{ 1182 var description_writer = Writer{
1182 .underlying_writer = writer, 1183 .underlying_writer = writer,
1183 .indentation = description_indentation, 1184 .indentation = description_indentation,
1184 .printed_chars = @intCast(cw.bytes_written), 1185 .printed_chars = @intCast(cw.codepoints_written),
1185 .max_width = opt.max_width, 1186 .max_width = opt.max_width,
1186 }; 1187 };
1187 1188
@@ -1260,8 +1261,7 @@ pub fn help(
1260 } else { 1261 } else {
1261 // For none markdown like format, we just respect the newlines in the input 1262 // For none markdown like format, we just respect the newlines in the input
1262 // string and output them as is. 1263 // string and output them as is.
1263 var i: usize = 0; 1264 for (0..non_emitted_newlines) |_|
1264 while (i < non_emitted_newlines) : (i += 1)
1265 try description_writer.newline(); 1265 try description_writer.newline();
1266 } 1266 }
1267 1267
@@ -1292,7 +1292,7 @@ fn DescriptionWriter(comptime UnderlyingWriter: type) type {
1292 debug.assert(word.len != 0); 1292 debug.assert(word.len != 0);
1293 1293
1294 var first_word = writer.printed_chars <= writer.indentation; 1294 var first_word = writer.printed_chars <= writer.indentation;
1295 const chars_to_write = word.len + @intFromBool(!first_word); 1295 const chars_to_write = try std.unicode.utf8CountCodepoints(word) + @intFromBool(!first_word);
1296 if (chars_to_write + writer.printed_chars > writer.max_width) { 1296 if (chars_to_write + writer.printed_chars > writer.max_width) {
1297 // If the word does not fit on this line, then we insert a new line and print 1297 // If the word does not fit on this line, then we insert a new line and print
1298 // it on that line. The only exception to this is if this was the first word. 1298 // it on that line. The only exception to this is if this was the first word.
@@ -1744,6 +1744,50 @@ test "clap.help" {
1744 \\-d, --dd <V3>... Both repeated option. 1744 \\-d, --dd <V3>... Both repeated option.
1745 \\ 1745 \\
1746 ); 1746 );
1747
1748 // Test with multibyte characters.
1749 try testHelp(.{
1750 .indent = 0,
1751 .max_width = 46,
1752 .description_on_new_line = false,
1753 .description_indent = 4,
1754 .spacing_between_parameters = 2,
1755 },
1756 \\-a Shört flåg.
1757 \\
1758 \\
1759 \\-b <V1> Shört öptiön.
1760 \\
1761 \\
1762 \\ --aa Löng fläg.
1763 \\
1764 \\
1765 \\ --bb <V2> Löng öptiön.
1766 \\
1767 \\
1768 \\-c, --cc Bóth fläg.
1769 \\
1770 \\
1771 \\ --complicate Fläg wíth ä cömplǐcätéd
1772 \\ änd vërý löng dèscrıptıön
1773 \\ thät späns mültíplë
1774 \\ lınēs.
1775 \\
1776 \\ Pärägräph number 2:
1777 \\ * Bullet pöint
1778 \\ * Bullet pöint
1779 \\
1780 \\ Exämple:
1781 \\ sömething sömething
1782 \\ sömething
1783 \\
1784 \\
1785 \\-d, --dd <V3> Böth öptiön.
1786 \\
1787 \\
1788 \\-d, --dd <V3>... Böth repeäted öptiön.
1789 \\
1790 );
1747} 1791}
1748 1792
1749/// Will print a usage message in the following format: 1793/// Will print a usage message in the following format:
@@ -1752,18 +1796,18 @@ test "clap.help" {
1752/// First all none value taking parameters, which have a short name are printed, then non 1796/// First all none value taking parameters, which have a short name are printed, then non
1753/// positional parameters and finally the positional. 1797/// positional parameters and finally the positional.
1754pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !void { 1798pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !void {
1755 var cos = io.countingWriter(stream); 1799 var cos = ccw.codepointCountingWriter(stream);
1756 const cs = cos.writer(); 1800 const cs = cos.writer();
1757 for (params) |param| { 1801 for (params) |param| {
1758 const name = param.names.short orelse continue; 1802 const name = param.names.short orelse continue;
1759 if (param.takes_value != .none) 1803 if (param.takes_value != .none)
1760 continue; 1804 continue;
1761 1805
1762 if (cos.bytes_written == 0) 1806 if (cos.codepoints_written == 0)
1763 try stream.writeAll("[-"); 1807 try stream.writeAll("[-");
1764 try cs.writeByte(name); 1808 try cs.writeByte(name);
1765 } 1809 }
1766 if (cos.bytes_written != 0) 1810 if (cos.codepoints_written != 0)
1767 try cs.writeAll("]"); 1811 try cs.writeAll("]");
1768 1812
1769 var has_positionals: bool = false; 1813 var has_positionals: bool = false;
@@ -1782,7 +1826,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
1782 continue; 1826 continue;
1783 }; 1827 };
1784 1828
1785 if (cos.bytes_written != 0) 1829 if (cos.codepoints_written != 0)
1786 try cs.writeAll(" "); 1830 try cs.writeAll(" ");
1787 1831
1788 try cs.writeAll("["); 1832 try cs.writeAll("[");
@@ -1806,7 +1850,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
1806 if (param.names.short != null or param.names.long != null) 1850 if (param.names.short != null or param.names.long != null)
1807 continue; 1851 continue;
1808 1852
1809 if (cos.bytes_written != 0) 1853 if (cos.codepoints_written != 0)
1810 try cs.writeAll(" "); 1854 try cs.writeAll(" ");
1811 1855
1812 try cs.writeAll("<"); 1856 try cs.writeAll("<");
diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig
new file mode 100644
index 0000000..e6b9d1c
--- /dev/null
+++ b/clap/codepoint_counting_writer.zig
@@ -0,0 +1,102 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const native_endian = builtin.cpu.arch.endian();
4
/// A writer adapter that forwards bytes to `child_stream` and keeps a running
/// total of how many UTF-8 codepoints have been written through it.
///
/// Expects valid UTF-8 input. Only the start byte of each sequence is
/// inspected; continuation bytes are not validated.
pub fn CodepointCountingWriter(comptime WriterType: type) type {
    return struct {
        /// Number of complete codepoints forwarded to `child_stream` so far.
        codepoints_written: u64,
        /// The writer that receives the forwarded bytes.
        child_stream: WriterType,

        pub const Error = WriterType.Error || error{Utf8InvalidStartByte};
        pub const Writer = std.io.Writer(*Self, Error, write);

        const Self = @This();

        /// Forwards the longest prefix of `bytes` made up of whole codepoints
        /// and returns the length of that prefix in bytes.
        ///
        /// A trailing incomplete sequence is not consumed, so the caller has
        /// to pass it again together with the bytes that complete it. When
        /// `bytes` holds nothing but an incomplete sequence the return value
        /// is 0 even though `bytes` is not empty.
        ///
        /// Fails with `error.Utf8InvalidStartByte` when a sequence begins with
        /// a byte that cannot start a UTF-8 sequence, or with any error
        /// reported by `child_stream`.
        pub fn write(self: *Self, bytes: []const u8) Error!usize {
            const counted = try utf8CountCodepointsAllowTruncate(bytes);
            const whole_codepoints = bytes[0..counted.bytes];

            // `writeAll` rather than a single `write`: a short write by the
            // child could stop in the middle of a codepoint. That codepoint
            // would go uncounted, and the caller's next call would start on a
            // continuation byte and be rejected as an invalid start byte.
            try self.child_stream.writeAll(whole_codepoints);

            self.codepoints_written += counted.codepoints;
            return whole_codepoints.len;
        }

        /// Returns a `std.io` writer interface backed by this counter.
        pub fn writer(self: *Self) Writer {
            return .{ .context = self };
        }
    };
}
32
// Counts the codepoints in `s` the way `std.unicode.utf8CountCodepoints`
// does, except that a sequence cut off by the end of `s` is not an error:
// counting stops there, and the result reports how many bytes were consumed
// and how many codepoints those bytes contain.
// Only start bytes are checked; the rest of the UTF-8 is not validated.
fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
    const word_size = @sizeOf(usize);
    // A word with the top bit of every byte set (0x8080...80).
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var pos: usize = 0;
    var codepoints: usize = 0;

    while (pos < s.len) {
        // Fast path: consume a whole word at a time while every byte in it
        // is ASCII, i.e. no byte has its top bit set.
        while (s.len - pos >= word_size) {
            const word = std.mem.readInt(usize, s[pos..][0..word_size], native_endian);
            if (word & high_bits != 0) break;
            pos += word_size;
            codepoints += word_size;
        }

        if (pos == s.len) break;

        const sequence_len = try std.unicode.utf8ByteSequenceLength(s[pos]);
        // The sequence runs past the end of the input: stop and report what
        // has been counted so far.
        if (sequence_len > s.len - pos) break;

        pos += sequence_len;
        codepoints += 1;
    }

    return .{ .bytes = pos, .codepoints = codepoints };
}
63
/// Wraps `child_stream` in a `CodepointCountingWriter` whose codepoint count
/// starts at zero.
pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
    const Counter = CodepointCountingWriter(@TypeOf(child_stream));
    return Counter{
        .child_stream = child_stream,
        .codepoints_written = 0,
    };
}
67
68const testing = std.testing;
69
// Writes a long mixed ASCII/non-ASCII string in one go and checks that the
// counter agrees with `std.unicode.utf8CountCodepoints`.
test CodepointCountingWriter {
    var counting_stream = codepointCountingWriter(std.io.null_writer);
    const stream = counting_stream.writer();

    const utf8_text = "blåhaj" ** 100;
    // Propagate a write error so the test runner reports it as a failure
    // instead of hitting `unreachable`.
    try stream.writeAll(utf8_text);
    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
}
79
// Feeds the writer input that ends in the middle of a codepoint and checks
// that only whole codepoints are consumed and counted, and that resuming from
// the reported offset delivers the complete text to the child stream.
test "handles partial UTF-8 writes" {
    var buf: [100]u8 = undefined;
    var fbs = std.io.fixedBufferStream(&buf);
    var counting_stream = codepointCountingWriter(fbs.writer());
    const stream = counting_stream.writer();

    const utf8_text = "ååå";
    // `å` (U+00E5) is encoded in UTF-8 as `\xC3\xA5`, so the first three
    // bytes are 1.5 `å`s.
    const first_written = try stream.write(utf8_text[0..3]);
    // Only the one complete `å` should have been consumed.
    try testing.expectEqual("å".len, first_written);
    try testing.expectEqual(1, counting_stream.codepoints_written);

    // Write the rest, continuing from the reported number of bytes written.
    const rest_written = try stream.write(utf8_text[first_written..]);
    try testing.expectEqual(4, rest_written);
    try testing.expectEqual(3, counting_stream.codepoints_written);

    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
    try testing.expectEqual(expected_count, counting_stream.codepoints_written);

    try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
}