count codepoints instead of bytes, to determine width

A complete solution would be to count grapheme clusters, but that would require adding a dependency on something like zg. Counting codepoints will ensure that typical non-ASCII text is supported, but you can still throw it off with more complex Unicode constructions, which might not be so useful in help text. Fixes #75
author: owl 2024-08-30 12:23:33 +0200
committer: Komari Spaghetti 2024-08-30 16:46:05 +0200
commit: 70bc70375f8e82843830d93631ab005302057a15 (patch)
tree: 0553ae38d11a3d52696c898fceafd312e7ff284a
parent: feat: Allow for the assignment separator to be configured (diff)
download: zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.gz
zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.xz
zig-clap-70bc70375f8e82843830d93631ab005302057a15.zip
2 files changed, 162 insertions, 16 deletions
diff --git a/clap.zig b/clap.zig
index 6054b24..d03de72 100644
--- a/clap.zig
+++ b/clap.zig
@@ -13,6 +13,7 @@ const testing = std.testing;
 pub const args = @import("clap/args.zig");
 pub const parsers = @import("clap/parsers.zig");
 pub const streaming = @import("clap/streaming.zig");
+pub const ccw = @import("clap/codepoint_counting_writer.zig");
 test "clap" {
    testing.refAllDecls(@This());
@@ -1153,10 +1154,10 @@ pub fn help(
    const max_spacing = blk: {
        var res: usize = 0;
        for (params) |param| {
-            var cs = io.countingWriter(io.null_writer);
+            var cs = ccw.codepointCountingWriter(io.null_writer);
            try printParam(cs.writer(), Id, param);
-            if (res < cs.bytes_written)
+            if (res < cs.codepoints_written)
-                res = @intCast(cs.bytes_written);
+                res = @intCast(cs.codepoints_written);
        }
        break :blk res;
@@ -1166,22 +1167,22 @@ pub fn help(
        opt.description_indent +
        max_spacing * @intFromBool(!opt.description_on_new_line);
-    var first_paramter: bool = true;
+    var first_parameter: bool = true;
    for (params) |param| {
-        if (!first_paramter)
+        if (!first_parameter)
            try writer.writeByteNTimes('\n', opt.spacing_between_parameters);
-        first_paramter = false;
+        first_parameter = false;
        try writer.writeByteNTimes(' ', opt.indent);
-        var cw = io.countingWriter(writer);
+        var cw = ccw.codepointCountingWriter(writer);
        try printParam(cw.writer(), Id, param);
        const Writer = DescriptionWriter(@TypeOf(writer));
        var description_writer = Writer{
            .underlying_writer = writer,
            .indentation = description_indentation,
-            .printed_chars = @intCast(cw.bytes_written),
+            .printed_chars = @intCast(cw.codepoints_written),
            .max_width = opt.max_width,
        };
@@ -1260,8 +1261,7 @@ pub fn help(
            } else {
                // For non-markdown-like formats, we just respect the newlines in the input
                // string and output them as is.
-                var i: usize = 0;
+                for (0..non_emitted_newlines) |_|
-                while (i < non_emitted_newlines) : (i += 1)
                    try description_writer.newline();
            }
@@ -1292,7 +1292,7 @@ fn DescriptionWriter(comptime UnderlyingWriter: type) type {
            debug.assert(word.len != 0);
            var first_word = writer.printed_chars <= writer.indentation;
-            const chars_to_write = word.len + @intFromBool(!first_word);
+            const chars_to_write = try std.unicode.utf8CountCodepoints(word) + @intFromBool(!first_word);
            if (chars_to_write + writer.printed_chars > writer.max_width) {
                // If the word does not fit on this line, then we insert a new line and print
                // it on that line. The only exception to this is if this was the first word.
@@ -1744,6 +1744,50 @@ test "clap.help" {
        \\-d, --dd <V3>...    Both repeated option.
        \\
    );
+    // Test with multibyte characters.
+    try testHelp(.{
+        .indent = 0,
+        .max_width = 46,
+        .description_on_new_line = false,
+        .description_indent = 4,
+        .spacing_between_parameters = 2,
+    },
+        \\-a                  Shört flåg.
+        \\
+        \\
+        \\-b <V1>             Shört öptiön.
+        \\
+        \\
+        \\    --aa            Löng fläg.
+        \\
+        \\
+        \\    --bb <V2>       Löng öptiön.
+        \\
+        \\
+        \\-c, --cc            Bóth fläg.
+        \\
+        \\
+        \\    --complicate    Fläg wíth ä cömplǐcätéd
+        \\                    änd vërý löng dèscrıptıön
+        \\                    thät späns mültíplë
+        \\                    lınēs.
+        \\
+        \\                    Pärägräph number 2:
+        \\                    * Bullet pöint
+        \\                    * Bullet pöint
+        \\
+        \\                    Exämple:
+        \\                        sömething sömething
+        \\                        sömething
+        \\
+        \\
+        \\-d, --dd <V3>       Böth öptiön.
+        \\
+        \\
+        \\-d, --dd <V3>...    Böth repeäted öptiön.
+        \\
+    );
 }
 /// Will print a usage message in the following format:
@@ -1752,18 +1796,18 @@ test "clap.help" {
 /// First, all parameters that take no value and have a short name are printed, then the
 /// non-positional parameters, and finally the positionals.
 pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !void {
-    var cos = io.countingWriter(stream);
+    var cos = ccw.codepointCountingWriter(stream);
    const cs = cos.writer();
    for (params) |param| {
        const name = param.names.short orelse continue;
        if (param.takes_value != .none)
            continue;
-        if (cos.bytes_written == 0)
+        if (cos.codepoints_written == 0)
            try stream.writeAll("[-");
        try cs.writeByte(name);
    }
-    if (cos.bytes_written != 0)
+    if (cos.codepoints_written != 0)
        try cs.writeAll("]");
    var has_positionals: bool = false;
@@ -1782,7 +1826,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
            continue;
        };
-        if (cos.bytes_written != 0)
+        if (cos.codepoints_written != 0)
            try cs.writeAll(" ");
        try cs.writeAll("[");
@@ -1806,7 +1850,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
        if (param.names.short != null or param.names.long != null)
            continue;
-        if (cos.bytes_written != 0)
+        if (cos.codepoints_written != 0)
            try cs.writeAll(" ");
        try cs.writeAll("<");
diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig
new file mode 100644
index 0000000..e6b9d1c
--- /dev/null
+++ b/clap/codepoint_counting_writer.zig
@@ -0,0 +1,102 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const native_endian = builtin.cpu.arch.endian();
+/// A Writer that counts how many codepoints have been written to it.
+/// Expects valid UTF-8 input, and does not validate the input.
+pub fn CodepointCountingWriter(comptime WriterType: type) type {
+    return struct {
+        codepoints_written: u64,
+        child_stream: WriterType,
+        pub const Error = WriterType.Error || error{Utf8InvalidStartByte};
+        pub const Writer = std.io.Writer(*Self, Error, write);
+        const Self = @This();
+        pub fn write(self: *Self, bytes: []const u8) Error!usize {
+            const bytes_and_codepoints = try utf8CountCodepointsAllowTruncate(bytes);
+            // Might not be the full input, so the leftover bytes are written on the next call.
+            const bytes_to_write = bytes[0..bytes_and_codepoints.bytes];
+            const amt = try self.child_stream.write(bytes_to_write);
+            const bytes_written = bytes_to_write[0..amt];
+            self.codepoints_written += (try utf8CountCodepointsAllowTruncate(bytes_written)).codepoints;
+            return amt;
+        }
+        pub fn writer(self: *Self) Writer {
+            return .{ .context = self };
+        }
+    };
+}
+// Like `std.unicode.utf8CountCodepoints`, but on truncated input, it returns
+// the number of codepoints up to that point.
+// Does not validate UTF-8 beyond checking the start byte.
+fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
+    var len: usize = 0;
+    const N = @sizeOf(usize);
+    const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
+    var i: usize = 0;
+    while (i < s.len) {
+        // Fast path for ASCII sequences
+        while (i + N <= s.len) : (i += N) {
+            const v = std.mem.readInt(usize, s[i..][0..N], native_endian);
+            if (v & MASK != 0) break;
+            len += N;
+        }
+        if (i < s.len) {
+            const n = try std.unicode.utf8ByteSequenceLength(s[i]);
+            // Truncated input; return the current counts.
+            if (i + n > s.len) return .{ .bytes = i, .codepoints = len };
+            i += n;
+            len += 1;
+        }
+    }
+    return .{ .bytes = i, .codepoints = len };
+}
+pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
+    return .{ .codepoints_written = 0, .child_stream = child_stream };
+}
+const testing = std.testing;
+test CodepointCountingWriter {
+    var counting_stream = codepointCountingWriter(std.io.null_writer);
+    const stream = counting_stream.writer();
+    const utf8_text = "blåhaj" ** 100;
+    stream.writeAll(utf8_text) catch unreachable;
+    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
+    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
+}
+test "handles partial UTF-8 writes" {
+    var buf: [100]u8 = undefined;
+    var fbs = std.io.fixedBufferStream(&buf);
+    var counting_stream = codepointCountingWriter(fbs.writer());
+    const stream = counting_stream.writer();
+    const utf8_text = "ååå";
+    // `å` is represented as `\xC3\xA5`, write 1.5 `å`s.
+    var wc = try stream.write(utf8_text[0..3]);
+    // One should have been written fully.
+    try testing.expectEqual("å".len, wc);
+    try testing.expectEqual(1, counting_stream.codepoints_written);
+    // Write the rest, continuing from the reported number of bytes written.
+    wc = try stream.write(utf8_text[wc..]);
+    try testing.expectEqual(4, wc);
+    try testing.expectEqual(3, counting_stream.codepoints_written);
+    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
+    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
+    try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
+}
author	owl	2024-08-30 12:23:33 +0200
committer	Komari Spaghetti	2024-08-30 16:46:05 +0200
commit	70bc70375f8e82843830d93631ab005302057a15 (patch)
tree	0553ae38d11a3d52696c898fceafd312e7ff284a
parent	feat: Allow for the assignment separator to be configured (diff)
download	zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.gz zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.xz zig-clap-70bc70375f8e82843830d93631ab005302057a15.zip

diff --git a/clap.zig b/clap.zig index 6054b24..d03de72 100644 --- a/clap.zig +++ b/clap.zig
@@ -13,6 +13,7 @@ const testing = std.testing;
13	pub const args = @import("clap/args.zig");	13	pub const args = @import("clap/args.zig");
14	pub const parsers = @import("clap/parsers.zig");	14	pub const parsers = @import("clap/parsers.zig");
15	pub const streaming = @import("clap/streaming.zig");	15	pub const streaming = @import("clap/streaming.zig");
		16	pub const ccw = @import("clap/codepoint_counting_writer.zig");
16		17
17	test "clap" {	18	test "clap" {
18	testing.refAllDecls(@This());	19	testing.refAllDecls(@This());
@@ -1153,10 +1154,10 @@ pub fn help(
1153	const max_spacing = blk: {	1154	const max_spacing = blk: {
1154	var res: usize = 0;	1155	var res: usize = 0;
1155	for (params) \|param\| {	1156	for (params) \|param\| {
1156	var cs = io.countingWriter(io.null_writer);	1157	var cs = ccw.codepointCountingWriter(io.null_writer);
1157	try printParam(cs.writer(), Id, param);	1158	try printParam(cs.writer(), Id, param);
1158	if (res < cs.bytes_written)	1159	if (res < cs.codepoints_written)
1159	res = @intCast(cs.bytes_written);	1160	res = @intCast(cs.codepoints_written);
1160	}	1161	}
1161		1162
1162	break :blk res;	1163	break :blk res;
@@ -1166,22 +1167,22 @@ pub fn help(
1166	opt.description_indent +	1167	opt.description_indent +
1167	max_spacing * @intFromBool(!opt.description_on_new_line);	1168	max_spacing * @intFromBool(!opt.description_on_new_line);
1168		1169
1169	var first_paramter: bool = true;	1170	var first_parameter: bool = true;
1170	for (params) \|param\| {	1171	for (params) \|param\| {
1171	if (!first_paramter)	1172	if (!first_parameter)
1172	try writer.writeByteNTimes('\n', opt.spacing_between_parameters);	1173	try writer.writeByteNTimes('\n', opt.spacing_between_parameters);
1173		1174
1174	first_paramter = false;	1175	first_parameter = false;
1175	try writer.writeByteNTimes(' ', opt.indent);	1176	try writer.writeByteNTimes(' ', opt.indent);
1176		1177
1177	var cw = io.countingWriter(writer);	1178	var cw = ccw.codepointCountingWriter(writer);
1178	try printParam(cw.writer(), Id, param);	1179	try printParam(cw.writer(), Id, param);
1179		1180
1180	const Writer = DescriptionWriter(@TypeOf(writer));	1181	const Writer = DescriptionWriter(@TypeOf(writer));
1181	var description_writer = Writer{	1182	var description_writer = Writer{
1182	.underlying_writer = writer,	1183	.underlying_writer = writer,
1183	.indentation = description_indentation,	1184	.indentation = description_indentation,
1184	.printed_chars = @intCast(cw.bytes_written),	1185	.printed_chars = @intCast(cw.codepoints_written),
1185	.max_width = opt.max_width,	1186	.max_width = opt.max_width,
1186	};	1187	};
1187		1188
@@ -1260,8 +1261,7 @@ pub fn help(
1260	} else {	1261	} else {
1261	// For none markdown like format, we just respect the newlines in the input	1262	// For none markdown like format, we just respect the newlines in the input
1262	// string and output them as is.	1263	// string and output them as is.
1263	var i: usize = 0;	1264	for (0..non_emitted_newlines) \|_\|
1264	while (i < non_emitted_newlines) : (i += 1)
1265	try description_writer.newline();	1265	try description_writer.newline();
1266	}	1266	}
1267		1267
@@ -1292,7 +1292,7 @@ fn DescriptionWriter(comptime UnderlyingWriter: type) type {
1292	debug.assert(word.len != 0);	1292	debug.assert(word.len != 0);
1293		1293
1294	var first_word = writer.printed_chars <= writer.indentation;	1294	var first_word = writer.printed_chars <= writer.indentation;
1295	const chars_to_write = word.len + @intFromBool(!first_word);	1295	const chars_to_write = try std.unicode.utf8CountCodepoints(word) + @intFromBool(!first_word);
1296	if (chars_to_write + writer.printed_chars > writer.max_width) {	1296	if (chars_to_write + writer.printed_chars > writer.max_width) {
1297	// If the word does not fit on this line, then we insert a new line and print	1297	// If the word does not fit on this line, then we insert a new line and print
1298	// it on that line. The only exception to this is if this was the first word.	1298	// it on that line. The only exception to this is if this was the first word.
@@ -1744,6 +1744,50 @@ test "clap.help" {
1744	\\-d, --dd <V3>... Both repeated option.	1744	\\-d, --dd <V3>... Both repeated option.
1745	\\	1745	\\
1746	);	1746	);
		1747
		1748	// Test with multibyte characters.
		1749	try testHelp(.{
		1750	.indent = 0,
		1751	.max_width = 46,
		1752	.description_on_new_line = false,
		1753	.description_indent = 4,
		1754	.spacing_between_parameters = 2,
		1755	},
		1756	\\-a Shört flåg.
		1757	\\
		1758	\\
		1759	\\-b <V1> Shört öptiön.
		1760	\\
		1761	\\
		1762	\\ --aa Löng fläg.
		1763	\\
		1764	\\
		1765	\\ --bb <V2> Löng öptiön.
		1766	\\
		1767	\\
		1768	\\-c, --cc Bóth fläg.
		1769	\\
		1770	\\
		1771	\\ --complicate Fläg wíth ä cömplǐcätéd
		1772	\\ änd vërý löng dèscrıptıön
		1773	\\ thät späns mültíplë
		1774	\\ lınēs.
		1775	\\
		1776	\\ Pärägräph number 2:
		1777	\\ * Bullet pöint
		1778	\\ * Bullet pöint
		1779	\\
		1780	\\ Exämple:
		1781	\\ sömething sömething
		1782	\\ sömething
		1783	\\
		1784	\\
		1785	\\-d, --dd <V3> Böth öptiön.
		1786	\\
		1787	\\
		1788	\\-d, --dd <V3>... Böth repeäted öptiön.
		1789	\\
		1790	);
1747	}	1791	}
1748		1792
1749	/// Will print a usage message in the following format:	1793	/// Will print a usage message in the following format:
@@ -1752,18 +1796,18 @@ test "clap.help" {
1752	/// First all none value taking parameters, which have a short name are printed, then non	1796	/// First all none value taking parameters, which have a short name are printed, then non
1753	/// positional parameters and finally the positional.	1797	/// positional parameters and finally the positional.
1754	pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !void {	1798	pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !void {
1755	var cos = io.countingWriter(stream);	1799	var cos = ccw.codepointCountingWriter(stream);
1756	const cs = cos.writer();	1800	const cs = cos.writer();
1757	for (params) \|param\| {	1801	for (params) \|param\| {
1758	const name = param.names.short orelse continue;	1802	const name = param.names.short orelse continue;
1759	if (param.takes_value != .none)	1803	if (param.takes_value != .none)
1760	continue;	1804	continue;
1761		1805
1762	if (cos.bytes_written == 0)	1806	if (cos.codepoints_written == 0)
1763	try stream.writeAll("[-");	1807	try stream.writeAll("[-");
1764	try cs.writeByte(name);	1808	try cs.writeByte(name);
1765	}	1809	}
1766	if (cos.bytes_written != 0)	1810	if (cos.codepoints_written != 0)
1767	try cs.writeAll("]");	1811	try cs.writeAll("]");
1768		1812
1769	var has_positionals: bool = false;	1813	var has_positionals: bool = false;
@@ -1782,7 +1826,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
1782	continue;	1826	continue;
1783	};	1827	};
1784		1828
1785	if (cos.bytes_written != 0)	1829	if (cos.codepoints_written != 0)
1786	try cs.writeAll(" ");	1830	try cs.writeAll(" ");
1787		1831
1788	try cs.writeAll("[");	1832	try cs.writeAll("[");
@@ -1806,7 +1850,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
1806	if (param.names.short != null or param.names.long != null)	1850	if (param.names.short != null or param.names.long != null)
1807	continue;	1851	continue;
1808		1852
1809	if (cos.bytes_written != 0)	1853	if (cos.codepoints_written != 0)
1810	try cs.writeAll(" ");	1854	try cs.writeAll(" ");
1811		1855
1812	try cs.writeAll("<");	1856	try cs.writeAll("<");


diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig new file mode 100644 index 0000000..e6b9d1c --- /dev/null +++ b/clap/codepoint_counting_writer.zig
@@ -0,0 +1,102 @@
		1	const std = @import("std");
		2	const builtin = @import("builtin");
		3	const native_endian = builtin.cpu.arch.endian();
		4
		5	/// A Writer that counts how many codepoints have been written to it.
		6	/// Expects valid UTF-8 input, and does not validate the input.
		7	pub fn CodepointCountingWriter(comptime WriterType: type) type {
		8	return struct {
		9	codepoints_written: u64,
		10	child_stream: WriterType,
		11
		12	pub const Error = WriterType.Error \|\| error{Utf8InvalidStartByte};
		13	pub const Writer = std.io.Writer(*Self, Error, write);
		14
		15	const Self = @This();
		16
		17	pub fn write(self: *Self, bytes: []const u8) Error!usize {
		18	const bytes_and_codepoints = try utf8CountCodepointsAllowTruncate(bytes);
		19	// Might not be the full input, so the leftover bytes are written on the next call.
		20	const bytes_to_write = bytes[0..bytes_and_codepoints.bytes];
		21	const amt = try self.child_stream.write(bytes_to_write);
		22	const bytes_written = bytes_to_write[0..amt];
		23	self.codepoints_written += (try utf8CountCodepointsAllowTruncate(bytes_written)).codepoints;
		24	return amt;
		25	}
		26
		27	pub fn writer(self: *Self) Writer {
		28	return .{ .context = self };
		29	}
		30	};
		31	}
		32
		33	// Like `std.unicode.utf8CountCodepoints`, but on truncated input, it returns
		34	// the number of codepoints up to that point.
		35	// Does not validate UTF-8 beyond checking the start byte.
		36	fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
		37	var len: usize = 0;
		38
		39	const N = @sizeOf(usize);
		40	const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
		41
		42	var i: usize = 0;
		43	while (i < s.len) {
		44	// Fast path for ASCII sequences
		45	while (i + N <= s.len) : (i += N) {
		46	const v = std.mem.readInt(usize, s[i..][0..N], native_endian);
		47	if (v & MASK != 0) break;
		48	len += N;
		49	}
		50
		51	if (i < s.len) {
		52	const n = try std.unicode.utf8ByteSequenceLength(s[i]);
		53	// Truncated input; return the current counts.
		54	if (i + n > s.len) return .{ .bytes = i, .codepoints = len };
		55
		56	i += n;
		57	len += 1;
		58	}
		59	}
		60
		61	return .{ .bytes = i, .codepoints = len };
		62	}
		63
		64	pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
		65	return .{ .codepoints_written = 0, .child_stream = child_stream };
		66	}
		67
		68	const testing = std.testing;
		69
		70	test CodepointCountingWriter {
		71	var counting_stream = codepointCountingWriter(std.io.null_writer);
		72	const stream = counting_stream.writer();
		73
		74	const utf8_text = "blåhaj" ** 100;
		75	stream.writeAll(utf8_text) catch unreachable;
		76	const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
		77	try testing.expectEqual(expected_count, counting_stream.codepoints_written);
		78	}
		79
		80	test "handles partial UTF-8 writes" {
		81	var buf: [100]u8 = undefined;
		82	var fbs = std.io.fixedBufferStream(&buf);
		83	var counting_stream = codepointCountingWriter(fbs.writer());
		84	const stream = counting_stream.writer();
		85
		86	const utf8_text = "ååå";
		87	// `å` is represented as `\xC3\xA5`, write 1.5 `å`s.
		88	var wc = try stream.write(utf8_text[0..3]);
		89	// One should have been written fully.
		90	try testing.expectEqual("å".len, wc);
		91	try testing.expectEqual(1, counting_stream.codepoints_written);
		92
		93	// Write the rest, continuing from the reported number of bytes written.
		94	wc = try stream.write(utf8_text[wc..]);
		95	try testing.expectEqual(4, wc);
		96	try testing.expectEqual(3, counting_stream.codepoints_written);
		97
		98	const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
		99	try testing.expectEqual(expected_count, counting_stream.codepoints_written);
		100
		101	try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
		102	}