count codepoints instead of bytes, to determine width

A complete solution would be to count grapheme clusters, but that would require adding a dependency on something like zg. Counting codepoints will ensure that typical non-ASCII text is supported, but you can still throw it off with more complex Unicode constructions, which might not be so useful in help text. Fixes #75
author: owl 2024-08-30 12:23:33 +0200
committer: Komari Spaghetti 2024-08-30 16:46:05 +0200
commit: 70bc70375f8e82843830d93631ab005302057a15 (patch)
tree: 0553ae38d11a3d52696c898fceafd312e7ff284a /clap
parent: feat: Allow for the assignment separator to be configured (diff)
download: zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.gz
zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.xz
zig-clap-70bc70375f8e82843830d93631ab005302057a15.zip
1 files changed, 102 insertions, 0 deletions
diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig
new file mode 100644
index 0000000..e6b9d1c
--- /dev/null
+++ b/clap/codepoint_counting_writer.zig
@@ -0,0 +1,102 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const native_endian = builtin.cpu.arch.endian();
+/// A Writer that counts how many codepoints have been written to it.
+/// Expects valid UTF-8 input; only the start byte of each sequence is checked.
+pub fn CodepointCountingWriter(comptime WriterType: type) type {
+    return struct {
+        codepoints_written: u64, // running total of complete codepoints accepted by `child_stream`
+        child_stream: WriterType, // underlying writer that receives the bytes
+        pub const Error = WriterType.Error || error{Utf8InvalidStartByte}; // child errors plus the start-byte check
+        pub const Writer = std.io.Writer(*Self, Error, write);
+        const Self = @This();
+        pub fn write(self: *Self, bytes: []const u8) Error!usize { // returns the child's byte count, not a codepoint count
+            const bytes_and_codepoints = try utf8CountCodepointsAllowTruncate(bytes);
+            // Only whole codepoints are forwarded; a truncated trailing sequence is left for the next call. NOTE(review): if `bytes` is only a truncated sequence, nothing is forwarded - confirm callers such as `writeAll` cope.
+            const bytes_to_write = bytes[0..bytes_and_codepoints.bytes];
+            const amt = try self.child_stream.write(bytes_to_write);
+            const bytes_written = bytes_to_write[0..amt]; // the child may accept fewer bytes than offered
+            self.codepoints_written += (try utf8CountCodepointsAllowTruncate(bytes_written)).codepoints; // count only what was accepted
+            return amt;
+        }
+        pub fn writer(self: *Self) Writer { // generic `std.io.Writer` view over this counter
+            return .{ .context = self };
+        }
+    };
+}
+// Like `std.unicode.utf8CountCodepoints`, but on truncated input it returns the
+// byte offset and codepoint count reached just before the incomplete sequence.
+// Does not validate UTF-8 beyond checking the start byte.
+fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
+    var len: usize = 0; // codepoints counted so far
+    const N = @sizeOf(usize);
+    const MASK = 0x80 * (std.math.maxInt(usize) / 0xff); // 0x80 repeated in every byte of a usize
+    var i: usize = 0; // byte offset into `s`
+    while (i < s.len) {
+        // Fast path for ASCII sequences: test N bytes at once for any set high bit.
+        while (i + N <= s.len) : (i += N) {
+            const v = std.mem.readInt(usize, s[i..][0..N], native_endian);
+            if (v & MASK != 0) break; // a non-ASCII byte is somewhere in this word
+            len += N; // N ASCII bytes are N codepoints
+        }
+        if (i < s.len) {
+            const n = try std.unicode.utf8ByteSequenceLength(s[i]); // sequence length implied by the start byte
+            // Truncated input; return the counts reached before this incomplete sequence.
+            if (i + n > s.len) return .{ .bytes = i, .codepoints = len };
+            i += n;
+            len += 1;
+        }
+    }
+    return .{ .bytes = i, .codepoints = len };
+}
+pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) { // wraps `child_stream` in a codepoint counter
+    return .{ .codepoints_written = 0, .child_stream = child_stream }; // the count starts at zero
+}
+const testing = std.testing;
+test CodepointCountingWriter {
+    var counting_stream = codepointCountingWriter(std.io.null_writer); // only the count is inspected, not the output
+    const stream = counting_stream.writer();
+    const utf8_text = "blåhaj" ** 100; // mixes ASCII with a two-byte codepoint
+    stream.writeAll(utf8_text) catch unreachable;
+    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text); // reference count from the standard library
+    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
+}
+test "handles partial UTF-8 writes" {
+    var buf: [100]u8 = undefined;
+    var fbs = std.io.fixedBufferStream(&buf);
+    var counting_stream = codepointCountingWriter(fbs.writer());
+    const stream = counting_stream.writer();
+    const utf8_text = "ååå";
+    // `å` is represented as `\xC3\xA5`, write 1.5 `å`s.
+    var wc = try stream.write(utf8_text[0..3]);
+    // Only the one complete `å` should have been written; the dangling start byte is held back.
+    try testing.expectEqual("å".len, wc);
+    try testing.expectEqual(1, counting_stream.codepoints_written);
+    // Write the rest, continuing from the reported number of bytes written.
+    wc = try stream.write(utf8_text[wc..]);
+    try testing.expectEqual(4, wc); // the remaining two `å`s, two bytes each
+    try testing.expectEqual(3, counting_stream.codepoints_written);
+    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
+    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
+    try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten()); // the child received every byte, in order
+}
author	owl	2024-08-30 12:23:33 +0200
committer	Komari Spaghetti	2024-08-30 16:46:05 +0200
commit	70bc70375f8e82843830d93631ab005302057a15 (patch)
tree	0553ae38d11a3d52696c898fceafd312e7ff284a /clap
parent	feat: Allow for the assignment separator to be configured (diff)
download	zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.gz zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.xz zig-clap-70bc70375f8e82843830d93631ab005302057a15.zip

diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig new file mode 100644 index 0000000..e6b9d1c --- /dev/null +++ b/clap/codepoint_counting_writer.zig
@@ -0,0 +1,102 @@
	1	const std = @import("std");
	2	const builtin = @import("builtin");
	3	const native_endian = builtin.cpu.arch.endian();
	4
	5	/// A Writer that counts how many codepoints have been written to it.
	6	/// Expects valid UTF-8 input, and does not validate the input.
	7	pub fn CodepointCountingWriter(comptime WriterType: type) type {
	8	return struct {
	9	codepoints_written: u64,
	10	child_stream: WriterType,
	11
	12	pub const Error = WriterType.Error || error{Utf8InvalidStartByte};
	13	pub const Writer = std.io.Writer(*Self, Error, write);
	14
	15	const Self = @This();
	16
	17	pub fn write(self: *Self, bytes: []const u8) Error!usize {
	18	const bytes_and_codepoints = try utf8CountCodepointsAllowTruncate(bytes);
	19	// Might not be the full input, so the leftover bytes are written on the next call.
	20	const bytes_to_write = bytes[0..bytes_and_codepoints.bytes];
	21	const amt = try self.child_stream.write(bytes_to_write);
	22	const bytes_written = bytes_to_write[0..amt];
	23	self.codepoints_written += (try utf8CountCodepointsAllowTruncate(bytes_written)).codepoints;
	24	return amt;
	25	}
	26
	27	pub fn writer(self: *Self) Writer {
	28	return .{ .context = self };
	29	}
	30	};
	31	}
	32
	33	// Like `std.unicode.utf8CountCodepoints`, but on truncated input, it returns
	34	// the number of codepoints up to that point.
	35	// Does not validate UTF-8 beyond checking the start byte.
	36	fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
	37	var len: usize = 0;
	38
	39	const N = @sizeOf(usize);
	40	const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
	41
	42	var i: usize = 0;
	43	while (i < s.len) {
	44	// Fast path for ASCII sequences
	45	while (i + N <= s.len) : (i += N) {
	46	const v = std.mem.readInt(usize, s[i..][0..N], native_endian);
	47	if (v & MASK != 0) break;
	48	len += N;
	49	}
	50
	51	if (i < s.len) {
	52	const n = try std.unicode.utf8ByteSequenceLength(s[i]);
	53	// Truncated input; return the current counts.
	54	if (i + n > s.len) return .{ .bytes = i, .codepoints = len };
	55
	56	i += n;
	57	len += 1;
	58	}
	59	}
	60
	61	return .{ .bytes = i, .codepoints = len };
	62	}
	63
	64	pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
	65	return .{ .codepoints_written = 0, .child_stream = child_stream };
	66	}
	67
	68	const testing = std.testing;
	69
	70	test CodepointCountingWriter {
	71	var counting_stream = codepointCountingWriter(std.io.null_writer);
	72	const stream = counting_stream.writer();
	73
	74	const utf8_text = "blåhaj" ** 100;
	75	stream.writeAll(utf8_text) catch unreachable;
	76	const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
	77	try testing.expectEqual(expected_count, counting_stream.codepoints_written);
	78	}
	79
	80	test "handles partial UTF-8 writes" {
	81	var buf: [100]u8 = undefined;
	82	var fbs = std.io.fixedBufferStream(&buf);
	83	var counting_stream = codepointCountingWriter(fbs.writer());
	84	const stream = counting_stream.writer();
	85
	86	const utf8_text = "ååå";
	87	// `å` is represented as `\xC3\xA5`, write 1.5 `å`s.
	88	var wc = try stream.write(utf8_text[0..3]);
	89	// One should have been written fully.
	90	try testing.expectEqual("å".len, wc);
	91	try testing.expectEqual(1, counting_stream.codepoints_written);
	92
	93	// Write the rest, continuing from the reported number of bytes written.
	94	wc = try stream.write(utf8_text[wc..]);
	95	try testing.expectEqual(4, wc);
	96	try testing.expectEqual(3, counting_stream.codepoints_written);
	97
	98	const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
	99	try testing.expectEqual(expected_count, counting_stream.codepoints_written);
	100
	101	try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
	102	}