author    owl 2024-08-30 12:23:33 +0200
committer Komari Spaghetti 2024-08-30 16:46:05 +0200
commit    70bc70375f8e82843830d93631ab005302057a15 (patch)
tree      0553ae38d11a3d52696c898fceafd312e7ff284a /clap
parent    feat: Allow for the assignment separator to be configured (diff)
count codepoints instead of bytes, to determine width
A complete solution would be to count grapheme clusters, but that would require adding a dependency on something like zg. Counting codepoints ensures that typical non-ASCII text is supported; more elaborate Unicode constructions can still throw the count off, but those are of little use in help text anyway.

Fixes #75
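For illustration only (a minimal standalone sketch, not part of the diff): the byte length of UTF-8 text containing non-ASCII characters overstates its displayed width, while the codepoint count matches it for typical text.

const std = @import("std");

test "byte length vs codepoint count" {
    const text = "blåhaj"; // 6 codepoints; `å` takes two bytes in UTF-8
    try std.testing.expectEqual(@as(usize, 7), text.len); // byte count
    try std.testing.expectEqual(@as(usize, 6), try std.unicode.utf8CountCodepoints(text)); // what the new writer counts
}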
Diffstat (limited to 'clap')
-rw-r--r--  clap/codepoint_counting_writer.zig | 102
1 file changed, 102 insertions(+), 0 deletions(-)
diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig
new file mode 100644
index 0000000..e6b9d1c
--- /dev/null
+++ b/clap/codepoint_counting_writer.zig
@@ -0,0 +1,102 @@
const std = @import("std");
const builtin = @import("builtin");
const native_endian = builtin.cpu.arch.endian();

/// A Writer that counts how many codepoints have been written to it.
/// Expects valid UTF-8 input, and does not validate the input.
pub fn CodepointCountingWriter(comptime WriterType: type) type {
    return struct {
        codepoints_written: u64,
        child_stream: WriterType,

        pub const Error = WriterType.Error || error{Utf8InvalidStartByte};
        pub const Writer = std.io.Writer(*Self, Error, write);

        const Self = @This();

        pub fn write(self: *Self, bytes: []const u8) Error!usize {
            const bytes_and_codepoints = try utf8CountCodepointsAllowTruncate(bytes);
            // Might not be the full input, so the leftover bytes are written on the next call.
            const bytes_to_write = bytes[0..bytes_and_codepoints.bytes];
            const amt = try self.child_stream.write(bytes_to_write);
            const bytes_written = bytes_to_write[0..amt];
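            // Count only the codepoints the child stream actually accepted; it may
            // perform a short write and return fewer bytes than it was given.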
            self.codepoints_written += (try utf8CountCodepointsAllowTruncate(bytes_written)).codepoints;
            return amt;
        }

        pub fn writer(self: *Self) Writer {
            return .{ .context = self };
        }
    };
}

// Like `std.unicode.utf8CountCodepoints`, but on truncated input, it returns
// the number of codepoints up to that point.
// Does not validate UTF-8 beyond checking the start byte.
fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
    var len: usize = 0;

    const N = @sizeOf(usize);
    const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
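    // MASK has the high bit of every byte set (0x8080...80), so `v & MASK != 0`
    // below means the word contains at least one non-ASCII byte, ending the fast path.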

    var i: usize = 0;
    while (i < s.len) {
        // Fast path for ASCII sequences
        while (i + N <= s.len) : (i += N) {
            const v = std.mem.readInt(usize, s[i..][0..N], native_endian);
            if (v & MASK != 0) break;
            len += N;
        }

        if (i < s.len) {
            const n = try std.unicode.utf8ByteSequenceLength(s[i]);
            // Truncated input; return the current counts.
            if (i + n > s.len) return .{ .bytes = i, .codepoints = len };

            i += n;
            len += 1;
        }
    }

    return .{ .bytes = i, .codepoints = len };
}

pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
    return .{ .codepoints_written = 0, .child_stream = child_stream };
}

const testing = std.testing;

test CodepointCountingWriter {
    var counting_stream = codepointCountingWriter(std.io.null_writer);
    const stream = counting_stream.writer();

    const utf8_text = "blåhaj" ** 100;
    stream.writeAll(utf8_text) catch unreachable;
    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
}

test "handles partial UTF-8 writes" {
    var buf: [100]u8 = undefined;
    var fbs = std.io.fixedBufferStream(&buf);
    var counting_stream = codepointCountingWriter(fbs.writer());
    const stream = counting_stream.writer();

    const utf8_text = "ååå";
    // `å` is encoded as `\xC3\xA5` in UTF-8, so this writes 1.5 `å`s.
    var wc = try stream.write(utf8_text[0..3]);
    // Only the complete first codepoint should have been written.
    try testing.expectEqual("å".len, wc);
    try testing.expectEqual(1, counting_stream.codepoints_written);

    // Write the rest, continuing from the reported number of bytes written.
    wc = try stream.write(utf8_text[wc..]);
    try testing.expectEqual(4, wc);
    try testing.expectEqual(3, counting_stream.codepoints_written);

    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
    try testing.expectEqual(expected_count, counting_stream.codepoints_written);

    try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
}
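
A rough sketch of how such a writer might be used to measure the rendered width of a help-text fragment (hypothetical usage, not part of this commit; the import path and function name below are assumptions):

const std = @import("std");
const ccw = @import("codepoint_counting_writer.zig"); // assumed relative path

// Writes a label and reports how many columns it occupied, approximating
// width by codepoints rather than bytes.
fn writeLabelAndMeasure(out: anytype, label: []const u8) !u64 {
    var counting = ccw.codepointCountingWriter(out);
    try counting.writer().writeAll(label);
    return counting.codepoints_written;
}

test writeLabelAndMeasure {
    const width = try writeLabelAndMeasure(std.io.null_writer, "--blåhaj <STR>");
    try std.testing.expectEqual(@as(u64, 14), width); // 14 codepoints, 15 bytes
}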