diff options
| author | 2024-08-30 12:23:33 +0200 | |
|---|---|---|
| committer | 2024-08-30 16:46:05 +0200 | |
| commit | 70bc70375f8e82843830d93631ab005302057a15 (patch) | |
| tree | 0553ae38d11a3d52696c898fceafd312e7ff284a /clap | |
| parent | feat: Allow for the assignment separator to be configured (diff) | |
| download | zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.gz zig-clap-70bc70375f8e82843830d93631ab005302057a15.tar.xz zig-clap-70bc70375f8e82843830d93631ab005302057a15.zip | |
count codepoints instead of bytes, to determine width
A complete solution would be to count grapheme clusters, but that would
require adding a dependency on something like zg.
Counting codepoints will ensure that typical non-ASCII text is
supported, but you can still throw it off with more complex Unicode
constructions, which might not be so useful in help text.
Fixes #75
Diffstat (limited to 'clap')
| -rw-r--r-- | clap/codepoint_counting_writer.zig | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/clap/codepoint_counting_writer.zig b/clap/codepoint_counting_writer.zig new file mode 100644 index 0000000..e6b9d1c --- /dev/null +++ b/clap/codepoint_counting_writer.zig | |||
| @@ -0,0 +1,102 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const native_endian = builtin.cpu.arch.endian(); | ||
| 4 | |||
/// A Writer that counts how many codepoints have been written to it.
/// Expects valid UTF-8 input. The input is not validated beyond the start byte
/// of each sequence; an invalid start byte yields `error.Utf8InvalidStartByte`.
pub fn CodepointCountingWriter(comptime WriterType: type) type {
    return struct {
        /// Number of complete codepoints forwarded to `child_stream` so far.
        codepoints_written: u64,
        /// The wrapped writer that receives the bytes.
        child_stream: WriterType,

        pub const Error = WriterType.Error || error{Utf8InvalidStartByte};
        pub const Writer = std.io.Writer(*Self, Error, write);

        const Self = @This();

        /// Forwards the leading run of complete codepoints in `bytes` to the
        /// child stream and returns how many bytes were consumed.
        ///
        /// A sequence that is cut off at the end of `bytes` is not forwarded;
        /// the caller is expected to pass it again, completed, on a later call.
        /// If `bytes` holds nothing but such an incomplete sequence, only an
        /// empty slice is handed to the child, so callers that loop until
        /// everything is consumed will not make progress on that input.
        ///
        /// The returned count always ends on a codepoint boundary, even when
        /// the child stream performs a short write.
        pub fn write(self: *Self, bytes: []const u8) Error!usize {
            const complete = try utf8CountCodepointsAllowTruncate(bytes);
            // Might not be the full input, so the leftover bytes are written on the next call.
            const bytes_to_write = bytes[0..complete.bytes];
            var amt = try self.child_stream.write(bytes_to_write);

            var counted = try utf8CountCodepointsAllowTruncate(bytes_to_write[0..amt]);
            if (counted.bytes < amt) {
                // The child accepted only part of a multi-byte sequence. If we
                // returned now, the next call would begin on a continuation
                // byte and be rejected, so push the rest of this sequence
                // through first. `bytes_to_write` holds only whole sequences,
                // so `seq_end` never exceeds its length.
                const seq_len = try std.unicode.utf8ByteSequenceLength(bytes_to_write[counted.bytes]);
                const seq_end = counted.bytes + seq_len;
                while (amt < seq_end) {
                    amt += try self.child_stream.write(bytes_to_write[amt..seq_end]);
                }
                counted.codepoints += 1;
            }

            self.codepoints_written += counted.codepoints;
            return amt;
        }

        /// Returns a `std.io` writer interface backed by this counter.
        pub fn writer(self: *Self) Writer {
            return .{ .context = self };
        }
    };
}
| 32 | |||
/// Like `std.unicode.utf8CountCodepoints`, but tolerant of input that ends in
/// the middle of a sequence: counting stops there and the result reports how
/// many bytes (`bytes`) and codepoints (`codepoints`) precede the cut.
/// Only the start byte of each sequence is checked; an invalid one is returned
/// as an error. Continuation bytes are not validated.
fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
    const word_size = @sizeOf(usize);
    // 0x80 replicated into every byte of a usize; a set bit marks a non-ASCII byte.
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var offset: usize = 0;
    var count: usize = 0;

    while (offset < s.len) {
        // Fast path: swallow whole machine words as long as they are pure ASCII.
        while (s.len - offset >= word_size) {
            const word = std.mem.readInt(usize, s[offset..][0..word_size], native_endian);
            if (word & high_bits != 0) break;
            offset += word_size;
            count += word_size;
        }
        if (offset == s.len) break;

        // Slow path: step over a single sequence based on its start byte.
        const seq_len = try std.unicode.utf8ByteSequenceLength(s[offset]);
        // Truncated input; report what has been counted so far.
        if (seq_len > s.len - offset) break;

        offset += seq_len;
        count += 1;
    }

    return .{ .bytes = offset, .codepoints = count };
}
| 63 | |||
/// Wraps `child_stream` in a `CodepointCountingWriter` whose codepoint count
/// starts at zero.
pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
    const Counting = CodepointCountingWriter(@TypeOf(child_stream));
    return Counting{
        .child_stream = child_stream,
        .codepoints_written = 0,
    };
}
| 67 | |||
| 68 | const testing = std.testing; | ||
| 69 | |||
test CodepointCountingWriter {
    var counting_stream = codepointCountingWriter(std.io.null_writer);
    const stream = counting_stream.writer();

    // Mixes ASCII with a two-byte sequence (`å`), so byte and codepoint counts differ.
    const utf8_text = "blåhaj" ** 100;
    // Propagate a failure as a test error instead of asserting it cannot happen.
    try stream.writeAll(utf8_text);
    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
    try testing.expectEqual(expected_count, counting_stream.codepoints_written);
}
| 79 | |||
test "handles partial UTF-8 writes" {
    var buf: [100]u8 = undefined;
    var fbs = std.io.fixedBufferStream(&buf);
    var counting_stream = codepointCountingWriter(fbs.writer());
    const stream = counting_stream.writer();

    const utf8_text = "ååå";
    // `å` (U+00E5) is encoded as `\xC3\xA5`, so three bytes are 1.5 `å`s.
    const first = try stream.write(utf8_text[0..3]);
    // Only the complete codepoint is accepted; the dangling start byte is held back.
    try testing.expectEqual("å".len, first);
    try testing.expectEqual(1, counting_stream.codepoints_written);
    try testing.expectEqualSlices(u8, "å", fbs.getWritten());

    // Write the rest, continuing from the reported number of bytes written.
    const second = try stream.write(utf8_text[first..]);
    try testing.expectEqual(utf8_text.len - first, second);
    try testing.expectEqual(3, counting_stream.codepoints_written);

    const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
    try testing.expectEqual(expected_count, counting_stream.codepoints_written);

    try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
}