diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 16 | ||||
| -rw-r--r-- | src/code_point.zig | 68 | ||||
| -rw-r--r-- | src/display_width.zig | 7 |
3 files changed, 47 insertions, 44 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index f013aba..6981753 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -1,7 +1,6 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const unicode = std.unicode; | 2 | const unicode = std.unicode; |
| 3 | 3 | ||
| 4 | const CodePoint = @import("code_point").CodePoint; | ||
| 5 | const CodePointIterator = @import("code_point").Iterator; | 4 | const CodePointIterator = @import("code_point").Iterator; |
| 6 | const gbp = @import("gbp"); | 5 | const gbp = @import("gbp"); |
| 7 | 6 | ||
| @@ -17,6 +16,13 @@ pub const Grapheme = struct { | |||
| 17 | } | 16 | } |
| 18 | }; | 17 | }; |
| 19 | 18 | ||
| 19 | // We need the code as a u21. | ||
| 20 | const CodePoint = struct { | ||
| 21 | code: u21, | ||
| 22 | len: u3, | ||
| 23 | offset: u32, | ||
| 24 | }; | ||
| 25 | |||
| 20 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. | 26 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. |
| 21 | pub const Iterator = struct { | 27 | pub const Iterator = struct { |
| 22 | buf: [2]?CodePoint = .{ null, null }, | 28 | buf: [2]?CodePoint = .{ null, null }, |
| @@ -33,7 +39,13 @@ pub const Iterator = struct { | |||
| 33 | 39 | ||
| 34 | fn advance(self: *Self) void { | 40 | fn advance(self: *Self) void { |
| 35 | self.buf[0] = self.buf[1]; | 41 | self.buf[0] = self.buf[1]; |
| 36 | self.buf[1] = self.cp_iter.next(); | 42 | |
| 43 | const maybe_cp = self.cp_iter.next(); | ||
| 44 | self.buf[1] = if (maybe_cp) |cp| .{ | ||
| 45 | .code = cp.code(self.cp_iter.bytes), | ||
| 46 | .len = cp.len, | ||
| 47 | .offset = cp.offset, | ||
| 48 | } else null; | ||
| 37 | } | 49 | } |
| 38 | 50 | ||
| 39 | pub fn next(self: *Self) ?Grapheme { | 51 | pub fn next(self: *Self) ?Grapheme { |
diff --git a/src/code_point.zig b/src/code_point.zig index ac37562..098e635 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -3,9 +3,29 @@ const std = @import("std"); | |||
| 3 | /// `CodePoint` represents a Unicode code point by its code, | 3 | /// `CodePoint` represents a Unicode code point by its code, |
| 4 | /// length, and offset in the source bytes. | 4 | /// length, and offset in the source bytes. |
| 5 | pub const CodePoint = struct { | 5 | pub const CodePoint = struct { |
| 6 | code: u21, | ||
| 7 | len: u3, | 6 | len: u3, |
| 8 | offset: u32, | 7 | offset: u32, |
| 8 | |||
| 9 | pub fn code(self: CodePoint, src: []const u8) u21 { | ||
| 10 | const cp_bytes = src[self.offset..][0..self.len]; | ||
| 11 | |||
| 12 | return switch (self.len) { | ||
| 13 | 1 => cp_bytes[0], | ||
| 14 | |||
| 15 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), | ||
| 16 | |||
| 17 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | | ||
| 18 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 19 | (cp_bytes[2] & 0b00111111), | ||
| 20 | |||
| 21 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | | ||
| 22 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 23 | (cp_bytes[2] & 0b00111111)) << 6) | | ||
| 24 | (cp_bytes[3] & 0b00111111), | ||
| 25 | |||
| 26 | else => @panic("code_point.CodePoint.code: Invalid code point length."), | ||
| 27 | }; | ||
| 28 | } | ||
| 9 | }; | 29 | }; |
| 10 | 30 | ||
| 11 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | 31 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| @@ -19,51 +39,20 @@ pub const Iterator = struct { | |||
| 19 | if (self.bytes[self.i] < 128) { | 39 | if (self.bytes[self.i] < 128) { |
| 20 | // ASCII fast path | 40 | // ASCII fast path |
| 21 | defer self.i += 1; | 41 | defer self.i += 1; |
| 22 | 42 | return .{ .len = 1, .offset = self.i }; | |
| 23 | return .{ | ||
| 24 | .code = self.bytes[self.i], | ||
| 25 | .len = 1, | ||
| 26 | .offset = self.i, | ||
| 27 | }; | ||
| 28 | } | 43 | } |
| 29 | 44 | ||
| 30 | var cp = CodePoint{ | 45 | const cp = CodePoint{ |
| 31 | .code = undefined, | ||
| 32 | .len = switch (self.bytes[self.i]) { | 46 | .len = switch (self.bytes[self.i]) { |
| 33 | 0b1100_0000...0b1101_1111 => 2, | 47 | 0b1100_0000...0b1101_1111 => 2, |
| 34 | 0b1110_0000...0b1110_1111 => 3, | 48 | 0b1110_0000...0b1110_1111 => 3, |
| 35 | 0b1111_0000...0b1111_0111 => 4, | 49 | 0b1111_0000...0b1111_0111 => 4, |
| 36 | else => { | 50 | else => @panic("code_point.Iterator.next: Invalid start byte."), |
| 37 | defer self.i += 1; | ||
| 38 | // Unicode replacement code point. | ||
| 39 | return .{ | ||
| 40 | .code = 0xfffd, | ||
| 41 | .len = 1, | ||
| 42 | .offset = self.i, | ||
| 43 | }; | ||
| 44 | }, | ||
| 45 | }, | 51 | }, |
| 46 | .offset = self.i, | 52 | .offset = self.i, |
| 47 | }; | 53 | }; |
| 48 | 54 | ||
| 49 | const cp_bytes = self.bytes[self.i..][0..cp.len]; | ||
| 50 | self.i += cp.len; | 55 | self.i += cp.len; |
| 51 | |||
| 52 | cp.code = switch (cp.len) { | ||
| 53 | 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), | ||
| 54 | |||
| 55 | 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | | ||
| 56 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 57 | (cp_bytes[2] & 0b00111111), | ||
| 58 | |||
| 59 | 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | | ||
| 60 | (cp_bytes[1] & 0b00111111)) << 6) | | ||
| 61 | (cp_bytes[2] & 0b00111111)) << 6) | | ||
| 62 | (cp_bytes[3] & 0b00111111), | ||
| 63 | |||
| 64 | else => @panic("CodePointIterator.next invalid code point length."), | ||
| 65 | }; | ||
| 66 | |||
| 67 | return cp; | 56 | return cp; |
| 68 | } | 57 | } |
| 69 | 58 | ||
| @@ -75,11 +64,12 @@ pub const Iterator = struct { | |||
| 75 | }; | 64 | }; |
| 76 | 65 | ||
| 77 | test "peek" { | 66 | test "peek" { |
| 78 | var iter = Iterator{ .bytes = "Hi" }; | 67 | const src = "Hi"; |
| 68 | var iter = Iterator{ .bytes = src }; | ||
| 79 | 69 | ||
| 80 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); | 70 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src)); |
| 81 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); | 71 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src)); |
| 82 | try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); | 72 | try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src)); |
| 83 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); | 73 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); |
| 84 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); | 74 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); |
| 85 | } | 75 | } |
diff --git a/src/display_width.zig b/src/display_width.zig index e52da38..7f39566 100644 --- a/src/display_width.zig +++ b/src/display_width.zig | |||
| @@ -52,17 +52,18 @@ pub fn strWidth(str: []const u8) usize { | |||
| 52 | var giter = GraphemeIterator.init(str); | 52 | var giter = GraphemeIterator.init(str); |
| 53 | 53 | ||
| 54 | while (giter.next()) |gc| { | 54 | while (giter.next()) |gc| { |
| 55 | var cp_iter = CodePointIterator{ .bytes = str[gc.offset..][0..gc.len] }; | 55 | const gc_bytes = gc.bytes(str); |
| 56 | var cp_iter = CodePointIterator{ .bytes = gc_bytes }; | ||
| 56 | var gc_total: isize = 0; | 57 | var gc_total: isize = 0; |
| 57 | 58 | ||
| 58 | while (cp_iter.next()) |cp| { | 59 | while (cp_iter.next()) |cp| { |
| 59 | var w = codePointWidth(cp.code); | 60 | var w = codePointWidth(cp.code(gc_bytes)); |
| 60 | 61 | ||
| 61 | if (w != 0) { | 62 | if (w != 0) { |
| 62 | // Handle text emoji sequence. | 63 | // Handle text emoji sequence. |
| 63 | if (cp_iter.next()) |ncp| { | 64 | if (cp_iter.next()) |ncp| { |
| 64 | // emoji text sequence. | 65 | // emoji text sequence. |
| 65 | if (ncp.code == 0xFE0E) w = 1; | 66 | if (ncp.code(gc_bytes) == 0xFE0E) w = 1; |
| 66 | } | 67 | } |
| 67 | 68 | ||
| 68 | // Only adding width of first non-zero-width code point. | 69 | // Only adding width of first non-zero-width code point. |