summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-18 09:20:19 -0400
committerGravatar Jose Colon Rodriguez2024-02-18 09:20:19 -0400
commitf913551d27e07f0a7c7e201ba3141fd3a6cbb47c (patch)
tree74bfe0fb0aa98d053b5ab76beec6fdc733026017
parentCode point and grapheme are now namespaces. (diff)
downloadzg-f913551d27e07f0a7c7e201ba3141fd3a6cbb47c.tar.gz
zg-f913551d27e07f0a7c7e201ba3141fd3a6cbb47c.tar.xz
zg-f913551d27e07f0a7c7e201ba3141fd3a6cbb47c.zip
Code point code is now a method not a field.
-rw-r--r--src/Grapheme.zig16
-rw-r--r--src/code_point.zig68
-rw-r--r--src/display_width.zig7
3 files changed, 47 insertions, 44 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index f013aba..6981753 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,7 +1,6 @@
1const std = @import("std"); 1const std = @import("std");
2const unicode = std.unicode; 2const unicode = std.unicode;
3 3
4const CodePoint = @import("code_point").CodePoint;
5const CodePointIterator = @import("code_point").Iterator; 4const CodePointIterator = @import("code_point").Iterator;
6const gbp = @import("gbp"); 5const gbp = @import("gbp");
7 6
@@ -17,6 +16,13 @@ pub const Grapheme = struct {
17 } 16 }
18}; 17};
19 18
19// We need the code as a u21.
20const CodePoint = struct {
21 code: u21,
22 len: u3,
23 offset: u32,
24};
25
20/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. 26/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
21pub const Iterator = struct { 27pub const Iterator = struct {
22 buf: [2]?CodePoint = .{ null, null }, 28 buf: [2]?CodePoint = .{ null, null },
@@ -33,7 +39,13 @@ pub const Iterator = struct {
33 39
34 fn advance(self: *Self) void { 40 fn advance(self: *Self) void {
35 self.buf[0] = self.buf[1]; 41 self.buf[0] = self.buf[1];
36 self.buf[1] = self.cp_iter.next(); 42
43 const maybe_cp = self.cp_iter.next();
44 self.buf[1] = if (maybe_cp) |cp| .{
45 .code = cp.code(self.cp_iter.bytes),
46 .len = cp.len,
47 .offset = cp.offset,
48 } else null;
37 } 49 }
38 50
39 pub fn next(self: *Self) ?Grapheme { 51 pub fn next(self: *Self) ?Grapheme {
diff --git a/src/code_point.zig b/src/code_point.zig
index ac37562..098e635 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -3,9 +3,29 @@ const std = @import("std");
3/// `CodePoint` represents a Unicode code point by its code, 3/// `CodePoint` represents a Unicode code point by its code,
4/// length, and offset in the source bytes. 4/// length, and offset in the source bytes.
5pub const CodePoint = struct { 5pub const CodePoint = struct {
6 code: u21,
7 len: u3, 6 len: u3,
8 offset: u32, 7 offset: u32,
8
9 pub fn code(self: CodePoint, src: []const u8) u21 {
10 const cp_bytes = src[self.offset..][0..self.len];
11
12 return switch (self.len) {
13 1 => cp_bytes[0],
14
15 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
16
17 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
18 (cp_bytes[1] & 0b00111111)) << 6) |
19 (cp_bytes[2] & 0b00111111),
20
21 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
22 (cp_bytes[1] & 0b00111111)) << 6) |
23 (cp_bytes[2] & 0b00111111)) << 6) |
24 (cp_bytes[3] & 0b00111111),
25
26 else => @panic("code_point.CodePoint.code: Invalid code point length."),
27 };
28 }
9}; 29};
10 30
11/// `Iterator` iterates a string one `CodePoint` at-a-time. 31/// `Iterator` iterates a string one `CodePoint` at-a-time.
@@ -19,51 +39,20 @@ pub const Iterator = struct {
19 if (self.bytes[self.i] < 128) { 39 if (self.bytes[self.i] < 128) {
20 // ASCII fast path 40 // ASCII fast path
21 defer self.i += 1; 41 defer self.i += 1;
22 42 return .{ .len = 1, .offset = self.i };
23 return .{
24 .code = self.bytes[self.i],
25 .len = 1,
26 .offset = self.i,
27 };
28 } 43 }
29 44
30 var cp = CodePoint{ 45 const cp = CodePoint{
31 .code = undefined,
32 .len = switch (self.bytes[self.i]) { 46 .len = switch (self.bytes[self.i]) {
33 0b1100_0000...0b1101_1111 => 2, 47 0b1100_0000...0b1101_1111 => 2,
34 0b1110_0000...0b1110_1111 => 3, 48 0b1110_0000...0b1110_1111 => 3,
35 0b1111_0000...0b1111_0111 => 4, 49 0b1111_0000...0b1111_0111 => 4,
36 else => { 50 else => @panic("code_point.Iterator.next: Invalid start byte."),
37 defer self.i += 1;
38 // Unicode replacement code point.
39 return .{
40 .code = 0xfffd,
41 .len = 1,
42 .offset = self.i,
43 };
44 },
45 }, 51 },
46 .offset = self.i, 52 .offset = self.i,
47 }; 53 };
48 54
49 const cp_bytes = self.bytes[self.i..][0..cp.len];
50 self.i += cp.len; 55 self.i += cp.len;
51
52 cp.code = switch (cp.len) {
53 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
54
55 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
56 (cp_bytes[1] & 0b00111111)) << 6) |
57 (cp_bytes[2] & 0b00111111),
58
59 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
60 (cp_bytes[1] & 0b00111111)) << 6) |
61 (cp_bytes[2] & 0b00111111)) << 6) |
62 (cp_bytes[3] & 0b00111111),
63
64 else => @panic("CodePointIterator.next invalid code point length."),
65 };
66
67 return cp; 56 return cp;
68 } 57 }
69 58
@@ -75,11 +64,12 @@ pub const Iterator = struct {
75}; 64};
76 65
77test "peek" { 66test "peek" {
78 var iter = Iterator{ .bytes = "Hi" }; 67 const src = "Hi";
68 var iter = Iterator{ .bytes = src };
79 69
80 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); 70 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code(src));
81 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); 71 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code(src));
82 try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); 72 try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code(src));
83 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); 73 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
84 try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); 74 try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
85} 75}
diff --git a/src/display_width.zig b/src/display_width.zig
index e52da38..7f39566 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -52,17 +52,18 @@ pub fn strWidth(str: []const u8) usize {
52 var giter = GraphemeIterator.init(str); 52 var giter = GraphemeIterator.init(str);
53 53
54 while (giter.next()) |gc| { 54 while (giter.next()) |gc| {
55 var cp_iter = CodePointIterator{ .bytes = str[gc.offset..][0..gc.len] }; 55 const gc_bytes = gc.bytes(str);
56 var cp_iter = CodePointIterator{ .bytes = gc_bytes };
56 var gc_total: isize = 0; 57 var gc_total: isize = 0;
57 58
58 while (cp_iter.next()) |cp| { 59 while (cp_iter.next()) |cp| {
59 var w = codePointWidth(cp.code); 60 var w = codePointWidth(cp.code(gc_bytes));
60 61
61 if (w != 0) { 62 if (w != 0) {
62 // Handle text emoji sequence. 63 // Handle text emoji sequence.
63 if (cp_iter.next()) |ncp| { 64 if (cp_iter.next()) |ncp| {
64 // emoji text sequence. 65 // emoji text sequence.
65 if (ncp.code == 0xFE0E) w = 1; 66 if (ncp.code(gc_bytes) == 0xFE0E) w = 1;
66 } 67 }
67 68
68 // Only adding width of first non-zero-width code point. 69 // Only adding width of first non-zero-width code point.