summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-18 08:48:03 -0400
committerGravatar Jose Colon Rodriguez2024-02-18 08:48:03 -0400
commit1404c85f513a88bbd399ab9f3453da71e7478727 (patch)
tree0080678ceac38f223910d60bf650ebaddf27b0f9 /src
parentFixed isAsciiOnly and CodePointIterator ASCII bugs (diff)
downloadzg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.gz
zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.xz
zg-1404c85f513a88bbd399ab9f3453da71e7478727.zip
Code point and grapheme are now namespaces.
Diffstat (limited to 'src')
-rw-r--r--src/Grapheme.zig67
-rw-r--r--src/code_point.zig (renamed from src/CodePoint.zig)39
-rw-r--r--src/display_width.zig4
3 files changed, 54 insertions, 56 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 910aec5..f013aba 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,30 +1,25 @@
1//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
2
3const std = @import("std"); 1const std = @import("std");
4const unicode = std.unicode; 2const unicode = std.unicode;
5 3
6const CodePoint = @import("CodePoint"); 4const CodePoint = @import("code_point").CodePoint;
7const CodePointIterator = CodePoint.CodePointIterator; 5const CodePointIterator = @import("code_point").Iterator;
8const gbp = @import("gbp"); 6const gbp = @import("gbp");
9 7
10pub const Grapheme = @This(); 8/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
11 9pub const Grapheme = struct {
12len: usize, 10 len: u8,
13offset: usize, 11 offset: u32,
14 12
15/// `eql` compares `str` with the bytes of this grapheme cluster in `src` for equality. 13 /// `bytes` returns the slice of bytes that correspond to
16pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool { 14 /// this grapheme cluster in `src`.
17 return std.mem.eql(u8, src[self.offset .. self.offset + self.len], other); 15 pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
18} 16 return src[self.offset..][0..self.len];
19 17 }
20/// `slice` returns the bytes that correspond to this grapheme cluster in `src`. 18};
21pub fn slice(self: Grapheme, src: []const u8) []const u8 {
22 return src[self.offset .. self.offset + self.len];
23}
24 19
25/// `GraphemeIterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. 20/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
26pub const GraphemeIterator = struct { 21pub const Iterator = struct {
27 buf: [2]?CodePoint = [_]?CodePoint{ null, null }, 22 buf: [2]?CodePoint = .{ null, null },
28 cp_iter: CodePointIterator, 23 cp_iter: CodePointIterator,
29 24
30 const Self = @This(); 25 const Self = @This();
@@ -32,8 +27,7 @@ pub const GraphemeIterator = struct {
32 /// Assumes `src` is valid UTF-8. 27 /// Assumes `src` is valid UTF-8.
33 pub fn init(str: []const u8) Self { 28 pub fn init(str: []const u8) Self {
34 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; 29 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
35 self.buf[1] = self.cp_iter.next(); 30 self.advance();
36
37 return self; 31 return self;
38 } 32 }
39 33
@@ -55,7 +49,7 @@ pub const GraphemeIterator = struct {
55 } 49 }
56 50
57 const gc_start = self.buf[0].?.offset; 51 const gc_start = self.buf[0].?.offset;
58 var gc_len: usize = self.buf[0].?.len; 52 var gc_len: u8 = self.buf[0].?.len;
59 var state = State{}; 53 var state = State{};
60 54
61 if (graphemeBreak( 55 if (graphemeBreak(
@@ -266,13 +260,13 @@ test "Segmentation GraphemeIterator" {
266 defer all_bytes.deinit(); 260 defer all_bytes.deinit();
267 261
268 var graphemes = std.mem.split(u8, line, " ÷ "); 262 var graphemes = std.mem.split(u8, line, " ÷ ");
269 var bytes_index: usize = 0; 263 var bytes_index: u32 = 0;
270 264
271 while (graphemes.next()) |field| { 265 while (graphemes.next()) |field| {
272 var code_points = std.mem.split(u8, field, " "); 266 var code_points = std.mem.split(u8, field, " ");
273 var cp_buf: [4]u8 = undefined; 267 var cp_buf: [4]u8 = undefined;
274 var cp_index: usize = 0; 268 var cp_index: u32 = 0;
275 var gc_len: usize = 0; 269 var gc_len: u8 = 0;
276 270
277 while (code_points.next()) |code_point| { 271 while (code_points.next()) |code_point| {
278 if (std.mem.eql(u8, code_point, "×")) continue; 272 if (std.mem.eql(u8, code_point, "×")) continue;
@@ -288,12 +282,15 @@ test "Segmentation GraphemeIterator" {
288 } 282 }
289 283
290 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); 284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
291 var iter = GraphemeIterator.init(all_bytes.items); 285 var iter = Iterator.init(all_bytes.items);
292 286
293 // Check. 287 // Check.
294 for (want.items) |w| { 288 for (want.items) |want_gc| {
295 const g = (iter.next()).?; 289 const got_gc = (iter.next()).?;
296 try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len])); 290 try std.testing.expectEqualStrings(
291 want_gc.bytes(all_bytes.items),
292 got_gc.bytes(all_bytes.items),
293 );
297 } 294 }
298 } 295 }
299} 296}
@@ -303,10 +300,10 @@ test "Segmentation comptime GraphemeIterator" {
303 300
304 comptime { 301 comptime {
305 const src = "Héllo"; 302 const src = "Héllo";
306 var ct_iter = GraphemeIterator.init(src); 303 var ct_iter = Iterator.init(src);
307 var i = 0; 304 var i = 0;
308 while (ct_iter.next()) |grapheme| : (i += 1) { 305 while (ct_iter.next()) |grapheme| : (i += 1) {
309 try std.testing.expect(grapheme.eql(src, want[i])); 306 try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
310 } 307 }
311 } 308 }
312} 309}
@@ -318,17 +315,17 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
318 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 315 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
319 const no_joiner = seq_1 ++ seq_2; 316 const no_joiner = seq_1 ++ seq_2;
320 317
321 var ct_iter = GraphemeIterator.init(with_zwj); 318 var ct_iter = Iterator.init(with_zwj);
322 var i: usize = 0; 319 var i: usize = 0;
323 while (ct_iter.next()) |_| : (i += 1) {} 320 while (ct_iter.next()) |_| : (i += 1) {}
324 try std.testing.expectEqual(@as(usize, 1), i); 321 try std.testing.expectEqual(@as(usize, 1), i);
325 322
326 ct_iter = GraphemeIterator.init(with_zwsp); 323 ct_iter = Iterator.init(with_zwsp);
327 i = 0; 324 i = 0;
328 while (ct_iter.next()) |_| : (i += 1) {} 325 while (ct_iter.next()) |_| : (i += 1) {}
329 try std.testing.expectEqual(@as(usize, 3), i); 326 try std.testing.expectEqual(@as(usize, 3), i);
330 327
331 ct_iter = GraphemeIterator.init(no_joiner); 328 ct_iter = Iterator.init(no_joiner);
332 i = 0; 329 i = 0;
333 while (ct_iter.next()) |_| : (i += 1) {} 330 while (ct_iter.next()) |_| : (i += 1) {}
334 try std.testing.expectEqual(@as(usize, 2), i); 331 try std.testing.expectEqual(@as(usize, 2), i);
diff --git a/src/CodePoint.zig b/src/code_point.zig
index 62dd793..ac37562 100644
--- a/src/CodePoint.zig
+++ b/src/code_point.zig
@@ -1,28 +1,29 @@
1//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
2
3const std = @import("std"); 1const std = @import("std");
4 2
5code: u21, 3/// `CodePoint` represents a Unicode code point by its code,
6len: u3, 4/// length, and offset in the source bytes.
7offset: usize, 5pub const CodePoint = struct {
8 6 code: u21,
9const CodePoint = @This(); 7 len: u3,
8 offset: u32,
9};
10 10
11/// `CodePointIterator` iterates a string one `CodePoint` at-a-time. 11/// `Iterator` iterates a string one `CodePoint` at-a-time.
12pub const CodePointIterator = struct { 12pub const Iterator = struct {
13 bytes: []const u8, 13 bytes: []const u8,
14 i: usize = 0, 14 i: u32 = 0,
15 15
16 pub fn next(self: *CodePointIterator) ?CodePoint { 16 pub fn next(self: *Iterator) ?CodePoint {
17 if (self.i >= self.bytes.len) return null; 17 if (self.i >= self.bytes.len) return null;
18 18
19 if (self.bytes[self.i] < 128) { 19 if (self.bytes[self.i] < 128) {
20 // ASCII fast path 20 // ASCII fast path
21 self.i += 1; 21 defer self.i += 1;
22
22 return .{ 23 return .{
23 .code = self.bytes[self.i - 1], 24 .code = self.bytes[self.i],
24 .len = 1, 25 .len = 1,
25 .offset = self.i - 1, 26 .offset = self.i,
26 }; 27 };
27 } 28 }
28 29
@@ -33,12 +34,12 @@ pub const CodePointIterator = struct {
33 0b1110_0000...0b1110_1111 => 3, 34 0b1110_0000...0b1110_1111 => 3,
34 0b1111_0000...0b1111_0111 => 4, 35 0b1111_0000...0b1111_0111 => 4,
35 else => { 36 else => {
36 self.i += 1; 37 defer self.i += 1;
37 // Unicode replacement code point. 38 // Unicode replacement code point.
38 return .{ 39 return .{
39 .code = 0xfffd, 40 .code = 0xfffd,
40 .len = 1, 41 .len = 1,
41 .offset = self.i - 1, 42 .offset = self.i,
42 }; 43 };
43 }, 44 },
44 }, 45 },
@@ -66,15 +67,15 @@ pub const CodePointIterator = struct {
66 return cp; 67 return cp;
67 } 68 }
68 69
69 pub fn peek(self: *CodePointIterator) ?CodePoint { 70 pub fn peek(self: *Iterator) ?CodePoint {
70 const saved_i = self.i; 71 const saved_i = self.i;
71 defer self.i = saved_i; 72 defer self.i = saved_i;
72 return self.next(); 73 return self.next();
73 } 74 }
74}; 75};
75 76
76test "CodePointIterator peek" { 77test "peek" {
77 var iter = CodePointIterator{ .bytes = "Hi" }; 78 var iter = Iterator{ .bytes = "Hi" };
78 79
79 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); 80 try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
80 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); 81 try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
diff --git a/src/display_width.zig b/src/display_width.zig
index ba76052..e52da38 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -2,8 +2,8 @@ const std = @import("std");
2const simd = std.simd; 2const simd = std.simd;
3const testing = std.testing; 3const testing = std.testing;
4 4
5const CodePointIterator = @import("CodePoint").CodePointIterator; 5const CodePointIterator = @import("code_point").Iterator;
6const GraphemeIterator = @import("Grapheme").GraphemeIterator; 6const GraphemeIterator = @import("grapheme").Iterator;
7const dwp = @import("dwp"); 7const dwp = @import("dwp");
8 8
9/// codePointWidth returns the number of cells `cp` requires when rendered 9/// codePointWidth returns the number of cells `cp` requires when rendered