summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-17 09:50:50 -0400
committerGravatar Jose Colon Rodriguez2024-02-17 09:50:50 -0400
commit6c1a88471fc6444ee93d6ca0c64d0953a0d857ac (patch)
treec9ac886559bd1117b75482ab690364a5e792ad2c /src
parentisAsciiOnly SIMD tweaks (diff)
downloadzg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.tar.gz
zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.tar.xz
zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.zip
GraphemeIterator ASCII optimization 3x faster
Diffstat (limited to 'src')
-rw-r--r--src/CodePoint.zig27
-rw-r--r--src/Grapheme.zig79
-rw-r--r--src/main.zig13
3 files changed, 64 insertions, 55 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig
index c03ecac..1c1bec1 100644
--- a/src/CodePoint.zig
+++ b/src/CodePoint.zig
@@ -18,26 +18,29 @@ pub const CodePointIterator = struct {
18 18
19 if (self.bytes[self.i] < 128) { 19 if (self.bytes[self.i] < 128) {
20 // ASCII fast path 20 // ASCII fast path
21 const cp = CodePoint{ 21 defer self.i += 1;
22 return .{
22 .code = self.bytes[self.i], 23 .code = self.bytes[self.i],
23 .len = 1, 24 .len = 1,
24 .offset = self.i, 25 .offset = self.i,
25 }; 26 };
26
27 self.i += 1;
28
29 return cp;
30 } 27 }
31 28
32 var cp = CodePoint{ 29 var cp = CodePoint{
33 .code = undefined, 30 .code = undefined,
34 .len = blk: { 31 .len = switch (self.bytes[self.i]) {
35 break :blk switch (self.bytes[self.i]) { 32 0b1100_0000...0b1101_1111 => 2,
36 0b1100_0000...0b1101_1111 => 2, 33 0b1110_0000...0b1110_1111 => 3,
37 0b1110_0000...0b1110_1111 => 3, 34 0b1111_0000...0b1111_0111 => 4,
38 0b1111_0000...0b1111_0111 => 4, 35 else => {
39 else => @panic("CodePointIterator.next: Ivalid code point start byte."), 36 self.i += 1;
40 }; 37 // Unicode replacement code point.
38 return .{
39 .code = 0xfffd,
40 .len = 1,
41 .offset = self.i - 1,
42 };
43 },
41 }, 44 },
42 .offset = self.i, 45 .offset = self.i,
43 }; 46 };
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 888fcd4..6892a2a 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -45,9 +45,14 @@ pub const GraphemeIterator = struct {
45 pub fn next(self: *Self) ?Grapheme { 45 pub fn next(self: *Self) ?Grapheme {
46 self.advance(); 46 self.advance();
47 47
48 // If at end 48 // If no more
49 if (self.buf[0] == null) return null; 49 if (self.buf[0] == null) return null;
50 // If last one
50 if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; 51 if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
52 // If ASCII
53 if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
54 return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
55 }
51 56
52 const gc_start = self.buf[0].?.offset; 57 const gc_start = self.buf[0].?.offset;
53 var gc_len: usize = self.buf[0].?.len; 58 var gc_len: usize = self.buf[0].?.len;
@@ -89,42 +94,6 @@ fn isIgnorable(cp: u21) bool {
89 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; 94 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
90} 95}
91 96
92test "Segmentation comptime GraphemeIterator" {
93 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
94
95 comptime {
96 const src = "Héllo";
97 var ct_iter = GraphemeIterator.init(src);
98 var i = 0;
99 while (ct_iter.next()) |grapheme| : (i += 1) {
100 try std.testing.expect(grapheme.eql(src, want[i]));
101 }
102 }
103}
104
105test "Segmentation ZWJ and ZWSP emoji sequences" {
106 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
107 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
108 const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2;
109 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
110 const no_joiner = seq_1 ++ seq_2;
111
112 var ct_iter = GraphemeIterator.init(with_zwj);
113 var i: usize = 0;
114 while (ct_iter.next()) |_| : (i += 1) {}
115 try std.testing.expectEqual(@as(usize, 1), i);
116
117 ct_iter = GraphemeIterator.init(with_zwsp);
118 i = 0;
119 while (ct_iter.next()) |_| : (i += 1) {}
120 try std.testing.expectEqual(@as(usize, 3), i);
121
122 ct_iter = GraphemeIterator.init(no_joiner);
123 i = 0;
124 while (ct_iter.next()) |_| : (i += 1) {}
125 try std.testing.expectEqual(@as(usize, 2), i);
126}
127
128// Grapheme break state. 97// Grapheme break state.
129// Extended Pictographic (emoji) 98// Extended Pictographic (emoji)
130fn hasXpic(state: *const u3) bool { 99fn hasXpic(state: *const u3) bool {
@@ -322,3 +291,39 @@ test "Segmentation GraphemeIterator" {
322 } 291 }
323 } 292 }
324} 293}
294
295test "Segmentation comptime GraphemeIterator" {
296 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
297
298 comptime {
299 const src = "Héllo";
300 var ct_iter = GraphemeIterator.init(src);
301 var i = 0;
302 while (ct_iter.next()) |grapheme| : (i += 1) {
303 try std.testing.expect(grapheme.eql(src, want[i]));
304 }
305 }
306}
307
308test "Segmentation ZWJ and ZWSP emoji sequences" {
309 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
310 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
311 const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2;
312 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
313 const no_joiner = seq_1 ++ seq_2;
314
315 var ct_iter = GraphemeIterator.init(with_zwj);
316 var i: usize = 0;
317 while (ct_iter.next()) |_| : (i += 1) {}
318 try std.testing.expectEqual(@as(usize, 1), i);
319
320 ct_iter = GraphemeIterator.init(with_zwsp);
321 i = 0;
322 while (ct_iter.next()) |_| : (i += 1) {}
323 try std.testing.expectEqual(@as(usize, 3), i);
324
325 ct_iter = GraphemeIterator.init(no_joiner);
326 i = 0;
327 while (ct_iter.next()) |_| : (i += 1) {}
328 try std.testing.expectEqual(@as(usize, 2), i);
329}
diff --git a/src/main.zig b/src/main.zig
index e7c0828..bb188ff 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,12 +1,12 @@
1const std = @import("std"); 1const std = @import("std");
2 2
3// const GraphemeIterator = @import("ziglyph").GraphemeIterator; 3// const GraphemeIterator = @import("ziglyph").GraphemeIterator;
4// const GraphemeIterator = @import("Grapheme").GraphemeIterator; 4const GraphemeIterator = @import("Grapheme").GraphemeIterator;
5// const codePointWidth = @import("ziglyph").display_width.codePointWidth; 5// const codePointWidth = @import("ziglyph").display_width.codePointWidth;
6// const codePointWidth = @import("display_width").codePointWidth; 6// const codePointWidth = @import("display_width").codePointWidth;
7// const strWidth = @import("ziglyph").display_width.strWidth; 7// const strWidth = @import("ziglyph").display_width.strWidth;
8const strWidth = @import("display_width").strWidth; 8// const strWidth = @import("display_width").strWidth;
9const CodePointIterator = @import("CodePoint").CodePointIterator; 9// const CodePointIterator = @import("CodePoint").CodePointIterator;
10 10
11pub fn main() !void { 11pub fn main() !void {
12 var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 12 var gpa = std.heap.GeneralPurposeAllocator(.{}){};
@@ -17,15 +17,16 @@ pub fn main() !void {
17 defer allocator.free(input); 17 defer allocator.free(input);
18 18
19 var result: usize = 0; 19 var result: usize = 0;
20 // var iter = GraphemeIterator.init(input); 20 var iter = GraphemeIterator.init(input);
21 // var iter = CodePointIterator{ .bytes = input }; 21 // var iter = CodePointIterator{ .bytes = input };
22 var iter = std.mem.splitScalar(u8, input, '\n'); 22 // var iter = std.mem.splitScalar(u8, input, '\n');
23 23
24 var timer = try std.time.Timer.start(); 24 var timer = try std.time.Timer.start();
25 25
26 // for (0..50) |_| { 26 // for (0..50) |_| {
27 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 27 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
28 while (iter.next()) |line| result += strWidth(line); 28 while (iter.next()) |_| result += 1;
29 // while (iter.next()) |line| result += strWidth(line);
29 // iter.cp_iter.i = 0; 30 // iter.cp_iter.i = 0;
30 // } 31 // }
31 32