diff options
| author | 2024-02-17 09:50:50 -0400 | |
|---|---|---|
| committer | 2024-02-17 09:50:50 -0400 | |
| commit | 6c1a88471fc6444ee93d6ca0c64d0953a0d857ac (patch) | |
| tree | c9ac886559bd1117b75482ab690364a5e792ad2c /src | |
| parent | isAsciiOnly SIMD tweaks (diff) | |
| download | zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.tar.gz zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.tar.xz zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.zip | |
GraphemeIterator ASCII optimization 3x faster
Diffstat (limited to 'src')
| -rw-r--r-- | src/CodePoint.zig | 27 | ||||
| -rw-r--r-- | src/Grapheme.zig | 79 | ||||
| -rw-r--r-- | src/main.zig | 13 |
3 files changed, 64 insertions, 55 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig index c03ecac..1c1bec1 100644 --- a/src/CodePoint.zig +++ b/src/CodePoint.zig | |||
| @@ -18,26 +18,29 @@ pub const CodePointIterator = struct { | |||
| 18 | 18 | ||
| 19 | if (self.bytes[self.i] < 128) { | 19 | if (self.bytes[self.i] < 128) { |
| 20 | // ASCII fast path | 20 | // ASCII fast path |
| 21 | const cp = CodePoint{ | 21 | defer self.i += 1; |
| 22 | return .{ | ||
| 22 | .code = self.bytes[self.i], | 23 | .code = self.bytes[self.i], |
| 23 | .len = 1, | 24 | .len = 1, |
| 24 | .offset = self.i, | 25 | .offset = self.i, |
| 25 | }; | 26 | }; |
| 26 | |||
| 27 | self.i += 1; | ||
| 28 | |||
| 29 | return cp; | ||
| 30 | } | 27 | } |
| 31 | 28 | ||
| 32 | var cp = CodePoint{ | 29 | var cp = CodePoint{ |
| 33 | .code = undefined, | 30 | .code = undefined, |
| 34 | .len = blk: { | 31 | .len = switch (self.bytes[self.i]) { |
| 35 | break :blk switch (self.bytes[self.i]) { | 32 | 0b1100_0000...0b1101_1111 => 2, |
| 36 | 0b1100_0000...0b1101_1111 => 2, | 33 | 0b1110_0000...0b1110_1111 => 3, |
| 37 | 0b1110_0000...0b1110_1111 => 3, | 34 | 0b1111_0000...0b1111_0111 => 4, |
| 38 | 0b1111_0000...0b1111_0111 => 4, | 35 | else => { |
| 39 | else => @panic("CodePointIterator.next: Ivalid code point start byte."), | 36 | self.i += 1; |
| 40 | }; | 37 | // Unicode replacement code point. |
| 38 | return .{ | ||
| 39 | .code = 0xfffd, | ||
| 40 | .len = 1, | ||
| 41 | .offset = self.i - 1, | ||
| 42 | }; | ||
| 43 | }, | ||
| 41 | }, | 44 | }, |
| 42 | .offset = self.i, | 45 | .offset = self.i, |
| 43 | }; | 46 | }; |
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 888fcd4..6892a2a 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -45,9 +45,14 @@ pub const GraphemeIterator = struct { | |||
| 45 | pub fn next(self: *Self) ?Grapheme { | 45 | pub fn next(self: *Self) ?Grapheme { |
| 46 | self.advance(); | 46 | self.advance(); |
| 47 | 47 | ||
| 48 | // If at end | 48 | // If no more |
| 49 | if (self.buf[0] == null) return null; | 49 | if (self.buf[0] == null) return null; |
| 50 | // If last one | ||
| 50 | if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | 51 | if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; |
| 52 | // If ASCII | ||
| 53 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 54 | return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | ||
| 55 | } | ||
| 51 | 56 | ||
| 52 | const gc_start = self.buf[0].?.offset; | 57 | const gc_start = self.buf[0].?.offset; |
| 53 | var gc_len: usize = self.buf[0].?.len; | 58 | var gc_len: usize = self.buf[0].?.len; |
| @@ -89,42 +94,6 @@ fn isIgnorable(cp: u21) bool { | |||
| 89 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | 94 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; |
| 90 | } | 95 | } |
| 91 | 96 | ||
| 92 | test "Segmentation comptime GraphemeIterator" { | ||
| 93 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 94 | |||
| 95 | comptime { | ||
| 96 | const src = "Héllo"; | ||
| 97 | var ct_iter = GraphemeIterator.init(src); | ||
| 98 | var i = 0; | ||
| 99 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 100 | try std.testing.expect(grapheme.eql(src, want[i])); | ||
| 101 | } | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | test "Segmentation ZWJ and ZWSP emoji sequences" { | ||
| 106 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 107 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 108 | const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; | ||
| 109 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | ||
| 110 | const no_joiner = seq_1 ++ seq_2; | ||
| 111 | |||
| 112 | var ct_iter = GraphemeIterator.init(with_zwj); | ||
| 113 | var i: usize = 0; | ||
| 114 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 115 | try std.testing.expectEqual(@as(usize, 1), i); | ||
| 116 | |||
| 117 | ct_iter = GraphemeIterator.init(with_zwsp); | ||
| 118 | i = 0; | ||
| 119 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 120 | try std.testing.expectEqual(@as(usize, 3), i); | ||
| 121 | |||
| 122 | ct_iter = GraphemeIterator.init(no_joiner); | ||
| 123 | i = 0; | ||
| 124 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 125 | try std.testing.expectEqual(@as(usize, 2), i); | ||
| 126 | } | ||
| 127 | |||
| 128 | // Grapheme break state. | 97 | // Grapheme break state. |
| 129 | // Extended Pictographic (emoji) | 98 | // Extended Pictographic (emoji) |
| 130 | fn hasXpic(state: *const u3) bool { | 99 | fn hasXpic(state: *const u3) bool { |
| @@ -322,3 +291,39 @@ test "Segmentation GraphemeIterator" { | |||
| 322 | } | 291 | } |
| 323 | } | 292 | } |
| 324 | } | 293 | } |
| 294 | |||
| 295 | test "Segmentation comptime GraphemeIterator" { | ||
| 296 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 297 | |||
| 298 | comptime { | ||
| 299 | const src = "Héllo"; | ||
| 300 | var ct_iter = GraphemeIterator.init(src); | ||
| 301 | var i = 0; | ||
| 302 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 303 | try std.testing.expect(grapheme.eql(src, want[i])); | ||
| 304 | } | ||
| 305 | } | ||
| 306 | } | ||
| 307 | |||
| 308 | test "Segmentation ZWJ and ZWSP emoji sequences" { | ||
| 309 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 310 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 311 | const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; | ||
| 312 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | ||
| 313 | const no_joiner = seq_1 ++ seq_2; | ||
| 314 | |||
| 315 | var ct_iter = GraphemeIterator.init(with_zwj); | ||
| 316 | var i: usize = 0; | ||
| 317 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 318 | try std.testing.expectEqual(@as(usize, 1), i); | ||
| 319 | |||
| 320 | ct_iter = GraphemeIterator.init(with_zwsp); | ||
| 321 | i = 0; | ||
| 322 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 323 | try std.testing.expectEqual(@as(usize, 3), i); | ||
| 324 | |||
| 325 | ct_iter = GraphemeIterator.init(no_joiner); | ||
| 326 | i = 0; | ||
| 327 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 328 | try std.testing.expectEqual(@as(usize, 2), i); | ||
| 329 | } | ||
diff --git a/src/main.zig b/src/main.zig index e7c0828..bb188ff 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -1,12 +1,12 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | 2 | ||
| 3 | // const GraphemeIterator = @import("ziglyph").GraphemeIterator; | 3 | // const GraphemeIterator = @import("ziglyph").GraphemeIterator; |
| 4 | // const GraphemeIterator = @import("Grapheme").GraphemeIterator; | 4 | const GraphemeIterator = @import("Grapheme").GraphemeIterator; |
| 5 | // const codePointWidth = @import("ziglyph").display_width.codePointWidth; | 5 | // const codePointWidth = @import("ziglyph").display_width.codePointWidth; |
| 6 | // const codePointWidth = @import("display_width").codePointWidth; | 6 | // const codePointWidth = @import("display_width").codePointWidth; |
| 7 | // const strWidth = @import("ziglyph").display_width.strWidth; | 7 | // const strWidth = @import("ziglyph").display_width.strWidth; |
| 8 | const strWidth = @import("display_width").strWidth; | 8 | // const strWidth = @import("display_width").strWidth; |
| 9 | const CodePointIterator = @import("CodePoint").CodePointIterator; | 9 | // const CodePointIterator = @import("CodePoint").CodePointIterator; |
| 10 | 10 | ||
| 11 | pub fn main() !void { | 11 | pub fn main() !void { |
| 12 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; | 12 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; |
| @@ -17,15 +17,16 @@ pub fn main() !void { | |||
| 17 | defer allocator.free(input); | 17 | defer allocator.free(input); |
| 18 | 18 | ||
| 19 | var result: usize = 0; | 19 | var result: usize = 0; |
| 20 | // var iter = GraphemeIterator.init(input); | 20 | var iter = GraphemeIterator.init(input); |
| 21 | // var iter = CodePointIterator{ .bytes = input }; | 21 | // var iter = CodePointIterator{ .bytes = input }; |
| 22 | var iter = std.mem.splitScalar(u8, input, '\n'); | 22 | // var iter = std.mem.splitScalar(u8, input, '\n'); |
| 23 | 23 | ||
| 24 | var timer = try std.time.Timer.start(); | 24 | var timer = try std.time.Timer.start(); |
| 25 | 25 | ||
| 26 | // for (0..50) |_| { | 26 | // for (0..50) |_| { |
| 27 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); | 27 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); |
| 28 | while (iter.next()) |line| result += strWidth(line); | 28 | while (iter.next()) |_| result += 1; |
| 29 | // while (iter.next()) |line| result += strWidth(line); | ||
| 29 | // iter.cp_iter.i = 0; | 30 | // iter.cp_iter.i = 0; |
| 30 | // } | 31 | // } |
| 31 | 32 | ||