diff options
| author | 2024-02-17 09:50:50 -0400 | |
|---|---|---|
| committer | 2024-02-17 09:50:50 -0400 | |
| commit | 6c1a88471fc6444ee93d6ca0c64d0953a0d857ac (patch) | |
| tree | c9ac886559bd1117b75482ab690364a5e792ad2c /src/Grapheme.zig | |
| parent | isAsciiOnly SIMD tweaks (diff) | |
| download | zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.tar.gz zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.tar.xz zg-6c1a88471fc6444ee93d6ca0c64d0953a0d857ac.zip | |
GraphemeIterator ASCII optimization 3x faster
Diffstat (limited to 'src/Grapheme.zig')
| -rw-r--r-- | src/Grapheme.zig | 79 |
1 file changed, 42 insertions, 37 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 888fcd4..6892a2a 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -45,9 +45,14 @@ pub const GraphemeIterator = struct { | |||
| 45 | pub fn next(self: *Self) ?Grapheme { | 45 | pub fn next(self: *Self) ?Grapheme { |
| 46 | self.advance(); | 46 | self.advance(); |
| 47 | 47 | ||
| 48 | // If at end | 48 | // If no more |
| 49 | if (self.buf[0] == null) return null; | 49 | if (self.buf[0] == null) return null; |
| 50 | // If last one | ||
| 50 | if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | 51 | if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; |
| 52 | // If ASCII | ||
| 53 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 54 | return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | ||
| 55 | } | ||
| 51 | 56 | ||
| 52 | const gc_start = self.buf[0].?.offset; | 57 | const gc_start = self.buf[0].?.offset; |
| 53 | var gc_len: usize = self.buf[0].?.len; | 58 | var gc_len: usize = self.buf[0].?.len; |
| @@ -89,42 +94,6 @@ fn isIgnorable(cp: u21) bool { | |||
| 89 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | 94 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; |
| 90 | } | 95 | } |
| 91 | 96 | ||
| 92 | test "Segmentation comptime GraphemeIterator" { | ||
| 93 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 94 | |||
| 95 | comptime { | ||
| 96 | const src = "Héllo"; | ||
| 97 | var ct_iter = GraphemeIterator.init(src); | ||
| 98 | var i = 0; | ||
| 99 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 100 | try std.testing.expect(grapheme.eql(src, want[i])); | ||
| 101 | } | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | test "Segmentation ZWJ and ZWSP emoji sequences" { | ||
| 106 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 107 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 108 | const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; | ||
| 109 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | ||
| 110 | const no_joiner = seq_1 ++ seq_2; | ||
| 111 | |||
| 112 | var ct_iter = GraphemeIterator.init(with_zwj); | ||
| 113 | var i: usize = 0; | ||
| 114 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 115 | try std.testing.expectEqual(@as(usize, 1), i); | ||
| 116 | |||
| 117 | ct_iter = GraphemeIterator.init(with_zwsp); | ||
| 118 | i = 0; | ||
| 119 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 120 | try std.testing.expectEqual(@as(usize, 3), i); | ||
| 121 | |||
| 122 | ct_iter = GraphemeIterator.init(no_joiner); | ||
| 123 | i = 0; | ||
| 124 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 125 | try std.testing.expectEqual(@as(usize, 2), i); | ||
| 126 | } | ||
| 127 | |||
| 128 | // Grapheme break state. | 97 | // Grapheme break state. |
| 129 | // Extended Pictographic (emoji) | 98 | // Extended Pictographic (emoji) |
| 130 | fn hasXpic(state: *const u3) bool { | 99 | fn hasXpic(state: *const u3) bool { |
| @@ -322,3 +291,39 @@ test "Segmentation GraphemeIterator" { | |||
| 322 | } | 291 | } |
| 323 | } | 292 | } |
| 324 | } | 293 | } |
| 294 | |||
| 295 | test "Segmentation comptime GraphemeIterator" { | ||
| 296 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 297 | |||
| 298 | comptime { | ||
| 299 | const src = "Héllo"; | ||
| 300 | var ct_iter = GraphemeIterator.init(src); | ||
| 301 | var i = 0; | ||
| 302 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 303 | try std.testing.expect(grapheme.eql(src, want[i])); | ||
| 304 | } | ||
| 305 | } | ||
| 306 | } | ||
| 307 | |||
| 308 | test "Segmentation ZWJ and ZWSP emoji sequences" { | ||
| 309 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 310 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 311 | const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; | ||
| 312 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | ||
| 313 | const no_joiner = seq_1 ++ seq_2; | ||
| 314 | |||
| 315 | var ct_iter = GraphemeIterator.init(with_zwj); | ||
| 316 | var i: usize = 0; | ||
| 317 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 318 | try std.testing.expectEqual(@as(usize, 1), i); | ||
| 319 | |||
| 320 | ct_iter = GraphemeIterator.init(with_zwsp); | ||
| 321 | i = 0; | ||
| 322 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 323 | try std.testing.expectEqual(@as(usize, 3), i); | ||
| 324 | |||
| 325 | ct_iter = GraphemeIterator.init(no_joiner); | ||
| 326 | i = 0; | ||
| 327 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 328 | try std.testing.expectEqual(@as(usize, 2), i); | ||
| 329 | } | ||