From 6c1a88471fc6444ee93d6ca0c64d0953a0d857ac Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sat, 17 Feb 2024 09:50:50 -0400 Subject: GraphemeIterator ASCII optimization 3x faster --- src/CodePoint.zig | 27 ++++++++++--------- src/Grapheme.zig | 79 +++++++++++++++++++++++++++++-------------------------- src/main.zig | 13 ++++----- 3 files changed, 64 insertions(+), 55 deletions(-) (limited to 'src') diff --git a/src/CodePoint.zig b/src/CodePoint.zig index c03ecac..1c1bec1 100644 --- a/src/CodePoint.zig +++ b/src/CodePoint.zig @@ -18,26 +18,29 @@ pub const CodePointIterator = struct { if (self.bytes[self.i] < 128) { // ASCII fast path - const cp = CodePoint{ + defer self.i += 1; + return .{ .code = self.bytes[self.i], .len = 1, .offset = self.i, }; - - self.i += 1; - - return cp; } var cp = CodePoint{ .code = undefined, - .len = blk: { - break :blk switch (self.bytes[self.i]) { - 0b1100_0000...0b1101_1111 => 2, - 0b1110_0000...0b1110_1111 => 3, - 0b1111_0000...0b1111_0111 => 4, - else => @panic("CodePointIterator.next: Ivalid code point start byte."), - }; + .len = switch (self.bytes[self.i]) { + 0b1100_0000...0b1101_1111 => 2, + 0b1110_0000...0b1110_1111 => 3, + 0b1111_0000...0b1111_0111 => 4, + else => { + self.i += 1; + // Unicode replacement code point. + return .{ + .code = 0xfffd, + .len = 1, + .offset = self.i - 1, + }; + }, }, .offset = self.i, }; diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 888fcd4..6892a2a 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -45,9 +45,14 @@ pub const GraphemeIterator = struct { pub fn next(self: *Self) ?Grapheme { self.advance(); - // If at end + // If no more if (self.buf[0] == null) return null; + // If last one if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; + // If ASCII + if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { + return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; + } const gc_start = self.buf[0].?.offset; var gc_len: usize = self.buf[0].?.len; @@ -89,42 +94,6 @@ fn isIgnorable(cp: u21) bool { return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; } -test "Segmentation comptime GraphemeIterator" { - const want = [_][]const u8{ "H", "é", "l", "l", "o" }; - - comptime { - const src = "Héllo"; - var ct_iter = GraphemeIterator.init(src); - var i = 0; - while (ct_iter.next()) |grapheme| : (i += 1) { - try std.testing.expect(grapheme.eql(src, want[i])); - } - } -} - -test "Segmentation ZWJ and ZWSP emoji sequences" { - const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; - const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; - const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; - const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; - const no_joiner = seq_1 ++ seq_2; - - var ct_iter = GraphemeIterator.init(with_zwj); - var i: usize = 0; - while (ct_iter.next()) |_| : (i += 1) {} - try std.testing.expectEqual(@as(usize, 1), i); - - ct_iter = GraphemeIterator.init(with_zwsp); - i = 0; - while (ct_iter.next()) |_| : (i += 1) {} - try std.testing.expectEqual(@as(usize, 3), i); - - ct_iter = GraphemeIterator.init(no_joiner); - i = 0; - while (ct_iter.next()) |_| : (i += 1) {} - try std.testing.expectEqual(@as(usize, 2), i); -} - // Grapheme break state. // Extended Pictographic (emoji) fn hasXpic(state: *const u3) bool { @@ -322,3 +291,39 @@ test "Segmentation GraphemeIterator" { } } } + +test "Segmentation comptime GraphemeIterator" { + const want = [_][]const u8{ "H", "é", "l", "l", "o" }; + + comptime { + const src = "Héllo"; + var ct_iter = GraphemeIterator.init(src); + var i = 0; + while (ct_iter.next()) |grapheme| : (i += 1) { + try std.testing.expect(grapheme.eql(src, want[i])); + } + } +} + +test "Segmentation ZWJ and ZWSP emoji sequences" { + const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; + const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; + const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; + const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; + const no_joiner = seq_1 ++ seq_2; + + var ct_iter = GraphemeIterator.init(with_zwj); + var i: usize = 0; + while (ct_iter.next()) |_| : (i += 1) {} + try std.testing.expectEqual(@as(usize, 1), i); + + ct_iter = GraphemeIterator.init(with_zwsp); + i = 0; + while (ct_iter.next()) |_| : (i += 1) {} + try std.testing.expectEqual(@as(usize, 3), i); + + ct_iter = GraphemeIterator.init(no_joiner); + i = 0; + while (ct_iter.next()) |_| : (i += 1) {} + try std.testing.expectEqual(@as(usize, 2), i); +} diff --git a/src/main.zig b/src/main.zig index e7c0828..bb188ff 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,12 +1,12 @@ const std = @import("std"); // const GraphemeIterator = @import("ziglyph").GraphemeIterator; -// const GraphemeIterator = @import("Grapheme").GraphemeIterator; +const GraphemeIterator = @import("Grapheme").GraphemeIterator; // const codePointWidth = @import("ziglyph").display_width.codePointWidth; // const codePointWidth = @import("display_width").codePointWidth; // const strWidth = @import("ziglyph").display_width.strWidth; -const strWidth = @import("display_width").strWidth; -const CodePointIterator = @import("CodePoint").CodePointIterator; +// const strWidth = @import("display_width").strWidth; +// const CodePointIterator = @import("CodePoint").CodePointIterator; pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; @@ -17,15 +17,16 @@ pub fn main() !void { defer allocator.free(input); var result: usize = 0; - // var iter = GraphemeIterator.init(input); + var iter = GraphemeIterator.init(input); // var iter = CodePointIterator{ .bytes = input }; - var iter = std.mem.splitScalar(u8, input, '\n'); + // var iter = std.mem.splitScalar(u8, input, '\n'); var timer = try std.time.Timer.start(); // for (0..50) |_| { // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); - while (iter.next()) |line| result += strWidth(line); + while (iter.next()) |_| result += 1; + // while (iter.next()) |line| result += strWidth(line); // iter.cp_iter.i = 0; // } -- cgit v1.2.3