1 files changed, 63 insertions, 92 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 81d874c..d14b6ab 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -3,70 +3,46 @@
 //! Code for handling graphemes: fragments of string which should be
 //! treated as one unit.  Like Farmer Bob here: 👨🏻‍🌾
-s1: []u16 = undefined,
-s2: []u16 = undefined,
-s3: []u8 = undefined,
 const Graphemes = @This();
-pub fn init(allocator: Allocator) Allocator.Error!Graphemes {
+const Data = struct {
-    var graphemes = Graphemes{};
+    s1: []const u16 = undefined,
-    try graphemes.setup(allocator);
+    s2: []const u7 = undefined,
-    return graphemes;
+    s3: []const u8 = undefined,
-}
+};
-pub fn setup(graphemes: *Graphemes, allocator: Allocator) Allocator.Error!void {
-    const in_bytes = @embedFile("gbp");
-    var in_fbs = std.io.fixedBufferStream(in_bytes);
-    var reader = in_fbs.reader();
-    const endian = builtin.cpu.arch.endian();
-    const s1_len: u16 = reader.readInt(u16, endian) catch unreachable;
-    graphemes.s1 = try allocator.alloc(u16, s1_len);
-    errdefer allocator.free(graphemes.s1);
-    for (0..s1_len) |i| graphemes.s1[i] = reader.readInt(u16, endian) catch unreachable;
-    const s2_len: u16 = reader.readInt(u16, endian) catch unreachable;
-    graphemes.s2 = try allocator.alloc(u16, s2_len);
-    errdefer allocator.free(graphemes.s2);
-    for (0..s2_len) |i| graphemes.s2[i] = reader.readInt(u16, endian) catch unreachable;
-    const s3_len: u16 = reader.readInt(u16, endian) catch unreachable;
-    graphemes.s3 = try allocator.alloc(u8, s3_len);
-    errdefer allocator.free(graphemes.s3);
-    _ = reader.readAll(graphemes.s3) catch unreachable;
-}
-pub fn deinit(graphemes: *const Graphemes, allocator: Allocator) void {
+const graphemes = graphemes: {
-    allocator.free(graphemes.s1);
+    const data = @import("gbp");
-    allocator.free(graphemes.s2);
+    break :graphemes Data{
-    allocator.free(graphemes.s3);
+        .s1 = &data.s1,
-}
+        .s2 = &data.s2,
+        .s3 = &data.s3,
+    };
+};
 /// Lookup the grapheme break property for a code point.
-pub fn gbp(graphemes: Graphemes, cp: u21) Gbp {
+pub fn gbp(cp: u21) Gbp {
    return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4);
 }
 /// Lookup the indic syllable type for a code point.
-pub fn indic(graphemes: Graphemes, cp: u21) Indic {
+pub fn indic(cp: u21) Indic {
    return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7);
 }
 /// Lookup the emoji property for a code point.
-pub fn isEmoji(graphemes: Graphemes, cp: u21) bool {
+pub fn isEmoji(cp: u21) bool {
    return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
 }
 /// Returns an iterator over the graphemes in `string`.
-pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
+pub fn iterator(string: []const u8) Iterator {
-    return Iterator.init(string, graphemes);
+    return Iterator.init(string);
 }
 /// Returns a reverse iterator over the graphemes in `string`.
-pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
+pub fn reverseIterator(string: []const u8) ReverseIterator {
-    return ReverseIterator.init(string, graphemes);
+    return ReverseIterator.init(string);
 }
 /// Indic syllable type.
@@ -81,6 +57,7 @@ pub const Indic = enum {
 /// Grapheme break property.
 pub const Gbp = enum {
    none,
    Control,
    CR,
    Extend,
@@ -117,7 +94,7 @@ pub const Grapheme = struct {
 /// Returns the `Grapheme` at `string[index]`, which does not have to be a
 /// valid start of a codepoint.  Asserts the string is not empty.  Index must be
 /// less than `string.len`.  Always returns a `Grapheme`.
-pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme {
+pub fn graphemeAtIndex(string: []const u8, index: usize) Grapheme {
    assert(string.len != 0);
    if (index == 0 or (index > 0 and
        string[index] < 0x80 and
@@ -125,7 +102,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
        (string[index - 1] != '\r' and string[index] != '\n'))
    {
        // There's always a grapheme break between two ASCII code points (except CRLF)
-        var iter = graphemes.iterator(string[index..]);
+        var iter = Graphemes.iterator(string[index..]);
        const next = iter.next().?;
        return Grapheme{
            .len = next.len,
@@ -134,14 +111,14 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
    } // Otherwise it gets hairy.
    const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset;
    if (idx == string.len) {
-        var iter = graphemes.reverseIterator(string);
+        var iter = Graphemes.reverseIterator(string);
        return iter.prev().?;
    }
    // We're on a valid codepoint boundary, we go back from here
-    var r_iter = graphemes.reverseIterAtIndex(string, idx);
+    var r_iter = Graphemes.reverseIterAtIndex(string, idx);
    if (r_iter.prev()) |g| {
        if (g.offset == 0) {
-            var iter = graphemes.iterator(string);
+            var iter = Graphemes.iterator(string);
            while (iter.next()) |g2| {
                if (g2.offset <= idx and idx < g2.offset + g2.len) return g2;
            }
@@ -151,7 +128,7 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
    // we in fact need to be.
    _ = r_iter.prev();
    while (r_iter.pending != .none) : (_ = r_iter.prev()) {}
-    var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
+    var iter = Graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0);
    while (iter.next()) |g| {
        if (g.offset <= idx and idx < g.offset + g.len) return g;
    }
@@ -159,23 +136,22 @@ pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: u
 }
 /// Return a (forward) iterator of `string` after `grapheme`.
-pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator {
+pub fn iterateAfterGrapheme(string: []const u8, grapheme: Grapheme) Iterator {
-    return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
+    return Graphemes.iterAtIndex(string, grapheme.offset + grapheme.len);
 }
 /// Return a reverse iterator of `string` before `grapheme`.
-pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator {
+pub fn iterateBeforeGrapheme(string: []const u8, grapheme: Grapheme) ReverseIterator {
    // This bit of weirdness is because reverse iterators are "advance last",
    // while forward iterators are "advance first".  This leaves some room for
    // further optimization, if anyone dares.
-    var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
+    var r_iter = Graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1);
    _ = r_iter.prev();
    return r_iter;
 }
-fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator {
+fn reverseIterAtIndex(string: []const u8, idx: uoffset) ReverseIterator {
    var r_iter: ReverseIterator = undefined;
-    r_iter.data = graphemes;
    var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
    r_iter.buf[1] = rcp_iter.prev();
    r_iter.buf[0] = rcp_iter.prev();
@@ -184,9 +160,8 @@ fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoff
    return r_iter;
 }
-fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator {
+fn iterAtIndex(string: []const u8, idx: uoffset) Iterator {
    var iter: Iterator = undefined;
-    iter.data = graphemes;
    iter.buf[0] = first: {
        if (idx == string.len) break :first null;
        var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx };
@@ -202,13 +177,12 @@ fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) It
 pub const Iterator = struct {
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointIterator,
-    data: *const Graphemes,
    const Self = @This();
    /// Assumes `src` is valid UTF-8.
-    pub fn init(str: []const u8, data: *const Graphemes) Self {
+    pub fn init(str: []const u8) Self {
-        var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
+        var self = Self{ .cp_iter = .{ .bytes = str } };
        self.advance();
        return self;
    }
@@ -237,7 +211,6 @@ pub const Iterator = struct {
        if (graphemeBreak(
            self.buf[0].?.code,
            self.buf[1].?.code,
-            self.data,
            &state,
        )) return Grapheme{ .len = gc_len, .offset = gc_start };
@@ -250,7 +223,6 @@ pub const Iterator = struct {
            if (graphemeBreak(
                self.buf[0].?.code,
                if (self.buf[1]) |ncp| ncp.code else 0,
-                self.data,
                &state,
            )) break;
        }
@@ -275,7 +247,6 @@ pub const Iterator = struct {
 pub const ReverseIterator = struct {
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointReverseIterator,
-    data: *const Graphemes,
    /// Codepoint read from `cp_iter` but not returned by `previous`
    pending: Pending = .none,
@@ -289,8 +260,8 @@ pub const ReverseIterator = struct {
    const Self = @This();
-    pub fn init(str: []const u8, data: *const Graphemes) Self {
+    pub fn init(str: []const u8) Self {
-        var self: Self = .{ .cp_iter = .init(str), .data = data };
+        var self: Self = .{ .cp_iter = .init(str) };
        self.advance();
        self.advance();
        return self;
@@ -352,7 +323,6 @@ pub const ReverseIterator = struct {
            if (graphemeBreak(
                self.buf[0].?.code,
                self.buf[1].?.code,
-                self.data,
                &state,
            )) break;
@@ -374,7 +344,7 @@ pub const ReverseIterator = struct {
                    const codepoint = self.buf[0].?;
-                    switch (self.data.indic(codepoint.code)) {
+                    switch (Graphemes.indic(codepoint.code)) {
                        .Extend, .Linker => {
                            self.advance();
                            continue :indic;
@@ -387,7 +357,7 @@ pub const ReverseIterator = struct {
                            if (self.buf[0]) |cp1| {
                                state.indic = true;
-                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
+                                if (graphemeBreak(cp1.code, self.buf[1].?.code, &state)) break;
                                if (!state.indic) {
                                    continue :indic;
@@ -426,12 +396,12 @@ pub const ReverseIterator = struct {
                    const codepoint = self.buf[0].?;
-                    if (self.data.gbp(codepoint.code) == .Extend) {
+                    if (Graphemes.gbp(codepoint.code) == .Extend) {
                        self.advance();
                        continue :emoji;
                    }
-                    if (self.data.isEmoji(codepoint.code)) {
+                    if (Graphemes.isEmoji(codepoint.code)) {
                        // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
                        emoji_offset = codepoint.offset;
                        self.advance();
@@ -462,7 +432,7 @@ pub const ReverseIterator = struct {
            if (state.regional) {
                var ri_count: usize = 0;
                while (self.buf[0] != null and
-                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
+                    Graphemes.gbp(self.buf[0].?.code) == .Regional_Indicator)
                {
                    ri_count += 1;
                    self.advance();
@@ -500,10 +470,13 @@ pub const IterState = packed struct(u3) {
    indic: bool = false,
 };
+// TODO: isBreaker is also expensive given the data is already available,
+// and should be "semantically inlined" wherever it belongs.
 // Predicates
-fn isBreaker(cp: u21, data: *const Graphemes) bool {
+fn isBreaker(cp: u21) bool {
    // Extract relevant properties.
-    const cp_gbp_prop = data.gbp(cp);
+    const cp_gbp_prop = Graphemes.gbp(cp);
    return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
 }
@@ -516,17 +489,20 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool {
 pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
-    data: *const Graphemes,
    state: *IterState,
 ) bool {
+    // TODO: it's silly to index the same field three times and
+    // just extract different bits from the data.  Optimizable? Maybe,
+    // but it's silly to rely on that.
+    //
    // Extract relevant properties.
-    const cp1_gbp_prop = data.gbp(cp1);
+    const cp1_gbp_prop = Graphemes.gbp(cp1);
-    const cp1_indic_prop = data.indic(cp1);
+    const cp1_indic_prop = Graphemes.indic(cp1);
-    const cp1_is_emoji = data.isEmoji(cp1);
+    const cp1_is_emoji = Graphemes.isEmoji(cp1);
-    const cp2_gbp_prop = data.gbp(cp2);
+    const cp2_gbp_prop = Graphemes.gbp(cp2);
-    const cp2_indic_prop = data.indic(cp2);
+    const cp2_indic_prop = Graphemes.indic(cp2);
-    const cp2_is_emoji = data.isEmoji(cp2);
+    const cp2_is_emoji = Graphemes.isEmoji(cp2);
    // GB11: Emoji Extend* ZWJ x Emoji
    if (!state.xpic and cp1_is_emoji) state.xpic = true;
@@ -537,7 +513,7 @@ pub fn graphemeBreak(
    if (cp1 == '\r' and cp2 == '\n') return false;
    // GB4: Control
-    if (isBreaker(cp1, data)) return true;
+    if (isBreaker(cp1)) return true;
    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.xpic and
@@ -555,7 +531,7 @@ pub fn graphemeBreak(
    if (cp2_gbp_prop == .SpacingMark) return false;
    // GB9b: Prepend x
-    if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false;
+    if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false;
    // GB12, GB13: RI x RI
    if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
@@ -620,25 +596,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
    const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
    const no_joiner = seq_1 ++ seq_2;
-    const graphemes = try Graphemes.init(std.testing.allocator);
-    defer graphemes.deinit(std.testing.allocator);
    {
-        var iter = graphemes.iterator(with_zwj);
+        var iter = Graphemes.iterator(with_zwj);
        var i: usize = 0;
        while (iter.next()) |_| : (i += 1) {}
        try std.testing.expectEqual(@as(usize, 1), i);
    }
    {
-        var iter = graphemes.iterator(with_zwsp);
+        var iter = Graphemes.iterator(with_zwsp);
        var i: usize = 0;
        while (iter.next()) |_| : (i += 1) {}
        try std.testing.expectEqual(@as(usize, 3), i);
    }
    {
-        var iter = graphemes.iterator(no_joiner);
+        var iter = Graphemes.iterator(no_joiner);
        var i: usize = 0;
        while (iter.next()) |_| : (i += 1) {}
        try std.testing.expectEqual(@as(usize, 2), i);
@@ -647,10 +620,8 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
 test "Iterator.peek" {
    const peek_seq = "aΔ👨🏻‍🌾→";
-    const data = try Graphemes.init(std.testing.allocator);
-    defer data.deinit(std.testing.allocator);
-    var iter = data.iterator(peek_seq);
+    var iter = Graphemes.iterator(peek_seq);
    const peek_a = iter.peek().?;
    const next_a = iter.next().?;
    try std.testing.expectEqual(peek_a, next_a);