From 1404c85f513a88bbd399ab9f3453da71e7478727 Mon Sep 17 00:00:00 2001
From: Jose Colon Rodriguez
Date: Sun, 18 Feb 2024 08:48:03 -0400
Subject: Code point and grapheme are now namespaces.

---
 src/CodePoint.zig     | 84 --------------------------------------------------
 src/Grapheme.zig      | 67 +++++++++++++++++++---------------------
 src/code_point.zig    | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/display_width.zig |  4 +--
 4 files changed, 119 insertions(+), 121 deletions(-)
 delete mode 100644 src/CodePoint.zig
 create mode 100644 src/code_point.zig

(limited to 'src')

diff --git a/src/CodePoint.zig b/src/CodePoint.zig
deleted file mode 100644
index 62dd793..0000000
--- a/src/CodePoint.zig
+++ /dev/null
@@ -1,84 +0,0 @@
-//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.
-
-const std = @import("std");
-
-code: u21,
-len: u3,
-offset: usize,
-
-const CodePoint = @This();
-
-/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
-pub const CodePointIterator = struct {
-    bytes: []const u8,
-    i: usize = 0,
-
-    pub fn next(self: *CodePointIterator) ?CodePoint {
-        if (self.i >= self.bytes.len) return null;
-
-        if (self.bytes[self.i] < 128) {
-            // ASCII fast path
-            self.i += 1;
-            return .{
-                .code = self.bytes[self.i - 1],
-                .len = 1,
-                .offset = self.i - 1,
-            };
-        }
-
-        var cp = CodePoint{
-            .code = undefined,
-            .len = switch (self.bytes[self.i]) {
-                0b1100_0000...0b1101_1111 => 2,
-                0b1110_0000...0b1110_1111 => 3,
-                0b1111_0000...0b1111_0111 => 4,
-                else => {
-                    self.i += 1;
-                    // Unicode replacement code point.
-                    return .{
-                        .code = 0xfffd,
-                        .len = 1,
-                        .offset = self.i - 1,
-                    };
-                },
-            },
-            .offset = self.i,
-        };
-
-        const cp_bytes = self.bytes[self.i..][0..cp.len];
-        self.i += cp.len;
-
-        cp.code = switch (cp.len) {
-            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
-
-            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
-                (cp_bytes[1] & 0b00111111)) << 6) |
-                (cp_bytes[2] & 0b00111111),
-
-            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
-                (cp_bytes[1] & 0b00111111)) << 6) |
-                (cp_bytes[2] & 0b00111111)) << 6) |
-                (cp_bytes[3] & 0b00111111),
-
-            else => @panic("CodePointIterator.next invalid code point length."),
-        };
-
-        return cp;
-    }
-
-    pub fn peek(self: *CodePointIterator) ?CodePoint {
-        const saved_i = self.i;
-        defer self.i = saved_i;
-        return self.next();
-    }
-};
-
-test "CodePointIterator peek" {
-    var iter = CodePointIterator{ .bytes = "Hi" };
-
-    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
-    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
-    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
-    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
-    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
-}
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 910aec5..f013aba 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,30 +1,25 @@
-//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
-
 const std = @import("std");
 const unicode = std.unicode;
 
-const CodePoint = @import("CodePoint");
-const CodePointIterator = CodePoint.CodePointIterator;
+const CodePoint = @import("code_point").CodePoint;
+const CodePointIterator = @import("code_point").Iterator;
 const gbp = @import("gbp");
 
-pub const Grapheme = @This();
-
-len: usize,
-offset: usize,
+/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
+pub const Grapheme = struct {
+    len: u8,
+    offset: u32,
 
-/// `eql` comparse `str` with the bytes of this grapheme cluster in `src` for equality.
-pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool {
-    return std.mem.eql(u8, src[self.offset .. self.offset + self.len], other);
-}
-
-/// `slice` returns the bytes that correspond to this grapheme cluster in `src`.
-pub fn slice(self: Grapheme, src: []const u8) []const u8 {
-    return src[self.offset .. self.offset + self.len];
-}
+    /// `bytes` returns the slice of bytes that correspond to
+    /// this grapheme cluster in `src`.
+    pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
+        return src[self.offset..][0..self.len];
+    }
+};
 
-/// `GraphemeIterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
-pub const GraphemeIterator = struct {
-    buf: [2]?CodePoint = [_]?CodePoint{ null, null },
+/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
+pub const Iterator = struct {
+    buf: [2]?CodePoint = .{ null, null },
     cp_iter: CodePointIterator,
 
     const Self = @This();
@@ -32,8 +27,7 @@ pub const GraphemeIterator = struct {
     /// Assumes `src` is valid UTF-8.
     pub fn init(str: []const u8) Self {
         var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
-        self.buf[1] = self.cp_iter.next();
-
+        self.advance();
         return self;
     }
 
@@ -55,7 +49,7 @@ pub const GraphemeIterator = struct {
         }
 
         const gc_start = self.buf[0].?.offset;
-        var gc_len: usize = self.buf[0].?.len;
+        var gc_len: u8 = self.buf[0].?.len;
         var state = State{};
 
         if (graphemeBreak(
@@ -266,13 +260,13 @@ test "Segmentation GraphemeIterator" {
         defer all_bytes.deinit();
 
         var graphemes = std.mem.split(u8, line, " ÷ ");
-        var bytes_index: usize = 0;
+        var bytes_index: u32 = 0;
 
         while (graphemes.next()) |field| {
             var code_points = std.mem.split(u8, field, " ");
             var cp_buf: [4]u8 = undefined;
-            var cp_index: usize = 0;
-            var gc_len: usize = 0;
+            var cp_index: u32 = 0;
+            var gc_len: u8 = 0;
 
             while (code_points.next()) |code_point| {
                 if (std.mem.eql(u8, code_point, "×")) continue;
@@ -288,12 +282,15 @@ test "Segmentation GraphemeIterator" {
         }
 
         // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
-        var iter = GraphemeIterator.init(all_bytes.items);
+        var iter = Iterator.init(all_bytes.items);
 
         // Chaeck.
-        for (want.items) |w| {
-            const g = (iter.next()).?;
-            try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len]));
+        for (want.items) |want_gc| {
+            const got_gc = (iter.next()).?;
+            try std.testing.expectEqualStrings(
+                want_gc.bytes(all_bytes.items),
+                got_gc.bytes(all_bytes.items),
+            );
         }
     }
 }
@@ -303,10 +300,10 @@ test "Segmentation comptime GraphemeIterator" {
 
     comptime {
         const src = "Héllo";
-        var ct_iter = GraphemeIterator.init(src);
+        var ct_iter = Iterator.init(src);
         var i = 0;
         while (ct_iter.next()) |grapheme| : (i += 1) {
-            try std.testing.expect(grapheme.eql(src, want[i]));
+            try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
         }
     }
 }
@@ -318,17 +315,17 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
     const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
     const no_joiner = seq_1 ++ seq_2;
 
-    var ct_iter = GraphemeIterator.init(with_zwj);
+    var ct_iter = Iterator.init(with_zwj);
     var i: usize = 0;
     while (ct_iter.next()) |_| : (i += 1) {}
     try std.testing.expectEqual(@as(usize, 1), i);
 
-    ct_iter = GraphemeIterator.init(with_zwsp);
+    ct_iter = Iterator.init(with_zwsp);
     i = 0;
     while (ct_iter.next()) |_| : (i += 1) {}
     try std.testing.expectEqual(@as(usize, 3), i);
 
-    ct_iter = GraphemeIterator.init(no_joiner);
+    ct_iter = Iterator.init(no_joiner);
     i = 0;
     while (ct_iter.next()) |_| : (i += 1) {}
     try std.testing.expectEqual(@as(usize, 2), i);
diff --git a/src/code_point.zig b/src/code_point.zig
new file mode 100644
index 0000000..ac37562
--- /dev/null
+++ b/src/code_point.zig
@@ -0,0 +1,85 @@
+const std = @import("std");
+
+/// `CodePoint` represents a Unicode code point by its code,
+/// length, and offset in the source bytes.
+pub const CodePoint = struct {
+    code: u21,
+    len: u3,
+    offset: u32,
+};
+
+/// `Iterator` iterates a string one `CodePoint` at-a-time.
+pub const Iterator = struct {
+    bytes: []const u8,
+    i: u32 = 0,
+
+    pub fn next(self: *Iterator) ?CodePoint {
+        if (self.i >= self.bytes.len) return null;
+
+        if (self.bytes[self.i] < 128) {
+            // ASCII fast path
+            defer self.i += 1;
+
+            return .{
+                .code = self.bytes[self.i],
+                .len = 1,
+                .offset = self.i,
+            };
+        }
+
+        var cp = CodePoint{
+            .code = undefined,
+            .len = switch (self.bytes[self.i]) {
+                0b1100_0000...0b1101_1111 => 2,
+                0b1110_0000...0b1110_1111 => 3,
+                0b1111_0000...0b1111_0111 => 4,
+                else => {
+                    defer self.i += 1;
+                    // Unicode replacement code point.
+                    return .{
+                        .code = 0xfffd,
+                        .len = 1,
+                        .offset = self.i,
+                    };
+                },
+            },
+            .offset = self.i,
+        };
+
+        const cp_bytes = self.bytes[self.i..][0..cp.len];
+        self.i += cp.len;
+
+        cp.code = switch (cp.len) {
+            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
+
+            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
+                (cp_bytes[1] & 0b00111111)) << 6) |
+                (cp_bytes[2] & 0b00111111),
+
+            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
+                (cp_bytes[1] & 0b00111111)) << 6) |
+                (cp_bytes[2] & 0b00111111)) << 6) |
+                (cp_bytes[3] & 0b00111111),
+
+            else => @panic("CodePointIterator.next invalid code point length."),
+        };
+
+        return cp;
+    }
+
+    pub fn peek(self: *Iterator) ?CodePoint {
+        const saved_i = self.i;
+        defer self.i = saved_i;
+        return self.next();
+    }
+};
+
+test "peek" {
+    var iter = Iterator{ .bytes = "Hi" };
+
+    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
+    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
+    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
+    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
+    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
+}
diff --git a/src/display_width.zig b/src/display_width.zig
index ba76052..e52da38 100644
--- a/src/display_width.zig
+++ b/src/display_width.zig
@@ -2,8 +2,8 @@ const std = @import("std");
 const simd = std.simd;
 const testing = std.testing;
 
-const CodePointIterator = @import("CodePoint").CodePointIterator;
-const GraphemeIterator = @import("Grapheme").GraphemeIterator;
+const CodePointIterator = @import("code_point").Iterator;
+const GraphemeIterator = @import("grapheme").Iterator;
 const dwp = @import("dwp");
 
 /// codePointWidth returns the number of cells `cp` requires when rendered
-- 
cgit v1.2.3