From 1404c85f513a88bbd399ab9f3453da71e7478727 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sun, 18 Feb 2024 08:48:03 -0400 Subject: Code point and grapheme are now namespaces. --- src/CodePoint.zig | 84 -------------------------------------------------- src/Grapheme.zig | 67 +++++++++++++++++++--------------------- src/code_point.zig | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/display_width.zig | 4 +-- 4 files changed, 119 insertions(+), 121 deletions(-) delete mode 100644 src/CodePoint.zig create mode 100644 src/code_point.zig (limited to 'src') diff --git a/src/CodePoint.zig b/src/CodePoint.zig deleted file mode 100644 index 62dd793..0000000 --- a/src/CodePoint.zig +++ /dev/null @@ -1,84 +0,0 @@ -//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes. - -const std = @import("std"); - -code: u21, -len: u3, -offset: usize, - -const CodePoint = @This(); - -/// `CodePointIterator` iterates a string one `CodePoint` at-a-time. -pub const CodePointIterator = struct { - bytes: []const u8, - i: usize = 0, - - pub fn next(self: *CodePointIterator) ?CodePoint { - if (self.i >= self.bytes.len) return null; - - if (self.bytes[self.i] < 128) { - // ASCII fast path - self.i += 1; - return .{ - .code = self.bytes[self.i - 1], - .len = 1, - .offset = self.i - 1, - }; - } - - var cp = CodePoint{ - .code = undefined, - .len = switch (self.bytes[self.i]) { - 0b1100_0000...0b1101_1111 => 2, - 0b1110_0000...0b1110_1111 => 3, - 0b1111_0000...0b1111_0111 => 4, - else => { - self.i += 1; - // Unicode replacement code point. 
- return .{ - .code = 0xfffd, - .len = 1, - .offset = self.i - 1, - }; - }, - }, - .offset = self.i, - }; - - const cp_bytes = self.bytes[self.i..][0..cp.len]; - self.i += cp.len; - - cp.code = switch (cp.len) { - 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), - - 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | - (cp_bytes[1] & 0b00111111)) << 6) | - (cp_bytes[2] & 0b00111111), - - 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | - (cp_bytes[1] & 0b00111111)) << 6) | - (cp_bytes[2] & 0b00111111)) << 6) | - (cp_bytes[3] & 0b00111111), - - else => @panic("CodePointIterator.next invalid code point length."), - }; - - return cp; - } - - pub fn peek(self: *CodePointIterator) ?CodePoint { - const saved_i = self.i; - defer self.i = saved_i; - return self.next(); - } -}; - -test "CodePointIterator peek" { - var iter = CodePointIterator{ .bytes = "Hi" }; - - try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); - try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); - try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); - try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); -} diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 910aec5..f013aba 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -1,30 +1,25 @@ -//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. - const std = @import("std"); const unicode = std.unicode; -const CodePoint = @import("CodePoint"); -const CodePointIterator = CodePoint.CodePointIterator; +const CodePoint = @import("code_point").CodePoint; +const CodePointIterator = @import("code_point").Iterator; const gbp = @import("gbp"); -pub const Grapheme = @This(); - -len: usize, -offset: usize, +/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 
+pub const Grapheme = struct { + len: u8, + offset: u32, -/// `eql` comparse `str` with the bytes of this grapheme cluster in `src` for equality. -pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool { - return std.mem.eql(u8, src[self.offset .. self.offset + self.len], other); -} - -/// `slice` returns the bytes that correspond to this grapheme cluster in `src`. -pub fn slice(self: Grapheme, src: []const u8) []const u8 { - return src[self.offset .. self.offset + self.len]; -} + /// `bytes` returns the slice of bytes that correspond to + /// this grapheme cluster in `src`. + pub fn bytes(self: Grapheme, src: []const u8) []const u8 { + return src[self.offset..][0..self.len]; + } +}; -/// `GraphemeIterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. -pub const GraphemeIterator = struct { - buf: [2]?CodePoint = [_]?CodePoint{ null, null }, +/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. +pub const Iterator = struct { + buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointIterator, const Self = @This(); @@ -32,8 +27,7 @@ pub const GraphemeIterator = struct { /// Assumes `src` is valid UTF-8. 
pub fn init(str: []const u8) Self { var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; - self.buf[1] = self.cp_iter.next(); - + self.advance(); return self; } @@ -55,7 +49,7 @@ pub const GraphemeIterator = struct { } const gc_start = self.buf[0].?.offset; - var gc_len: usize = self.buf[0].?.len; + var gc_len: u8 = self.buf[0].?.len; var state = State{}; if (graphemeBreak( @@ -266,13 +260,13 @@ test "Segmentation GraphemeIterator" { defer all_bytes.deinit(); var graphemes = std.mem.split(u8, line, " ÷ "); - var bytes_index: usize = 0; + var bytes_index: u32 = 0; while (graphemes.next()) |field| { var code_points = std.mem.split(u8, field, " "); var cp_buf: [4]u8 = undefined; - var cp_index: usize = 0; - var gc_len: usize = 0; + var cp_index: u32 = 0; + var gc_len: u8 = 0; while (code_points.next()) |code_point| { if (std.mem.eql(u8, code_point, "×")) continue; @@ -288,12 +282,15 @@ test "Segmentation GraphemeIterator" { } // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = GraphemeIterator.init(all_bytes.items); + var iter = Iterator.init(all_bytes.items); // Chaeck. - for (want.items) |w| { - const g = (iter.next()).?; - try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. 
g.offset + g.len])); + for (want.items) |want_gc| { + const got_gc = (iter.next()).?; + try std.testing.expectEqualStrings( + want_gc.bytes(all_bytes.items), + got_gc.bytes(all_bytes.items), + ); } } } @@ -303,10 +300,10 @@ test "Segmentation comptime GraphemeIterator" { comptime { const src = "Héllo"; - var ct_iter = GraphemeIterator.init(src); + var ct_iter = Iterator.init(src); var i = 0; while (ct_iter.next()) |grapheme| : (i += 1) { - try std.testing.expect(grapheme.eql(src, want[i])); + try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); } } } @@ -318,17 +315,17 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; const no_joiner = seq_1 ++ seq_2; - var ct_iter = GraphemeIterator.init(with_zwj); + var ct_iter = Iterator.init(with_zwj); var i: usize = 0; while (ct_iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 1), i); - ct_iter = GraphemeIterator.init(with_zwsp); + ct_iter = Iterator.init(with_zwsp); i = 0; while (ct_iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 3), i); - ct_iter = GraphemeIterator.init(no_joiner); + ct_iter = Iterator.init(no_joiner); i = 0; while (ct_iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 2), i); diff --git a/src/code_point.zig b/src/code_point.zig new file mode 100644 index 0000000..ac37562 --- /dev/null +++ b/src/code_point.zig @@ -0,0 +1,85 @@ +const std = @import("std"); + +/// `CodePoint` represents a Unicode code point by its code, +/// length, and offset in the source bytes. +pub const CodePoint = struct { + code: u21, + len: u3, + offset: u32, +}; + +/// `Iterator` iterates a string one `CodePoint` at-a-time. 
+pub const Iterator = struct { + bytes: []const u8, + i: u32 = 0, + + pub fn next(self: *Iterator) ?CodePoint { + if (self.i >= self.bytes.len) return null; + + if (self.bytes[self.i] < 128) { + // ASCII fast path: a byte below 128 is returned as a complete one-byte code point. + defer self.i += 1; + + return .{ + .code = self.bytes[self.i], + .len = 1, + .offset = self.i, + }; + } + + var cp = CodePoint{ + .code = undefined, + .len = switch (self.bytes[self.i]) { + 0b1100_0000...0b1101_1111 => 2, + 0b1110_0000...0b1110_1111 => 3, + 0b1111_0000...0b1111_0111 => 4, + else => { + defer self.i += 1; + // Byte matches none of the lead patterns above: consume it alone and yield U+FFFD, the Unicode replacement code point. + return .{ + .code = 0xfffd, + .len = 1, + .offset = self.i, + }; + }, + }, + .offset = self.i, + }; + + const cp_bytes = self.bytes[self.i..][0..cp.len]; + self.i += cp.len; + + cp.code = switch (cp.len) { + 2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111), + + 3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) | + (cp_bytes[1] & 0b00111111)) << 6) | + (cp_bytes[2] & 0b00111111), + + 4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) | + (cp_bytes[1] & 0b00111111)) << 6) | + (cp_bytes[2] & 0b00111111)) << 6) | + (cp_bytes[3] & 0b00111111), + + else => @panic("CodePointIterator.next invalid code point length."), + }; + + return cp; + } + + pub fn peek(self: *Iterator) ?CodePoint { + const saved_i = self.i; + defer self.i = saved_i; + return self.next(); + } +}; + +test "peek" { + var iter = Iterator{ .bytes = "Hi" }; + + try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); + try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); + try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code); + try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); + try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); +} diff --git a/src/display_width.zig b/src/display_width.zig index ba76052..e52da38 100644 --- a/src/display_width.zig +++ b/src/display_width.zig @@ -2,8 +2,8 @@ const std = @import("std"); const simd = std.simd; const testing 
= std.testing; -const CodePointIterator = @import("CodePoint").CodePointIterator; -const GraphemeIterator = @import("Grapheme").GraphemeIterator; +const CodePointIterator = @import("code_point").Iterator; +const GraphemeIterator = @import("grapheme").Iterator; const dwp = @import("dwp"); /// codePointWidth returns the number of cells `cp` requires when rendered -- cgit v1.2.3