diff options
| author | 2024-02-18 08:48:03 -0400 | |
|---|---|---|
| committer | 2024-02-18 08:48:03 -0400 | |
| commit | 1404c85f513a88bbd399ab9f3453da71e7478727 (patch) | |
| tree | 0080678ceac38f223910d60bf650ebaddf27b0f9 /src | |
| parent | Fixed isAsciiOnly and CodePointIterator ASCII bugs (diff) | |
| download | zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.gz zg-1404c85f513a88bbd399ab9f3453da71e7478727.tar.xz zg-1404c85f513a88bbd399ab9f3453da71e7478727.zip | |
Code point and grapheme are now namespaces.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 67 | ||||
| -rw-r--r-- | src/code_point.zig (renamed from src/CodePoint.zig) | 39 | ||||
| -rw-r--r-- | src/display_width.zig | 4 |
3 files changed, 54 insertions, 56 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 910aec5..f013aba 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -1,30 +1,25 @@ | |||
| 1 | //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | ||
| 2 | |||
| 3 | const std = @import("std"); | 1 | const std = @import("std"); |
| 4 | const unicode = std.unicode; | 2 | const unicode = std.unicode; |
| 5 | 3 | ||
| 6 | const CodePoint = @import("CodePoint"); | 4 | const CodePoint = @import("code_point").CodePoint; |
| 7 | const CodePointIterator = CodePoint.CodePointIterator; | 5 | const CodePointIterator = @import("code_point").Iterator; |
| 8 | const gbp = @import("gbp"); | 6 | const gbp = @import("gbp"); |
| 9 | 7 | ||
| 10 | pub const Grapheme = @This(); | 8 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 11 | 9 | pub const Grapheme = struct { | |
| 12 | len: usize, | 10 | len: u8, |
| 13 | offset: usize, | 11 | offset: u32, |
| 14 | 12 | ||
| 15 | /// `eql` compares `other` with the bytes of this grapheme cluster in `src` for equality. | 13 | /// `bytes` returns the slice of bytes that correspond to |
| 16 | pub fn eql(self: Grapheme, src: []const u8, other: []const u8) bool { | 14 | /// this grapheme cluster in `src`. |
| 17 | return std.mem.eql(u8, src[self.offset .. self.offset + self.len], other); | 15 | pub fn bytes(self: Grapheme, src: []const u8) []const u8 { |
| 18 | } | 16 | return src[self.offset..][0..self.len]; |
| 19 | 17 | } | |
| 20 | /// `slice` returns the bytes that correspond to this grapheme cluster in `src`. | 18 | }; |
| 21 | pub fn slice(self: Grapheme, src: []const u8) []const u8 { | ||
| 22 | return src[self.offset .. self.offset + self.len]; | ||
| 23 | } | ||
| 24 | 19 | ||
| 25 | /// `GraphemeIterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. | 20 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. |
| 26 | pub const GraphemeIterator = struct { | 21 | pub const Iterator = struct { |
| 27 | buf: [2]?CodePoint = [_]?CodePoint{ null, null }, | 22 | buf: [2]?CodePoint = .{ null, null }, |
| 28 | cp_iter: CodePointIterator, | 23 | cp_iter: CodePointIterator, |
| 29 | 24 | ||
| 30 | const Self = @This(); | 25 | const Self = @This(); |
| @@ -32,8 +27,7 @@ pub const GraphemeIterator = struct { | |||
| 32 | /// Assumes `str` is valid UTF-8. | 27 | /// Assumes `str` is valid UTF-8. |
| 33 | pub fn init(str: []const u8) Self { | 28 | pub fn init(str: []const u8) Self { |
| 34 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; | 29 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; |
| 35 | self.buf[1] = self.cp_iter.next(); | 30 | self.advance(); |
| 36 | |||
| 37 | return self; | 31 | return self; |
| 38 | } | 32 | } |
| 39 | 33 | ||
| @@ -55,7 +49,7 @@ pub const GraphemeIterator = struct { | |||
| 55 | } | 49 | } |
| 56 | 50 | ||
| 57 | const gc_start = self.buf[0].?.offset; | 51 | const gc_start = self.buf[0].?.offset; |
| 58 | var gc_len: usize = self.buf[0].?.len; | 52 | var gc_len: u8 = self.buf[0].?.len; |
| 59 | var state = State{}; | 53 | var state = State{}; |
| 60 | 54 | ||
| 61 | if (graphemeBreak( | 55 | if (graphemeBreak( |
| @@ -266,13 +260,13 @@ test "Segmentation GraphemeIterator" { | |||
| 266 | defer all_bytes.deinit(); | 260 | defer all_bytes.deinit(); |
| 267 | 261 | ||
| 268 | var graphemes = std.mem.split(u8, line, " ÷ "); | 262 | var graphemes = std.mem.split(u8, line, " ÷ "); |
| 269 | var bytes_index: usize = 0; | 263 | var bytes_index: u32 = 0; |
| 270 | 264 | ||
| 271 | while (graphemes.next()) |field| { | 265 | while (graphemes.next()) |field| { |
| 272 | var code_points = std.mem.split(u8, field, " "); | 266 | var code_points = std.mem.split(u8, field, " "); |
| 273 | var cp_buf: [4]u8 = undefined; | 267 | var cp_buf: [4]u8 = undefined; |
| 274 | var cp_index: usize = 0; | 268 | var cp_index: u32 = 0; |
| 275 | var gc_len: usize = 0; | 269 | var gc_len: u8 = 0; |
| 276 | 270 | ||
| 277 | while (code_points.next()) |code_point| { | 271 | while (code_points.next()) |code_point| { |
| 278 | if (std.mem.eql(u8, code_point, "×")) continue; | 272 | if (std.mem.eql(u8, code_point, "×")) continue; |
| @@ -288,12 +282,15 @@ test "Segmentation GraphemeIterator" { | |||
| 288 | } | 282 | } |
| 289 | 283 | ||
| 290 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); |
| 291 | var iter = GraphemeIterator.init(all_bytes.items); | 285 | var iter = Iterator.init(all_bytes.items); |
| 292 | 286 | ||
| 293 | // Check. | 287 | // Check. |
| 294 | for (want.items) |w| { | 288 | for (want.items) |want_gc| { |
| 295 | const g = (iter.next()).?; | 289 | const got_gc = (iter.next()).?; |
| 296 | try std.testing.expect(w.eql(all_bytes.items, all_bytes.items[g.offset .. g.offset + g.len])); | 290 | try std.testing.expectEqualStrings( |
| 291 | want_gc.bytes(all_bytes.items), | ||
| 292 | got_gc.bytes(all_bytes.items), | ||
| 293 | ); | ||
| 297 | } | 294 | } |
| 298 | } | 295 | } |
| 299 | } | 296 | } |
| @@ -303,10 +300,10 @@ test "Segmentation comptime GraphemeIterator" { | |||
| 303 | 300 | ||
| 304 | comptime { | 301 | comptime { |
| 305 | const src = "Héllo"; | 302 | const src = "Héllo"; |
| 306 | var ct_iter = GraphemeIterator.init(src); | 303 | var ct_iter = Iterator.init(src); |
| 307 | var i = 0; | 304 | var i = 0; |
| 308 | while (ct_iter.next()) |grapheme| : (i += 1) { | 305 | while (ct_iter.next()) |grapheme| : (i += 1) { |
| 309 | try std.testing.expect(grapheme.eql(src, want[i])); | 306 | try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); |
| 310 | } | 307 | } |
| 311 | } | 308 | } |
| 312 | } | 309 | } |
| @@ -318,17 +315,17 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 318 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 315 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 319 | const no_joiner = seq_1 ++ seq_2; | 316 | const no_joiner = seq_1 ++ seq_2; |
| 320 | 317 | ||
| 321 | var ct_iter = GraphemeIterator.init(with_zwj); | 318 | var ct_iter = Iterator.init(with_zwj); |
| 322 | var i: usize = 0; | 319 | var i: usize = 0; |
| 323 | while (ct_iter.next()) |_| : (i += 1) {} | 320 | while (ct_iter.next()) |_| : (i += 1) {} |
| 324 | try std.testing.expectEqual(@as(usize, 1), i); | 321 | try std.testing.expectEqual(@as(usize, 1), i); |
| 325 | 322 | ||
| 326 | ct_iter = GraphemeIterator.init(with_zwsp); | 323 | ct_iter = Iterator.init(with_zwsp); |
| 327 | i = 0; | 324 | i = 0; |
| 328 | while (ct_iter.next()) |_| : (i += 1) {} | 325 | while (ct_iter.next()) |_| : (i += 1) {} |
| 329 | try std.testing.expectEqual(@as(usize, 3), i); | 326 | try std.testing.expectEqual(@as(usize, 3), i); |
| 330 | 327 | ||
| 331 | ct_iter = GraphemeIterator.init(no_joiner); | 328 | ct_iter = Iterator.init(no_joiner); |
| 332 | i = 0; | 329 | i = 0; |
| 333 | while (ct_iter.next()) |_| : (i += 1) {} | 330 | while (ct_iter.next()) |_| : (i += 1) {} |
| 334 | try std.testing.expectEqual(@as(usize, 2), i); | 331 | try std.testing.expectEqual(@as(usize, 2), i); |
diff --git a/src/CodePoint.zig b/src/code_point.zig index 62dd793..ac37562 100644 --- a/src/CodePoint.zig +++ b/src/code_point.zig | |||
| @@ -1,28 +1,29 @@ | |||
| 1 | //! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes. | ||
| 2 | |||
| 3 | const std = @import("std"); | 1 | const std = @import("std"); |
| 4 | 2 | ||
| 5 | code: u21, | 3 | /// `CodePoint` represents a Unicode code point by its code, |
| 6 | len: u3, | 4 | /// length, and offset in the source bytes. |
| 7 | offset: usize, | 5 | pub const CodePoint = struct { |
| 8 | 6 | code: u21, | |
| 9 | const CodePoint = @This(); | 7 | len: u3, |
| 8 | offset: u32, | ||
| 9 | }; | ||
| 10 | 10 | ||
| 11 | /// `CodePointIterator` iterates a string one `CodePoint` at-a-time. | 11 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| 12 | pub const CodePointIterator = struct { | 12 | pub const Iterator = struct { |
| 13 | bytes: []const u8, | 13 | bytes: []const u8, |
| 14 | i: usize = 0, | 14 | i: u32 = 0, |
| 15 | 15 | ||
| 16 | pub fn next(self: *CodePointIterator) ?CodePoint { | 16 | pub fn next(self: *Iterator) ?CodePoint { |
| 17 | if (self.i >= self.bytes.len) return null; | 17 | if (self.i >= self.bytes.len) return null; |
| 18 | 18 | ||
| 19 | if (self.bytes[self.i] < 128) { | 19 | if (self.bytes[self.i] < 128) { |
| 20 | // ASCII fast path | 20 | // ASCII fast path |
| 21 | self.i += 1; | 21 | defer self.i += 1; |
| 22 | |||
| 22 | return .{ | 23 | return .{ |
| 23 | .code = self.bytes[self.i - 1], | 24 | .code = self.bytes[self.i], |
| 24 | .len = 1, | 25 | .len = 1, |
| 25 | .offset = self.i - 1, | 26 | .offset = self.i, |
| 26 | }; | 27 | }; |
| 27 | } | 28 | } |
| 28 | 29 | ||
| @@ -33,12 +34,12 @@ pub const CodePointIterator = struct { | |||
| 33 | 0b1110_0000...0b1110_1111 => 3, | 34 | 0b1110_0000...0b1110_1111 => 3, |
| 34 | 0b1111_0000...0b1111_0111 => 4, | 35 | 0b1111_0000...0b1111_0111 => 4, |
| 35 | else => { | 36 | else => { |
| 36 | self.i += 1; | 37 | defer self.i += 1; |
| 37 | // Unicode replacement code point. | 38 | // Unicode replacement code point. |
| 38 | return .{ | 39 | return .{ |
| 39 | .code = 0xfffd, | 40 | .code = 0xfffd, |
| 40 | .len = 1, | 41 | .len = 1, |
| 41 | .offset = self.i - 1, | 42 | .offset = self.i, |
| 42 | }; | 43 | }; |
| 43 | }, | 44 | }, |
| 44 | }, | 45 | }, |
| @@ -66,15 +67,15 @@ pub const CodePointIterator = struct { | |||
| 66 | return cp; | 67 | return cp; |
| 67 | } | 68 | } |
| 68 | 69 | ||
| 69 | pub fn peek(self: *CodePointIterator) ?CodePoint { | 70 | pub fn peek(self: *Iterator) ?CodePoint { |
| 70 | const saved_i = self.i; | 71 | const saved_i = self.i; |
| 71 | defer self.i = saved_i; | 72 | defer self.i = saved_i; |
| 72 | return self.next(); | 73 | return self.next(); |
| 73 | } | 74 | } |
| 74 | }; | 75 | }; |
| 75 | 76 | ||
| 76 | test "CodePointIterator peek" { | 77 | test "peek" { |
| 77 | var iter = CodePointIterator{ .bytes = "Hi" }; | 78 | var iter = Iterator{ .bytes = "Hi" }; |
| 78 | 79 | ||
| 79 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); | 80 | try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code); |
| 80 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); | 81 | try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code); |
diff --git a/src/display_width.zig b/src/display_width.zig index ba76052..e52da38 100644 --- a/src/display_width.zig +++ b/src/display_width.zig | |||
| @@ -2,8 +2,8 @@ const std = @import("std"); | |||
| 2 | const simd = std.simd; | 2 | const simd = std.simd; |
| 3 | const testing = std.testing; | 3 | const testing = std.testing; |
| 4 | 4 | ||
| 5 | const CodePointIterator = @import("CodePoint").CodePointIterator; | 5 | const CodePointIterator = @import("code_point").Iterator; |
| 6 | const GraphemeIterator = @import("Grapheme").GraphemeIterator; | 6 | const GraphemeIterator = @import("grapheme").Iterator; |
| 7 | const dwp = @import("dwp"); | 7 | const dwp = @import("dwp"); |
| 8 | 8 | ||
| 9 | /// codePointWidth returns the number of cells `cp` requires when rendered | 9 | /// codePointWidth returns the number of cells `cp` requires when rendered |