diff options
| author | 2024-11-03 19:29:09 +0000 | |
|---|---|---|
| committer | 2024-11-03 19:29:09 +0000 | |
| commit | d667d180c82c83d5c3b41853d80b12536084404e (patch) | |
| tree | 0424b9f9e54972837652042dc858dfe5ba12b5de /src | |
| parent | Merge pull request 'GraphemeData and WidthData: make init read errors unreach... (diff) | |
| parent | Add peek() to Grapheme.Iterator (diff) | |
| download | zg-d667d180c82c83d5c3b41853d80b12536084404e.tar.gz zg-d667d180c82c83d5c3b41853d80b12536084404e.tar.xz zg-d667d180c82c83d5c3b41853d80b12536084404e.zip | |
Merge pull request 'grapheme-peek' (#18) from atman/zg:grapheme-peek into master
Reviewed-on: https://codeberg.org/dude_the_builder/zg/pulls/18
Diffstat (limited to 'src')
| -rw-r--r-- | src/grapheme.zig | 69 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 42 |
2 files changed, 103 insertions, 8 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig index 911c856..7538f5b 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -77,6 +77,75 @@ pub const Iterator = struct { | |||
| 77 | 77 | ||
| 78 | return Grapheme{ .len = gc_len, .offset = gc_start }; | 78 | return Grapheme{ .len = gc_len, .offset = gc_start }; |
| 79 | } | 79 | } |
| 80 | |||
| 81 | pub fn peek(self: *Self) ?Grapheme { | ||
| 82 | const saved_cp_iter = self.cp_iter; | ||
| 83 | const s0 = self.buf[0]; | ||
| 84 | const s1 = self.buf[1]; | ||
| 85 | |||
| 86 | self.advance(); | ||
| 87 | |||
| 88 | // If no code point remains after advancing, restore the saved iterator state and return null | ||
| 89 | if (self.buf[0] == null) { | ||
| 90 | self.cp_iter = saved_cp_iter; | ||
| 91 | self.buf[0] = s0; | ||
| 92 | self.buf[1] = s1; | ||
| 93 | return null; | ||
| 94 | } | ||
| 95 | // If there is no lookahead code point (buf[1] is null), buf[0] alone is returned as the grapheme | ||
| 96 | if (self.buf[1] == null) { | ||
| 97 | const len = self.buf[0].?.len; | ||
| 98 | const offset = self.buf[0].?.offset; | ||
| 99 | self.cp_iter = saved_cp_iter; | ||
| 100 | self.buf[0] = s0; | ||
| 101 | self.buf[1] = s1; | ||
| 102 | return Grapheme{ .len = len, .offset = offset }; | ||
| 103 | } | ||
| 104 | // ASCII fast path: if both code points are below 128 and buf[0] is not '\r', return buf[0] as the grapheme without calling graphemeBreak | ||
| 105 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 106 | const len = self.buf[0].?.len; | ||
| 107 | const offset = self.buf[0].?.offset; | ||
| 108 | self.cp_iter = saved_cp_iter; | ||
| 109 | self.buf[0] = s0; | ||
| 110 | self.buf[1] = s1; | ||
| 111 | return Grapheme{ .len = len, .offset = offset }; | ||
| 112 | } | ||
| 113 | |||
| 114 | const gc_start = self.buf[0].?.offset; | ||
| 115 | var gc_len: u8 = self.buf[0].?.len; | ||
| 116 | var state = State{}; | ||
| 117 | |||
| 118 | if (graphemeBreak( | ||
| 119 | self.buf[0].?.code, | ||
| 120 | self.buf[1].?.code, | ||
| 121 | self.data, | ||
| 122 | &state, | ||
| 123 | )) { | ||
| 124 | self.cp_iter = saved_cp_iter; | ||
| 125 | self.buf[0] = s0; | ||
| 126 | self.buf[1] = s1; | ||
| 127 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 128 | } | ||
| 129 | |||
| 130 | while (true) { | ||
| 131 | self.advance(); | ||
| 132 | if (self.buf[0] == null) break; | ||
| 133 | |||
| 134 | gc_len += self.buf[0].?.len; | ||
| 135 | |||
| 136 | if (graphemeBreak( | ||
| 137 | self.buf[0].?.code, | ||
| 138 | if (self.buf[1]) |ncp| ncp.code else 0, | ||
| 139 | self.data, | ||
| 140 | &state, | ||
| 141 | )) break; | ||
| 142 | } | ||
| 143 | self.cp_iter = saved_cp_iter; | ||
| 144 | self.buf[0] = s0; | ||
| 145 | self.buf[1] = s1; | ||
| 146 | |||
| 147 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 148 | } | ||
| 80 | }; | 149 | }; |
| 81 | 150 | ||
| 82 | // Predicates | 151 | // Predicates |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 448ce41..245c03f 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -7,11 +7,37 @@ const mem = std.mem; | |||
| 7 | const testing = std.testing; | 7 | const testing = std.testing; |
| 8 | const unicode = std.unicode; | 8 | const unicode = std.unicode; |
| 9 | 9 | ||
| 10 | const grapheme = @import("grapheme"); | ||
| 10 | const Grapheme = @import("grapheme").Grapheme; | 11 | const Grapheme = @import("grapheme").Grapheme; |
| 11 | const GraphemeData = @import("grapheme").GraphemeData; | 12 | const GraphemeData = @import("grapheme").GraphemeData; |
| 12 | const GraphemeIterator = @import("grapheme").Iterator; | 13 | const GraphemeIterator = @import("grapheme").Iterator; |
| 13 | const Normalize = @import("Normalize"); | 14 | const Normalize = @import("Normalize"); |
| 14 | 15 | ||
| 16 | comptime { | ||
| 17 | testing.refAllDecls(grapheme); | ||
| 18 | } | ||
| 19 | test "Iterator.peek" { | ||
| 20 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 21 | const data = try GraphemeData.init(std.testing.allocator); | ||
| 22 | defer data.deinit(); | ||
| 23 | |||
| 24 | var iter = grapheme.Iterator.init(peek_seq, &data); | ||
| 25 | const peek_a = iter.peek().?; | ||
| 26 | const next_a = iter.next().?; | ||
| 27 | try std.testing.expectEqual(peek_a, next_a); | ||
| 28 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 29 | const peek_d1 = iter.peek().?; | ||
| 30 | const peek_d2 = iter.peek().?; | ||
| 31 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 32 | const next_d = iter.next().?; | ||
| 33 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 34 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 35 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 36 | try std.testing.expectEqual(null, iter.peek()); | ||
| 37 | try std.testing.expectEqual(null, iter.peek()); | ||
| 38 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 39 | } | ||
| 40 | |||
| 15 | test "Unicode normalization tests" { | 41 | test "Unicode normalization tests" { |
| 16 | var arena = heap.ArenaAllocator.init(testing.allocator); | 42 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| 17 | defer arena.deinit(); | 43 | defer arena.deinit(); |
| @@ -35,7 +61,7 @@ test "Unicode normalization tests" { | |||
| 35 | // Skip comments or empty lines. | 61 | // Skip comments or empty lines. |
| 36 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | 62 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; |
| 37 | // Iterate over fields. | 63 | // Iterate over fields. |
| 38 | var fields = mem.split(u8, line, ";"); | 64 | var fields = mem.splitScalar(u8, line, ';'); |
| 39 | var field_index: usize = 0; | 65 | var field_index: usize = 0; |
| 40 | var input: []u8 = undefined; | 66 | var input: []u8 = undefined; |
| 41 | defer allocator.free(input); | 67 | defer allocator.free(input); |
| @@ -45,7 +71,7 @@ test "Unicode normalization tests" { | |||
| 45 | var i_buf = std.ArrayList(u8).init(allocator); | 71 | var i_buf = std.ArrayList(u8).init(allocator); |
| 46 | defer i_buf.deinit(); | 72 | defer i_buf.deinit(); |
| 47 | 73 | ||
| 48 | var i_fields = mem.split(u8, field, " "); | 74 | var i_fields = mem.splitScalar(u8, field, ' '); |
| 49 | while (i_fields.next()) |s| { | 75 | while (i_fields.next()) |s| { |
| 50 | const icp = try fmt.parseInt(u21, s, 16); | 76 | const icp = try fmt.parseInt(u21, s, 16); |
| 51 | const len = try unicode.utf8Encode(icp, &cp_buf); | 77 | const len = try unicode.utf8Encode(icp, &cp_buf); |
| @@ -59,7 +85,7 @@ test "Unicode normalization tests" { | |||
| 59 | var w_buf = std.ArrayList(u8).init(allocator); | 85 | var w_buf = std.ArrayList(u8).init(allocator); |
| 60 | defer w_buf.deinit(); | 86 | defer w_buf.deinit(); |
| 61 | 87 | ||
| 62 | var w_fields = mem.split(u8, field, " "); | 88 | var w_fields = mem.splitScalar(u8, field, ' '); |
| 63 | while (w_fields.next()) |s| { | 89 | while (w_fields.next()) |s| { |
| 64 | const wcp = try fmt.parseInt(u21, s, 16); | 90 | const wcp = try fmt.parseInt(u21, s, 16); |
| 65 | const len = try unicode.utf8Encode(wcp, &cp_buf); | 91 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| @@ -76,7 +102,7 @@ test "Unicode normalization tests" { | |||
| 76 | var w_buf = std.ArrayList(u8).init(allocator); | 102 | var w_buf = std.ArrayList(u8).init(allocator); |
| 77 | defer w_buf.deinit(); | 103 | defer w_buf.deinit(); |
| 78 | 104 | ||
| 79 | var w_fields = mem.split(u8, field, " "); | 105 | var w_fields = mem.splitScalar(u8, field, ' '); |
| 80 | while (w_fields.next()) |s| { | 106 | while (w_fields.next()) |s| { |
| 81 | const wcp = try fmt.parseInt(u21, s, 16); | 107 | const wcp = try fmt.parseInt(u21, s, 16); |
| 82 | const len = try unicode.utf8Encode(wcp, &cp_buf); | 108 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| @@ -93,7 +119,7 @@ test "Unicode normalization tests" { | |||
| 93 | var w_buf = std.ArrayList(u8).init(allocator); | 119 | var w_buf = std.ArrayList(u8).init(allocator); |
| 94 | defer w_buf.deinit(); | 120 | defer w_buf.deinit(); |
| 95 | 121 | ||
| 96 | var w_fields = mem.split(u8, field, " "); | 122 | var w_fields = mem.splitScalar(u8, field, ' '); |
| 97 | while (w_fields.next()) |s| { | 123 | while (w_fields.next()) |s| { |
| 98 | const wcp = try fmt.parseInt(u21, s, 16); | 124 | const wcp = try fmt.parseInt(u21, s, 16); |
| 99 | const len = try unicode.utf8Encode(wcp, &cp_buf); | 125 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| @@ -110,7 +136,7 @@ test "Unicode normalization tests" { | |||
| 110 | var w_buf = std.ArrayList(u8).init(allocator); | 136 | var w_buf = std.ArrayList(u8).init(allocator); |
| 111 | defer w_buf.deinit(); | 137 | defer w_buf.deinit(); |
| 112 | 138 | ||
| 113 | var w_fields = mem.split(u8, field, " "); | 139 | var w_fields = mem.splitScalar(u8, field, ' '); |
| 114 | while (w_fields.next()) |s| { | 140 | while (w_fields.next()) |s| { |
| 115 | const wcp = try fmt.parseInt(u21, s, 16); | 141 | const wcp = try fmt.parseInt(u21, s, 16); |
| 116 | const len = try unicode.utf8Encode(wcp, &cp_buf); | 142 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| @@ -158,11 +184,11 @@ test "Segmentation GraphemeIterator" { | |||
| 158 | var all_bytes = std.ArrayList(u8).init(allocator); | 184 | var all_bytes = std.ArrayList(u8).init(allocator); |
| 159 | defer all_bytes.deinit(); | 185 | defer all_bytes.deinit(); |
| 160 | 186 | ||
| 161 | var graphemes = std.mem.split(u8, line, " ÷ "); | 187 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); |
| 162 | var bytes_index: u32 = 0; | 188 | var bytes_index: u32 = 0; |
| 163 | 189 | ||
| 164 | while (graphemes.next()) |field| { | 190 | while (graphemes.next()) |field| { |
| 165 | var code_points = std.mem.split(u8, field, " "); | 191 | var code_points = std.mem.splitScalar(u8, field, ' '); |
| 166 | var cp_buf: [4]u8 = undefined; | 192 | var cp_buf: [4]u8 = undefined; |
| 167 | var cp_index: u32 = 0; | 193 | var cp_index: u32 = 0; |
| 168 | var gc_len: u8 = 0; | 194 | var gc_len: u8 = 0; |