diff options
| author | 2024-02-14 12:11:07 -0400 | |
|---|---|---|
| committer | 2024-02-14 12:11:07 -0400 | |
| commit | 95bc908ed25be9fa597c559791cbf6d5f5a6b8ed (patch) | |
| tree | 0b8e83d4f9c6e701a0c55e134159f2eea1740f8f /src | |
| parent | Removed unreachables from Grapheme (diff) | |
| download | zg-95bc908ed25be9fa597c559791cbf6d5f5a6b8ed.tar.gz zg-95bc908ed25be9fa597c559791cbf6d5f5a6b8ed.tar.xz zg-95bc908ed25be9fa597c559791cbf6d5f5a6b8ed.zip | |
Removed readCodePoint and StreamingGraphemeIterator
Diffstat (limited to 'src')
| -rw-r--r-- | src/CodePoint.zig | 50 | ||||
| -rw-r--r-- | src/Grapheme.zig | 182 |
2 files changed, 0 insertions, 232 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig index e72823b..c03ecac 100644 --- a/src/CodePoint.zig +++ b/src/CodePoint.zig | |||
| @@ -79,53 +79,3 @@ test "CodePointIterator peek" { | |||
| 79 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); | 79 | try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); |
| 80 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); | 80 | try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); |
| 81 | } | 81 | } |
| 82 | |||
| 83 | /// `readCodePoint` returns the next code point as a `u21` from the given reader, or null at end-of-input. | ||
| 84 | pub fn readCodePoint(reader: anytype) !?u21 { | ||
| 85 | var buf: [4]u8 = undefined; | ||
| 86 | |||
| 87 | buf[0] = reader.readByte() catch |err| switch (err) { | ||
| 88 | error.EndOfStream => return null, | ||
| 89 | else => return err, | ||
| 90 | }; | ||
| 91 | |||
| 92 | if (buf[0] < 128) return @as(u21, buf[0]); | ||
| 93 | |||
| 94 | const len: u3 = switch (buf[0]) { | ||
| 95 | 0b1100_0000...0b1101_1111 => 2, | ||
| 96 | 0b1110_0000...0b1110_1111 => 3, | ||
| 97 | 0b1111_0000...0b1111_0111 => 4, | ||
| 98 | else => return error.InvalidUtf8, | ||
| 99 | }; | ||
| 100 | |||
| 101 | const read = try reader.read(buf[1..len]); | ||
| 102 | |||
| 103 | if (read < len - 1) return error.InvalidUtf8; | ||
| 104 | |||
| 105 | return switch (len) { | ||
| 106 | 2 => (@as(u21, (buf[0] & 0b00011111)) << 6) | (buf[1] & 0b00111111), | ||
| 107 | |||
| 108 | 3 => (((@as(u21, (buf[0] & 0b00001111)) << 6) | | ||
| 109 | (buf[1] & 0b00111111)) << 6) | | ||
| 110 | (buf[2] & 0b00111111), | ||
| 111 | |||
| 112 | 4 => (((((@as(u21, (buf[0] & 0b00000111)) << 6) | | ||
| 113 | (buf[1] & 0b00111111)) << 6) | | ||
| 114 | (buf[2] & 0b00111111)) << 6) | | ||
| 115 | (buf[3] & 0b00111111), | ||
| 116 | |||
| 117 | else => @panic("readCodePoint invalid code point length."), | ||
| 118 | }; | ||
| 119 | } | ||
| 120 | |||
| 121 | test "readCodePoint" { | ||
| 122 | var buf = "abé😹".*; | ||
| 123 | var fis = std.io.fixedBufferStream(&buf); | ||
| 124 | const reader = fis.reader(); | ||
| 125 | |||
| 126 | try std.testing.expectEqual(@as(u21, 'a'), (try readCodePoint(reader)).?); | ||
| 127 | try std.testing.expectEqual(@as(u21, 'b'), (try readCodePoint(reader)).?); | ||
| 128 | try std.testing.expectEqual(@as(u21, 'é'), (try readCodePoint(reader)).?); | ||
| 129 | try std.testing.expectEqual(@as(u21, '😹'), (try readCodePoint(reader)).?); | ||
| 130 | try std.testing.expectEqual(@as(?u21, null), try readCodePoint(reader)); | ||
| 131 | } | ||
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 01eff80..41ea545 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -6,7 +6,6 @@ const unicode = std.unicode; | |||
| 6 | const ziglyph = @import("ziglyph"); | 6 | const ziglyph = @import("ziglyph"); |
| 7 | const CodePoint = @import("CodePoint.zig"); | 7 | const CodePoint = @import("CodePoint.zig"); |
| 8 | const CodePointIterator = CodePoint.CodePointIterator; | 8 | const CodePointIterator = CodePoint.CodePointIterator; |
| 9 | const readCodePoint = CodePoint.readCodePoint; | ||
| 10 | // const emoji = ziglyph.emoji; | 9 | // const emoji = ziglyph.emoji; |
| 11 | // const gbp = ziglyph.grapheme_break; | 10 | // const gbp = ziglyph.grapheme_break; |
| 12 | const gbp = @import("gbp"); | 11 | const gbp = @import("gbp"); |
| @@ -81,171 +80,6 @@ pub const GraphemeIterator = struct { | |||
| 81 | } | 80 | } |
| 82 | }; | 81 | }; |
| 83 | 82 | ||
| 84 | /// `StreamingGraphemeIterator` iterates a `std.io.Reader` one grapheme cluster at-a-time. | ||
| 85 | /// Note that, given the streaming context, each grapheme cluster is returned as a slice of bytes. | ||
| 86 | pub fn StreamingGraphemeIterator(comptime T: type) type { | ||
| 87 | return struct { | ||
| 88 | allocator: std.mem.Allocator, | ||
| 89 | buf: [2]?u21 = [_]?u21{ null, null }, | ||
| 90 | reader: T, | ||
| 91 | |||
| 92 | const Self = @This(); | ||
| 93 | |||
| 94 | pub fn init(allocator: std.mem.Allocator, reader: anytype) !Self { | ||
| 95 | var self = Self{ .allocator = allocator, .reader = reader }; | ||
| 96 | self.buf[1] = try readCodePoint(self.reader); | ||
| 97 | |||
| 98 | return self; | ||
| 99 | } | ||
| 100 | |||
| 101 | /// Caller must free returned bytes with `allocator` passed to `init`. | ||
| 102 | pub fn next(self: *Self) !?[]u8 { | ||
| 103 | const code = (try self.advance()) orelse return null; | ||
| 104 | |||
| 105 | var all_bytes = std.ArrayList(u8).init(self.allocator); | ||
| 106 | errdefer all_bytes.deinit(); | ||
| 107 | |||
| 108 | try encode_and_append(code, &all_bytes); | ||
| 109 | |||
| 110 | // If at end | ||
| 111 | if (self.buf[1] == null) return try all_bytes.toOwnedSlice(); | ||
| 112 | |||
| 113 | // Instant breakers | ||
| 114 | // CR | ||
| 115 | if (code == '\x0d') { | ||
| 116 | if (self.buf[1].? == '\x0a') { | ||
| 117 | // CRLF | ||
| 118 | try encode_and_append(self.buf[1].?, &all_bytes); | ||
| 119 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 120 | } | ||
| 121 | |||
| 122 | return try all_bytes.toOwnedSlice(); | ||
| 123 | } | ||
| 124 | // LF | ||
| 125 | if (code == '\x0a') return try all_bytes.toOwnedSlice(); | ||
| 126 | // Control | ||
| 127 | if (gbp.isControl(code)) return try all_bytes.toOwnedSlice(); | ||
| 128 | |||
| 129 | // Common chars | ||
| 130 | if (code < 0xa9) { | ||
| 131 | // Extend / ignorables loop | ||
| 132 | while (self.buf[1]) |next_cp| { | ||
| 133 | if (next_cp >= 0x300 and isIgnorable(next_cp)) { | ||
| 134 | try encode_and_append(next_cp, &all_bytes); | ||
| 135 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 136 | } else { | ||
| 137 | break; | ||
| 138 | } | ||
| 139 | } | ||
| 140 | |||
| 141 | return try all_bytes.toOwnedSlice(); | ||
| 142 | } | ||
| 143 | |||
| 144 | if (emoji.isExtendedPictographic(code)) { | ||
| 145 | var after_zwj = false; | ||
| 146 | |||
| 147 | // Extend / ignorables loop | ||
| 148 | while (self.buf[1]) |next_cp| { | ||
| 149 | if (next_cp >= 0x300 and | ||
| 150 | after_zwj and | ||
| 151 | emoji.isExtendedPictographic(next_cp)) | ||
| 152 | { | ||
| 153 | try encode_and_append(next_cp, &all_bytes); | ||
| 154 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 155 | after_zwj = false; | ||
| 156 | } else if (next_cp >= 0x300 and isIgnorable(next_cp)) { | ||
| 157 | try encode_and_append(next_cp, &all_bytes); | ||
| 158 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 159 | if (next_cp == '\u{200d}') after_zwj = true; | ||
| 160 | } else { | ||
| 161 | break; | ||
| 162 | } | ||
| 163 | } | ||
| 164 | |||
| 165 | return try all_bytes.toOwnedSlice(); | ||
| 166 | } | ||
| 167 | |||
| 168 | if (0x1100 <= code and code <= 0xd7c6) { | ||
| 169 | const next_cp = self.buf[1].?; | ||
| 170 | |||
| 171 | if (gbp.isL(code)) { | ||
| 172 | if (next_cp >= 0x1100 and | ||
| 173 | (gbp.isL(next_cp) or | ||
| 174 | gbp.isV(next_cp) or | ||
| 175 | gbp.isLv(next_cp) or | ||
| 176 | gbp.isLvt(next_cp))) | ||
| 177 | { | ||
| 178 | try encode_and_append(next_cp, &all_bytes); | ||
| 179 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 180 | } | ||
| 181 | } else if (gbp.isLv(code) or gbp.isV(code)) { | ||
| 182 | if (next_cp >= 0x1100 and | ||
| 183 | (gbp.isV(next_cp) or | ||
| 184 | gbp.isT(next_cp))) | ||
| 185 | { | ||
| 186 | try encode_and_append(next_cp, &all_bytes); | ||
| 187 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 188 | } | ||
| 189 | } else if (gbp.isLvt(code) or gbp.isT(code)) { | ||
| 190 | if (next_cp >= 0x1100 and gbp.isT(next_cp)) { | ||
| 191 | try encode_and_append(next_cp, &all_bytes); | ||
| 192 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 193 | } | ||
| 194 | } | ||
| 195 | } else if (0x600 <= code and code <= 0x11f02) { | ||
| 196 | if (gbp.isPrepend(code)) { | ||
| 197 | const next_cp = self.buf[1].?; | ||
| 198 | |||
| 199 | if (isBreaker(next_cp)) { | ||
| 200 | return try all_bytes.toOwnedSlice(); | ||
| 201 | } else { | ||
| 202 | try encode_and_append(next_cp, &all_bytes); | ||
| 203 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 204 | } | ||
| 205 | } | ||
| 206 | } else if (0x1f1e6 <= code and code <= 0x1f1ff) { | ||
| 207 | if (gbp.isRegionalIndicator(code)) { | ||
| 208 | const next_cp = self.buf[1].?; | ||
| 209 | |||
| 210 | if (next_cp >= 0x1f1e6 and gbp.isRegionalIndicator(next_cp)) { | ||
| 211 | try encode_and_append(next_cp, &all_bytes); | ||
| 212 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 213 | } | ||
| 214 | } | ||
| 215 | } | ||
| 216 | |||
| 217 | // Extend / ignorables loop | ||
| 218 | while (self.buf[1]) |next_cp| { | ||
| 219 | if (next_cp >= 0x300 and isIgnorable(next_cp)) { | ||
| 220 | try encode_and_append(next_cp, &all_bytes); | ||
| 221 | _ = self.advance() catch @panic("GraphemeIterator.advance failed."); | ||
| 222 | } else { | ||
| 223 | break; | ||
| 224 | } | ||
| 225 | } | ||
| 226 | |||
| 227 | return try all_bytes.toOwnedSlice(); | ||
| 228 | } | ||
| 229 | |||
| 230 | fn advance(self: *Self) !?u21 { | ||
| 231 | self.buf[0] = self.buf[1]; | ||
| 232 | self.buf[1] = try readCodePoint(self.reader); | ||
| 233 | |||
| 234 | return self.buf[0]; | ||
| 235 | } | ||
| 236 | |||
| 237 | fn peek(self: Self) ?u21 { | ||
| 238 | return self.buf[1]; | ||
| 239 | } | ||
| 240 | |||
| 241 | fn encode_and_append(cp: u21, list: *std.ArrayList(u8)) !void { | ||
| 242 | var tmp: [4]u8 = undefined; | ||
| 243 | const len = try unicode.utf8Encode(cp, &tmp); | ||
| 244 | try list.appendSlice(tmp[0..len]); | ||
| 245 | } | ||
| 246 | }; | ||
| 247 | } | ||
| 248 | |||
| 249 | // Predicates | 83 | // Predicates |
| 250 | fn isBreaker(cp: u21) bool { | 84 | fn isBreaker(cp: u21) bool { |
| 251 | return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); | 85 | return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); |
| @@ -268,22 +102,6 @@ test "Segmentation comptime GraphemeIterator" { | |||
| 268 | } | 102 | } |
| 269 | } | 103 | } |
| 270 | 104 | ||
| 271 | test "Simple StreamingGraphemeIterator" { | ||
| 272 | var buf = "abe\u{301}😹".*; | ||
| 273 | var fis = std.io.fixedBufferStream(&buf); | ||
| 274 | const reader = fis.reader(); | ||
| 275 | var iter = try StreamingGraphemeIterator(@TypeOf(reader)).init(std.testing.allocator, reader); | ||
| 276 | const want = [_][]const u8{ "a", "b", "e\u{301}", "😹" }; | ||
| 277 | |||
| 278 | for (want) |str| { | ||
| 279 | const gc = (try iter.next()).?; | ||
| 280 | defer std.testing.allocator.free(gc); | ||
| 281 | try std.testing.expectEqualStrings(gc, str); | ||
| 282 | } | ||
| 283 | |||
| 284 | try std.testing.expectEqual(@as(?[]u8, null), try iter.next()); | ||
| 285 | } | ||
| 286 | |||
| 287 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 105 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 288 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 106 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| 289 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 107 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |