summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-14 12:11:07 -0400
committerGravatar Jose Colon Rodriguez2024-02-14 12:11:07 -0400
commit95bc908ed25be9fa597c559791cbf6d5f5a6b8ed (patch)
tree0b8e83d4f9c6e701a0c55e134159f2eea1740f8f /src
parentRemoved unreachables from Grapheme (diff)
downloadzg-95bc908ed25be9fa597c559791cbf6d5f5a6b8ed.tar.gz
zg-95bc908ed25be9fa597c559791cbf6d5f5a6b8ed.tar.xz
zg-95bc908ed25be9fa597c559791cbf6d5f5a6b8ed.zip
Removed readCodePoint and StreamingGraphemeIterator
Diffstat (limited to 'src')
-rw-r--r--src/CodePoint.zig50
-rw-r--r--src/Grapheme.zig182
2 files changed, 0 insertions, 232 deletions
diff --git a/src/CodePoint.zig b/src/CodePoint.zig
index e72823b..c03ecac 100644
--- a/src/CodePoint.zig
+++ b/src/CodePoint.zig
@@ -79,53 +79,3 @@ test "CodePointIterator peek" {
79 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek()); 79 try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
80 try std.testing.expectEqual(@as(?CodePoint, null), iter.next()); 80 try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
81} 81}
82
/// `readCodePoint` returns the next code point as a `u21` from the given reader,
/// or null at end-of-input.
///
/// `reader` must provide `readByte` and `readAll`.
///
/// Returns `error.InvalidUtf8` when the lead byte is not a valid UTF-8 lead byte,
/// when the input ends in the middle of a multi-byte sequence, or when a trailing
/// byte is not a continuation byte (`10xxxxxx`). Any other reader error is
/// returned unchanged. Overlong encodings, surrogates and values above U+10FFFF
/// are not rejected.
pub fn readCodePoint(reader: anytype) !?u21 {
    const first = reader.readByte() catch |err| switch (err) {
        error.EndOfStream => return null,
        else => return err,
    };

    // ASCII fast path.
    if (first < 128) return @as(u21, first);

    // Number of continuation bytes that follow, and the payload bits of the lead byte.
    const Lead = struct { trailing: usize, mask: u8 };
    const lead: Lead = switch (first) {
        0b1100_0000...0b1101_1111 => .{ .trailing = 1, .mask = 0b0001_1111 },
        0b1110_0000...0b1110_1111 => .{ .trailing = 2, .mask = 0b0000_1111 },
        0b1111_0000...0b1111_0111 => .{ .trailing = 3, .mask = 0b0000_0111 },
        else => return error.InvalidUtf8,
    };

    // `readAll` keeps reading until the slice is full or the stream ends, so a
    // short count here really means truncated input rather than a partial read.
    var tail: [3]u8 = undefined;
    const got = try reader.readAll(tail[0..lead.trailing]);
    if (got < lead.trailing) return error.InvalidUtf8;

    var cp: u21 = first & lead.mask;
    for (tail[0..lead.trailing]) |byte| {
        if ((byte & 0b1100_0000) != 0b1000_0000) return error.InvalidUtf8;
        cp = (cp << 6) | (byte & 0b0011_1111);
    }

    return cp;
}
120
// Decodes a string mixing 1-, 2- and 4-byte sequences, then checks for null at end-of-input.
test "readCodePoint" {
    var input = "abé😹".*;
    var stream = std.io.fixedBufferStream(&input);
    const reader = stream.reader();

    const expected = [_]u21{ 'a', 'b', 'é', '😹' };

    for (expected) |want| {
        const got = (try readCodePoint(reader)).?;
        try std.testing.expectEqual(want, got);
    }

    // The stream is exhausted now.
    try std.testing.expectEqual(@as(?u21, null), try readCodePoint(reader));
}
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 01eff80..41ea545 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -6,7 +6,6 @@ const unicode = std.unicode;
6const ziglyph = @import("ziglyph"); 6const ziglyph = @import("ziglyph");
7const CodePoint = @import("CodePoint.zig"); 7const CodePoint = @import("CodePoint.zig");
8const CodePointIterator = CodePoint.CodePointIterator; 8const CodePointIterator = CodePoint.CodePointIterator;
9const readCodePoint = CodePoint.readCodePoint;
10// const emoji = ziglyph.emoji; 9// const emoji = ziglyph.emoji;
11// const gbp = ziglyph.grapheme_break; 10// const gbp = ziglyph.grapheme_break;
12const gbp = @import("gbp"); 11const gbp = @import("gbp");
@@ -81,171 +80,6 @@ pub const GraphemeIterator = struct {
81 } 80 }
82}; 81};
83 82
/// `StreamingGraphemeIterator` iterates a `std.io.Reader` one grapheme cluster at-a-time.
/// Note that, given the streaming context, each grapheme cluster is returned as a
/// newly allocated slice of bytes.
///
/// `T` is the reader type; it is passed to `readCodePoint` for every code point read.
pub fn StreamingGraphemeIterator(comptime T: type) type {
    return struct {
        allocator: std.mem.Allocator,
        /// `buf[0]` is the code point most recently returned by `advance`;
        /// `buf[1]` is the one-code-point lookahead (null at end-of-input).
        buf: [2]?u21 = [_]?u21{ null, null },
        reader: T,

        const Self = @This();

        /// Creates the iterator and primes the lookahead by reading the first
        /// code point. Fails with whatever `readCodePoint` returns.
        pub fn init(allocator: std.mem.Allocator, reader: anytype) !Self {
            var self = Self{ .allocator = allocator, .reader = reader };
            self.buf[1] = try readCodePoint(self.reader);

            return self;
        }

        /// Returns the UTF-8 bytes of the next grapheme cluster, or null at end-of-input.
        /// Caller must free returned bytes with `allocator` passed to `init`.
        ///
        /// Reader, decoding and allocation errors are returned to the caller; the
        /// partially built cluster is freed before returning an error.
        pub fn next(self: *Self) !?[]u8 {
            const code = (try self.advance()) orelse return null;

            var all_bytes = std.ArrayList(u8).init(self.allocator);
            errdefer all_bytes.deinit();

            try encodeAndAppend(code, &all_bytes);

            // If at end
            const lookahead = self.buf[1] orelse return try all_bytes.toOwnedSlice();

            // Instant breakers
            // CR
            if (code == '\x0d') {
                // CRLF
                if (lookahead == '\x0a') try self.takeLookahead(&all_bytes);

                return try all_bytes.toOwnedSlice();
            }
            // LF
            if (code == '\x0a') return try all_bytes.toOwnedSlice();
            // Control
            if (gbp.isControl(code)) return try all_bytes.toOwnedSlice();

            // Common chars
            if (code < 0xa9) {
                try self.takeIgnorables(&all_bytes);

                return try all_bytes.toOwnedSlice();
            }

            // NOTE(review): `emoji` has to be in scope in the enclosing file — confirm.
            if (emoji.isExtendedPictographic(code)) {
                var after_zwj = false;

                // Extend / ignorables loop, additionally joining a pictographic
                // that directly follows a ZWJ.
                while (self.buf[1]) |next_cp| {
                    if (next_cp >= 0x300 and
                        after_zwj and
                        emoji.isExtendedPictographic(next_cp))
                    {
                        try self.takeLookahead(&all_bytes);
                        after_zwj = false;
                    } else if (next_cp >= 0x300 and isIgnorable(next_cp)) {
                        try self.takeLookahead(&all_bytes);
                        if (next_cp == '\u{200d}') after_zwj = true;
                    } else {
                        break;
                    }
                }

                return try all_bytes.toOwnedSlice();
            }

            if (0x1100 <= code and code <= 0xd7c6) {
                // Hangul syllable sequences: at most one following code point is
                // joined here, depending on the class of `code`.
                const joins = if (gbp.isL(code))
                    lookahead >= 0x1100 and
                        (gbp.isL(lookahead) or
                        gbp.isV(lookahead) or
                        gbp.isLv(lookahead) or
                        gbp.isLvt(lookahead))
                else if (gbp.isLv(code) or gbp.isV(code))
                    lookahead >= 0x1100 and
                        (gbp.isV(lookahead) or
                        gbp.isT(lookahead))
                else if (gbp.isLvt(code) or gbp.isT(code))
                    lookahead >= 0x1100 and gbp.isT(lookahead)
                else
                    false;

                if (joins) try self.takeLookahead(&all_bytes);
            } else if (0x600 <= code and code <= 0x11f02) {
                if (gbp.isPrepend(code)) {
                    // A prepend character joins whatever follows unless that is a breaker.
                    if (isBreaker(lookahead)) return try all_bytes.toOwnedSlice();

                    try self.takeLookahead(&all_bytes);
                }
            } else if (0x1f1e6 <= code and code <= 0x1f1ff) {
                // Regional indicators pair up.
                if (gbp.isRegionalIndicator(code) and
                    lookahead >= 0x1f1e6 and
                    gbp.isRegionalIndicator(lookahead))
                {
                    try self.takeLookahead(&all_bytes);
                }
            }

            try self.takeIgnorables(&all_bytes);

            return try all_bytes.toOwnedSlice();
        }

        /// Shifts the lookahead into the current slot, reads a new lookahead,
        /// and returns the new current code point (null at end-of-input).
        fn advance(self: *Self) !?u21 {
            self.buf[0] = self.buf[1];
            self.buf[1] = try readCodePoint(self.reader);

            return self.buf[0];
        }

        /// Returns the lookahead code point without consuming it.
        fn peek(self: Self) ?u21 {
            return self.buf[1];
        }

        /// Appends the lookahead code point to `list` and advances past it.
        /// Does nothing when there is no lookahead.
        fn takeLookahead(self: *Self, list: *std.ArrayList(u8)) !void {
            const cp = self.buf[1] orelse return;
            try encodeAndAppend(cp, list);
            _ = try self.advance();
        }

        /// Extend / ignorables loop: consumes lookahead code points for as long
        /// as they are at or above U+0300 and `isIgnorable`.
        fn takeIgnorables(self: *Self, list: *std.ArrayList(u8)) !void {
            while (self.buf[1]) |next_cp| {
                if (next_cp < 0x300 or !isIgnorable(next_cp)) break;
                try self.takeLookahead(list);
            }
        }

        /// Appends the UTF-8 encoding of `cp` to `list`.
        fn encodeAndAppend(cp: u21, list: *std.ArrayList(u8)) !void {
            var tmp: [4]u8 = undefined;
            const len = try unicode.utf8Encode(cp, &tmp);
            try list.appendSlice(tmp[0..len]);
        }
    };
}
248
249// Predicates 83// Predicates
250fn isBreaker(cp: u21) bool { 84fn isBreaker(cp: u21) bool {
251 return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); 85 return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp);
@@ -268,22 +102,6 @@ test "Segmentation comptime GraphemeIterator" {
268 } 102 }
269} 103}
270 104
// Streams a short string containing a combining mark and an emoji, and checks
// the clusters come back in order followed by null.
test "Simple StreamingGraphemeIterator" {
    var buf = "abe\u{301}😹".*;
    var fis = std.io.fixedBufferStream(&buf);
    const reader = fis.reader();
    var iter = try StreamingGraphemeIterator(@TypeOf(reader)).init(std.testing.allocator, reader);
    const want = [_][]const u8{ "a", "b", "e\u{301}", "😹" };

    for (want) |str| {
        const gc = (try iter.next()).?;
        // Each cluster is owned by the caller.
        defer std.testing.allocator.free(gc);
        // Expected value goes first so failure output is labelled correctly.
        try std.testing.expectEqualStrings(str, gc);
    }

    try std.testing.expectEqual(@as(?[]u8, null), try iter.next());
}
286
287test "Segmentation ZWJ and ZWSP emoji sequences" { 105test "Segmentation ZWJ and ZWSP emoji sequences" {
288 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 106 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
289 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 107 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";