summaryrefslogtreecommitdiff
path: root/src/Grapheme.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-18 11:26:00 -0400
committerGravatar Jose Colon Rodriguez2024-02-18 11:26:00 -0400
commit2c14444bc9054604b71ad5b0e8fb3c3e34877e58 (patch)
treeb6713a4413288445fd0ec2e7fead29653600b524 /src/Grapheme.zig
parentBack to zg code_point. 4ms faster than Ghostty's Utf8Decoder (diff)
downloadzg-2c14444bc9054604b71ad5b0e8fb3c3e34877e58.tar.gz
zg-2c14444bc9054604b71ad5b0e8fb3c3e34877e58.tar.xz
zg-2c14444bc9054604b71ad5b0e8fb3c3e34877e58.zip
Grapheme -> grapheme
Diffstat (limited to 'src/Grapheme.zig')
-rw-r--r--src/Grapheme.zig332
1 files changed, 0 insertions, 332 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
deleted file mode 100644
index f013aba..0000000
--- a/src/Grapheme.zig
+++ /dev/null
@@ -1,332 +0,0 @@
1const std = @import("std");
2const unicode = std.unicode;
3
4const CodePoint = @import("code_point").CodePoint;
5const CodePointIterator = @import("code_point").Iterator;
6const gbp = @import("gbp");
7
8/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
9pub const Grapheme = struct {
10 len: u8,
11 offset: u32,
12
13 /// `bytes` returns the slice of bytes that correspond to
14 /// this grapheme cluster in `src`.
15 pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
16 return src[self.offset..][0..self.len];
17 }
18};
19
20/// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time.
21pub const Iterator = struct {
22 buf: [2]?CodePoint = .{ null, null },
23 cp_iter: CodePointIterator,
24
25 const Self = @This();
26
27 /// Assumes `src` is valid UTF-8.
28 pub fn init(str: []const u8) Self {
29 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
30 self.advance();
31 return self;
32 }
33
34 fn advance(self: *Self) void {
35 self.buf[0] = self.buf[1];
36 self.buf[1] = self.cp_iter.next();
37 }
38
39 pub fn next(self: *Self) ?Grapheme {
40 self.advance();
41
42 // If no more
43 if (self.buf[0] == null) return null;
44 // If last one
45 if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
46 // If ASCII
47 if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
48 return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
49 }
50
51 const gc_start = self.buf[0].?.offset;
52 var gc_len: u8 = self.buf[0].?.len;
53 var state = State{};
54
55 if (graphemeBreak(
56 self.buf[0].?.code,
57 self.buf[1].?.code,
58 &state,
59 )) return Grapheme{ .len = gc_len, .offset = gc_start };
60
61 while (true) {
62 self.advance();
63 if (self.buf[0] == null) break;
64
65 gc_len += self.buf[0].?.len;
66
67 if (graphemeBreak(
68 self.buf[0].?.code,
69 if (self.buf[1]) |ncp| ncp.code else 0,
70 &state,
71 )) break;
72 }
73
74 return Grapheme{ .len = gc_len, .offset = gc_start };
75 }
76};
77
78// Predicates
79fn isBreaker(cp: u21) bool {
80 // Extract relevant properties.
81 const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
82 const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4);
83 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
84}
85
86fn isIgnorable(cp: u21) bool {
87 const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
88 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
89}
90
91// Grapheme break state.
92const State = struct {
93 bits: u3 = 0,
94
95 // Extended Pictographic (emoji)
96 fn hasXpic(self: State) bool {
97 return self.bits & 1 == 1;
98 }
99 fn setXpic(self: *State) void {
100 self.bits |= 1;
101 }
102 fn unsetXpic(self: *State) void {
103 self.bits ^= 1;
104 }
105
106 // Regional Indicatior (flags)
107 fn hasRegional(self: State) bool {
108 return self.bits & 2 == 2;
109 }
110 fn setRegional(self: *State) void {
111 self.bits |= 2;
112 }
113 fn unsetRegional(self: *State) void {
114 self.bits ^= 2;
115 }
116
117 // Indic Conjunct
118 fn hasIndic(self: State) bool {
119 return self.bits & 4 == 4;
120 }
121 fn setIndic(self: *State) void {
122 self.bits |= 4;
123 }
124 fn unsetIndic(self: *State) void {
125 self.bits ^= 4;
126 }
127};
128
129/// `graphemeBreak` returns true only if a grapheme break point is required
130/// between `cp1` and `cp2`. `state` should start out as 0. If calling
131/// iteratively over a sequence of code points, this function must be called
132/// IN ORDER on ALL potential breaks in a string.
133/// Modeled after the API of utf8proc's `utf8proc_grapheme_break_stateful`.
134/// https://github.com/JuliaStrings/utf8proc/blob/2bbb1ba932f727aad1fab14fafdbc89ff9dc4604/utf8proc.h#L599-L617
135pub fn graphemeBreak(
136 cp1: u21,
137 cp2: u21,
138 state: *State,
139) bool {
140 // Extract relevant properties.
141 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
142 const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4);
143 const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7);
144 const cp1_is_emoji = cp1_props_byte & 1 == 1;
145
146 const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]];
147 const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4);
148 const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7);
149 const cp2_is_emoji = cp2_props_byte & 1 == 1;
150
151 // GB11: Emoji Extend* ZWJ x Emoji
152 if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
153 // GB9c: Indic Conjunct Break
154 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
155
156 // GB3: CR x LF
157 if (cp1 == '\r' and cp2 == '\n') return false;
158
159 // GB4: Control
160 if (isBreaker(cp1)) return true;
161
162 // GB11: Emoji Extend* ZWJ x Emoji
163 if (state.hasXpic() and
164 cp1_gbp_prop == .ZWJ and
165 cp2_is_emoji)
166 {
167 state.unsetXpic();
168 return false;
169 }
170
171 // GB9b: x (Extend | ZWJ)
172 if (cp2_gbp_prop == .Extend or cp2_gbp_prop == .ZWJ) return false;
173
174 // GB9a: x Spacing
175 if (cp2_gbp_prop == .SpacingMark) return false;
176
177 // GB9b: Prepend x
178 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false;
179
180 // GB12, GB13: RI x RI
181 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
182 if (state.hasRegional()) {
183 state.unsetRegional();
184 return true;
185 } else {
186 state.setRegional();
187 return false;
188 }
189 }
190
191 // GB6: Hangul L x (L|V|LV|VT)
192 if (cp1_gbp_prop == .L) {
193 if (cp2_gbp_prop == .L or
194 cp2_gbp_prop == .V or
195 cp2_gbp_prop == .LV or
196 cp2_gbp_prop == .LVT) return false;
197 }
198
199 // GB7: Hangul (LV | V) x (V | T)
200 if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
201 if (cp2_gbp_prop == .V or
202 cp2_gbp_prop == .T) return false;
203 }
204
205 // GB8: Hangul (LVT | T) x T
206 if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
207 if (cp2_gbp_prop == .T) return false;
208 }
209
210 // GB9c: Indic Conjunct Break
211 if (state.hasIndic() and
212 cp1_indic_prop == .Consonant and
213 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
214 {
215 return false;
216 }
217
218 if (state.hasIndic() and
219 cp1_indic_prop == .Extend and
220 cp2_indic_prop == .Linker)
221 {
222 return false;
223 }
224
225 if (state.hasIndic() and
226 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
227 cp2_indic_prop == .Consonant)
228 {
229 state.unsetIndic();
230 return false;
231 }
232
233 return true;
234}
235
236test "Segmentation GraphemeIterator" {
237 const allocator = std.testing.allocator;
238 var file = try std.fs.cwd().openFile("GraphemeBreakTest.txt", .{});
239 defer file.close();
240 var buf_reader = std.io.bufferedReader(file.reader());
241 var input_stream = buf_reader.reader();
242
243 var buf: [4096]u8 = undefined;
244 var line_no: usize = 1;
245
246 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
247 // Skip comments or empty lines.
248 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
249
250 // Clean up.
251 var line = std.mem.trimLeft(u8, raw, "÷ ");
252 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
253 line = line[0..octo];
254 }
255 // Iterate over fields.
256 var want = std.ArrayList(Grapheme).init(allocator);
257 defer want.deinit();
258
259 var all_bytes = std.ArrayList(u8).init(allocator);
260 defer all_bytes.deinit();
261
262 var graphemes = std.mem.split(u8, line, " ÷ ");
263 var bytes_index: u32 = 0;
264
265 while (graphemes.next()) |field| {
266 var code_points = std.mem.split(u8, field, " ");
267 var cp_buf: [4]u8 = undefined;
268 var cp_index: u32 = 0;
269 var gc_len: u8 = 0;
270
271 while (code_points.next()) |code_point| {
272 if (std.mem.eql(u8, code_point, "×")) continue;
273 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
274 const len = try unicode.utf8Encode(cp, &cp_buf);
275 try all_bytes.appendSlice(cp_buf[0..len]);
276 cp_index += len;
277 gc_len += len;
278 }
279
280 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
281 bytes_index += cp_index;
282 }
283
284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
285 var iter = Iterator.init(all_bytes.items);
286
287 // Chaeck.
288 for (want.items) |want_gc| {
289 const got_gc = (iter.next()).?;
290 try std.testing.expectEqualStrings(
291 want_gc.bytes(all_bytes.items),
292 got_gc.bytes(all_bytes.items),
293 );
294 }
295 }
296}
297
298test "Segmentation comptime GraphemeIterator" {
299 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
300
301 comptime {
302 const src = "Héllo";
303 var ct_iter = Iterator.init(src);
304 var i = 0;
305 while (ct_iter.next()) |grapheme| : (i += 1) {
306 try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
307 }
308 }
309}
310
311test "Segmentation ZWJ and ZWSP emoji sequences" {
312 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
313 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
314 const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2;
315 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
316 const no_joiner = seq_1 ++ seq_2;
317
318 var ct_iter = Iterator.init(with_zwj);
319 var i: usize = 0;
320 while (ct_iter.next()) |_| : (i += 1) {}
321 try std.testing.expectEqual(@as(usize, 1), i);
322
323 ct_iter = Iterator.init(with_zwsp);
324 i = 0;
325 while (ct_iter.next()) |_| : (i += 1) {}
326 try std.testing.expectEqual(@as(usize, 3), i);
327
328 ct_iter = Iterator.init(no_joiner);
329 i = 0;
330 while (ct_iter.next()) |_| : (i += 1) {}
331 try std.testing.expectEqual(@as(usize, 2), i);
332}