summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-15 16:06:35 -0400
committerGravatar Sam Atman2025-05-15 16:06:35 -0400
commit6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3 (patch)
tree61f83982aaab67f862524d46b073fcfee01d56cf
parentwordAtIndex passes conformance (diff)
parentMerge branch 'work-branch' into HEAD (diff)
downloadzg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.gz
zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.xz
zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.zip
Merge commit 'b5d955f' into develop-next
-rw-r--r--CONTRIBUTORS.md2
-rw-r--r--src/Graphemes.zig226
-rw-r--r--src/unicode_tests.zig74
3 files changed, 299 insertions, 3 deletions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index c42cb8b..530c921 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,3 +18,5 @@ Jonathan Raphaelson <jon@accidental.cc>
18Sam Atman <atmanistan@gmail.com> 18Sam Atman <atmanistan@gmail.com>
19 19
20lch361 <author@lch361.net> 20lch361 <author@lch361.net>
21
22Matteo Romano <mr.matteo.romano.02@gmail.com>
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 1f67fc6..0338c04 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -7,6 +7,7 @@ const unicode = std.unicode;
7 7
8const CodePoint = @import("code_point").CodePoint; 8const CodePoint = @import("code_point").CodePoint;
9const CodePointIterator = @import("code_point").Iterator; 9const CodePointIterator = @import("code_point").Iterator;
10const CodePointReverseIterator = @import("code_point").ReverseIterator;
10 11
11s1: []u16 = undefined, 12s1: []u16 = undefined,
12s2: []u16 = undefined, 13s2: []u16 = undefined,
@@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
70 return Iterator.init(string, graphemes); 71 return Iterator.init(string, graphemes);
71} 72}
72 73
74pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
75 return ReverseIterator.init(string, graphemes);
76}
77
73/// Indic syllable type. 78/// Indic syllable type.
74pub const Indic = enum { 79pub const Indic = enum {
75 none, 80 none,
@@ -182,6 +187,221 @@ pub const Iterator = struct {
182 } 187 }
183}; 188};
184 189
190pub const ReverseIterator = struct {
191 buf: [2]?CodePoint = .{ null, null },
192 cp_iter: CodePointReverseIterator,
193 data: *const Graphemes,
194 /// Codepoint read from `cp_iter` but not returned by `previous`
195 pending: Pending = .{ .none = {} },
196
197 const Pending = union(enum) {
198 none: void,
199 /// Count of pending RI codepoints, it is an even number
200 ri_count: usize,
201 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
202 extend_end: u32,
203 };
204
205 const Self = @This();
206
207 pub fn init(str: []const u8, data: *const Graphemes) Self {
208 var self: Self = .{ .cp_iter = .init(str), .data = data };
209 self.advance();
210 self.advance();
211 return self;
212 }
213
214 fn advance(self: *Self) void {
215 self.buf[1] = self.buf[0];
216 self.buf[0] = self.cp_iter.prev();
217 }
218
219 pub fn prev(self: *Self) ?Grapheme {
220 if (self.buf[1] == null) return null;
221
222 const grapheme_end: u32 = end: {
223 const codepoint = self.buf[1].?;
224
225 switch (self.pending) {
226 // BUF: [?Any, Any]
227 .none => break :end codepoint.offset + codepoint.len,
228 .ri_count => |ri_count| {
229 std.debug.assert(ri_count > 0);
230 std.debug.assert(ri_count % 2 == 0);
231
232 if (ri_count > 2) {
233 self.pending.ri_count -= 2;
234
235 // Use the fact that all RI have length 4 in utf8 encoding
236 // since they are in range 0x1f1e6...0x1f1ff
237 // https://en.wikipedia.org/wiki/UTF-8#Encoding
238 return Grapheme{
239 .len = 8,
240 .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
241 };
242 } else {
243 self.pending = .{ .none = {} };
244 break :end codepoint.offset + codepoint.len + 4;
245 }
246 },
247 // BUF: [?Any, Extend] Extend* ZWJ
248 .extend_end => |extend_end| {
249 self.pending = .{ .none = {} };
250 break :end extend_end;
251 },
252 }
253 };
254
255 while (self.buf[0] != null) {
256 var state: State = .{};
257 state.setXpic();
258 state.unsetRegional();
259 state.setIndic();
260
261 if (graphemeBreak(
262 self.buf[0].?.code,
263 self.buf[1].?.code,
264 self.data,
265 &state,
266 )) break;
267
268 self.advance();
269
270 if (!state.hasIndic()) {
271
272 // BUF: [?Any, Extend | Linker] Consonant
273 var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
274
275 indic: while (true) {
276 if (self.buf[0] == null) {
277 self.pending = .{ .extend_end = indic_offset };
278 return .{
279 .len = @intCast(grapheme_end - indic_offset),
280 .offset = indic_offset,
281 };
282 }
283
284 const codepoint = self.buf[0].?;
285
286 switch (self.data.indic(codepoint.code)) {
287 .Extend, .Linker => {
288 self.advance();
289 continue :indic;
290 },
291 .Consonant => {
292 // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
293 indic_offset = codepoint.offset;
294 self.advance();
295
296 if (self.buf[0]) |cp1| {
297 state.setIndic();
298
299 if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;
300
301 if (!state.hasIndic()) {
302 continue :indic;
303 } else {
304 break :indic;
305 }
306 } else {
307 break :indic;
308 }
309 },
310 .none => {
311 // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
312 self.pending = .{ .extend_end = indic_offset };
313 return .{
314 .len = @intCast(grapheme_end - indic_offset),
315 .offset = indic_offset,
316 };
317 },
318 }
319 }
320 }
321
322 if (!state.hasXpic()) {
323 // BUF: [?Any, ZWJ] Emoji
324 var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
325
326 // Look for previous Emoji
327 emoji: while (true) {
328 if (self.buf[0] == null) {
329 self.pending = .{ .extend_end = emoji_offset };
330 return .{
331 .len = @intCast(grapheme_end - emoji_offset),
332 .offset = emoji_offset,
333 };
334 }
335
336 const codepoint = self.buf[0].?;
337
338 if (self.data.gbp(codepoint.code) == .Extend) {
339 self.advance();
340 continue :emoji;
341 }
342
343 if (self.data.isEmoji(codepoint.code)) {
344 // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
345 emoji_offset = codepoint.offset;
346 self.advance();
347
348 if (self.buf[0] != null and
349 // ZWJ = 0x200d
350 self.buf[0].?.code == 0x200d)
351 {
352 // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
353 // Back at the beginning of the loop, "recursively" look for emoji
354 self.advance();
355 continue :emoji;
356 } else {
357 // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
358 break :emoji;
359 }
360 } else {
361 // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
362 self.pending = .{ .extend_end = emoji_offset };
363 return .{
364 .len = @intCast(grapheme_end - emoji_offset),
365 .offset = emoji_offset,
366 };
367 }
368 }
369 }
370
371 if (state.hasRegional()) {
372 var ri_count: usize = 0;
373 while (self.buf[0] != null and
374 self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
375 {
376 ri_count += 1;
377 self.advance();
378 }
379
380 // Use the fact that all RI have length 4 in utf8 encoding
381 // since they are in range 0x1f1e6...0x1f1ff
382 // https://en.wikipedia.org/wiki/UTF-8#Encoding
383 if (ri_count == 0) {
384 // There are no pending RI codepoints
385 } else if (ri_count % 2 == 0) {
386 self.pending = .{ .ri_count = ri_count };
387 return .{ .len = 8, .offset = grapheme_end - 8 };
388 } else {
389 // Add one to count for the unused RI
390 self.pending = .{ .ri_count = ri_count + 1 };
391 return .{ .len = 4, .offset = grapheme_end - 4 };
392 }
393 }
394 }
395
396 const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
397 self.advance();
398 return .{
399 .len = @intCast(grapheme_end - grapheme_start),
400 .offset = grapheme_start,
401 };
402 }
403};
404
185// Predicates 405// Predicates
186fn isBreaker(cp: u21, data: *const Graphemes) bool { 406fn isBreaker(cp: u21, data: *const Graphemes) bool {
187 // Extract relevant properties. 407 // Extract relevant properties.
@@ -201,7 +421,7 @@ pub const State = struct {
201 self.bits |= 1; 421 self.bits |= 1;
202 } 422 }
203 fn unsetXpic(self: *State) void { 423 fn unsetXpic(self: *State) void {
204 self.bits ^= 1; 424 self.bits &= ~@as(u3, 1);
205 } 425 }
206 426
207 // Regional Indicatior (flags) 427 // Regional Indicatior (flags)
@@ -212,7 +432,7 @@ pub const State = struct {
212 self.bits |= 2; 432 self.bits |= 2;
213 } 433 }
214 fn unsetRegional(self: *State) void { 434 fn unsetRegional(self: *State) void {
215 self.bits ^= 2; 435 self.bits &= ~@as(u3, 2);
216 } 436 }
217 437
218 // Indic Conjunct 438 // Indic Conjunct
@@ -223,7 +443,7 @@ pub const State = struct {
223 self.bits |= 4; 443 self.bits |= 4;
224 } 444 }
225 fn unsetIndic(self: *State) void { 445 fn unsetIndic(self: *State) void {
226 self.bits ^= 4; 446 self.bits &= ~@as(u3, 4);
227 } 447 }
228}; 448};
229 449
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 8b02e98..0204b92 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -175,6 +175,80 @@ test "Segmentation GraphemeIterator" {
175 } 175 }
176} 176}
177 177
178test "Segmentation ReverseGraphemeIterator" {
179 const allocator = std.testing.allocator;
180 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
181 defer file.close();
182 var buf_reader = std.io.bufferedReader(file.reader());
183 var input_stream = buf_reader.reader();
184
185 const data = try Graphemes.init(allocator);
186 defer data.deinit(allocator);
187
188 var buf: [4096]u8 = undefined;
189 var line_no: usize = 1;
190
191 while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
192 // Skip comments or empty lines.
193 if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;
194
195 // Clean up.
196 var line = std.mem.trimLeft(u8, raw, "÷ ");
197 if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
198 line = line[0..octo];
199 }
200 // Iterate over fields.
201 var want = std.ArrayList(Grapheme).init(allocator);
202 defer want.deinit();
203
204 var all_bytes = std.ArrayList(u8).init(allocator);
205 defer all_bytes.deinit();
206
207 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
208 var bytes_index: u32 = 0;
209
210 while (graphemes.next()) |field| {
211 var code_points = std.mem.splitScalar(u8, field, ' ');
212 var cp_buf: [4]u8 = undefined;
213 var cp_index: u32 = 0;
214 var gc_len: u8 = 0;
215
216 while (code_points.next()) |code_point| {
217 if (std.mem.eql(u8, code_point, "×")) continue;
218 const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
219 const len = try unicode.utf8Encode(cp, &cp_buf);
220 try all_bytes.appendSlice(cp_buf[0..len]);
221 cp_index += len;
222 gc_len += len;
223 }
224
225 try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
226 bytes_index += cp_index;
227 }
228
229 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
230 var iter = data.reverseIterator(all_bytes.items);
231
232 // Check.
233 var i: usize = want.items.len;
234 while (i > 0) {
235 i -= 1;
236 const want_gc = want.items[i];
237 const got_gc = iter.prev() orelse {
238 std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc });
239 return error.TestExpectedEqual;
240 };
241 std.testing.expectEqualStrings(
242 want_gc.bytes(all_bytes.items),
243 got_gc.bytes(all_bytes.items),
244 ) catch |err| {
245 std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc });
246 return err;
247 };
248 }
249 }
250}
251
178test "Segmentation Word Iterator" { 252test "Segmentation Word Iterator" {
179 const allocator = std.testing.allocator; 253 const allocator = std.testing.allocator;
180 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); 254 var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{});