summaryrefslogtreecommitdiff
path: root/src/Graphemes.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/Graphemes.zig')
-rw-r--r--src/Graphemes.zig226
1 files changed, 223 insertions, 3 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 7bf328a..3bff18d 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -7,6 +7,7 @@ const unicode = std.unicode;
7 7
8const CodePoint = @import("code_point").CodePoint; 8const CodePoint = @import("code_point").CodePoint;
9const CodePointIterator = @import("code_point").Iterator; 9const CodePointIterator = @import("code_point").Iterator;
10const CodePointReverseIterator = @import("code_point").ReverseIterator;
10 11
11s1: []u16 = undefined, 12s1: []u16 = undefined,
12s2: []u16 = undefined, 13s2: []u16 = undefined,
@@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator {
70 return Iterator.init(string, graphemes); 71 return Iterator.init(string, graphemes);
71} 72}
72 73
/// Builds a `ReverseIterator` over `string`, using `graphemes` as the
/// property data consulted while walking backwards.
pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator {
    return .init(string, graphemes);
}
77
73/// Indic syllable type. 78/// Indic syllable type.
74pub const Indic = enum { 79pub const Indic = enum {
75 none, 80 none,
@@ -239,6 +244,221 @@ pub const Iterator = struct {
239 } 244 }
240}; 245};
241 246
/// Walks a string backwards and yields its grapheme clusters from last to
/// first. Construct with `init`, then call `prev` until it returns `null`.
pub const ReverseIterator = struct {
    /// Two-codepoint window over the text not yet returned. `advance` shifts
    /// `buf[0]` into `buf[1]` and refills `buf[0]` from `cp_iter`, so `buf[1]`
    /// is the codepoint closer to the end of the string.
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointReverseIterator,
    data: *const Graphemes,
    /// Codepoints read from `cp_iter` but not yet returned by `prev`
    pending: Pending = .{ .none = {} },

    const Pending = union(enum) {
        none: void,
        /// Count of pending RI codepoints, it is an even number
        ri_count: usize,
        /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
        extend_end: u32,
    };

    const Self = @This();

    /// Creates an iterator positioned at the end of `str`. The window is
    /// primed with the last two codepoints (fewer if `str` is shorter).
    pub fn init(str: []const u8, data: *const Graphemes) Self {
        var self: Self = .{ .cp_iter = .init(str), .data = data };
        self.advance();
        self.advance();
        return self;
    }

    /// Slides the window one codepoint towards the start of the string.
    fn advance(self: *Self) void {
        self.buf[1] = self.buf[0];
        self.buf[0] = self.cp_iter.prev();
    }

    /// Returns the grapheme cluster that precedes the previously returned one,
    /// or `null` once the window is empty (`buf[1] == null`).
    pub fn prev(self: *Self) ?Grapheme {
        if (self.buf[1] == null) return null;

        // Byte offset one past the end of the cluster being built. It is fixed
        // here; the loop below only searches for the cluster's start.
        const grapheme_end: u32 = end: {
            const codepoint = self.buf[1].?;

            switch (self.pending) {
                // BUF: [?Any, Any]
                .none => break :end codepoint.offset + codepoint.len,
                .ri_count => |ri_count| {
                    std.debug.assert(ri_count > 0);
                    std.debug.assert(ri_count % 2 == 0);

                    if (ri_count > 2) {
                        self.pending.ri_count -= 2;

                        // Use the fact that all RI have length 4 in utf8 encoding
                        // since they are in range 0x1f1e6...0x1f1ff
                        // https://en.wikipedia.org/wiki/UTF-8#Encoding
                        //
                        // `codepoint` is the earliest RI of the pending run, so
                        // the pair to emit starts `ri_count * 4` bytes after it.
                        return Grapheme{
                            .len = 8,
                            .offset = @intCast(codepoint.offset + self.pending.ri_count * 4),
                        };
                    } else {
                        // Last pending pair: it ends one RI (4 bytes) after
                        // `codepoint`; the regular loop finds where it starts.
                        self.pending = .{ .none = {} };
                        break :end codepoint.offset + codepoint.len + 4;
                    }
                },
                // BUF: [?Any, Extend] Extend* ZWJ
                .extend_end => |extend_end| {
                    self.pending = .{ .none = {} };
                    break :end extend_end;
                },
            }
        };

        while (self.buf[0] != null) {
            // A fresh state per pair: Xpic and Indic start set, Regional clear.
            // The checks below react to `graphemeBreak` flipping those flags.
            var state: State = .{};
            state.setXpic();
            state.unsetRegional();
            state.setIndic();

            if (graphemeBreak(
                self.buf[0].?.code,
                self.buf[1].?.code,
                self.data,
                &state,
            )) break;

            self.advance();

            if (!state.hasIndic()) {

                // BUF: [?Any, Extend | Linker] Consonant
                var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;

                indic: while (true) {
                    if (self.buf[0] == null) {
                        self.pending = .{ .extend_end = indic_offset };
                        return .{
                            .len = @intCast(grapheme_end - indic_offset),
                            .offset = indic_offset,
                        };
                    }

                    const codepoint = self.buf[0].?;

                    switch (self.data.indic(codepoint.code)) {
                        .Extend, .Linker => {
                            self.advance();
                            continue :indic;
                        },
                        .Consonant => {
                            // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant
                            indic_offset = codepoint.offset;
                            self.advance();

                            if (self.buf[0]) |cp1| {
                                state.setIndic();

                                // Unlabeled `break` targets the innermost loop,
                                // i.e. it leaves `indic`, not the outer `while`.
                                if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break;

                                if (!state.hasIndic()) {
                                    continue :indic;
                                } else {
                                    break :indic;
                                }
                            } else {
                                break :indic;
                            }
                        },
                        .none => {
                            // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant
                            self.pending = .{ .extend_end = indic_offset };
                            return .{
                                .len = @intCast(grapheme_end - indic_offset),
                                .offset = indic_offset,
                            };
                        },
                    }
                }
            }

            if (!state.hasXpic()) {
                // BUF: [?Any, ZWJ] Emoji
                var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;

                // Look for previous Emoji
                emoji: while (true) {
                    if (self.buf[0] == null) {
                        self.pending = .{ .extend_end = emoji_offset };
                        return .{
                            .len = @intCast(grapheme_end - emoji_offset),
                            .offset = emoji_offset,
                        };
                    }

                    const codepoint = self.buf[0].?;

                    if (self.data.gbp(codepoint.code) == .Extend) {
                        self.advance();
                        continue :emoji;
                    }

                    if (self.data.isEmoji(codepoint.code)) {
                        // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)*
                        emoji_offset = codepoint.offset;
                        self.advance();

                        if (self.buf[0] != null and
                            // ZWJ = 0x200d
                            self.buf[0].?.code == 0x200d)
                        {
                            // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)*
                            // Back at the beginning of the loop, "recursively" look for emoji
                            self.advance();
                            continue :emoji;
                        } else {
                            // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)*
                            break :emoji;
                        }
                    } else {
                        // BUF: [Any, Extend] (Extend* ZWJ Emoji)*
                        self.pending = .{ .extend_end = emoji_offset };
                        return .{
                            .len = @intCast(grapheme_end - emoji_offset),
                            .offset = emoji_offset,
                        };
                    }
                }
            }

            if (state.hasRegional()) {
                // BUF: [?Any, RI] RI
                // `buf[1]` is the earlier codepoint of the pair that was just
                // joined (presumably an RI, since the regional flag was raised
                // for it). Record its position now: the counting loop below
                // shifts it out of the window, and the cluster may extend past
                // the pair (codepoints joined on earlier iterations), so
                // `grapheme_end - 8` is not a reliable start.
                const pair_start: u32 = self.buf[1].?.offset;
                const pair_first_len: u32 = self.buf[1].?.len;

                var ri_count: usize = 0;
                while (self.buf[0] != null and
                    self.data.gbp(self.buf[0].?.code) == .Regional_Indicator)
                {
                    ri_count += 1;
                    self.advance();
                }

                if (ri_count == 0) {
                    // There are no pending RI codepoints
                } else if (ri_count % 2 == 0) {
                    // An even number of RI precede the pair, so the pair
                    // stands as joined and the cluster starts at its first RI.
                    self.pending = .{ .ri_count = ri_count };
                    return .{
                        .len = @intCast(grapheme_end - pair_start),
                        .offset = pair_start,
                    };
                } else {
                    // An odd number of RI precede the pair, so the pair's
                    // first RI belongs with the RI before it; the cluster
                    // starts at the pair's second RI.
                    // Add one to count for the unused RI
                    self.pending = .{ .ri_count = ri_count + 1 };
                    const last_start: u32 = pair_start + pair_first_len;
                    return .{
                        .len = @intCast(grapheme_end - last_start),
                        .offset = last_start,
                    };
                }
            }
        }

        const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0;
        self.advance();
        return .{
            .len = @intCast(grapheme_end - grapheme_start),
            .offset = grapheme_start,
        };
    }
};
461
242// Predicates 462// Predicates
243fn isBreaker(cp: u21, data: *const Graphemes) bool { 463fn isBreaker(cp: u21, data: *const Graphemes) bool {
244 // Extract relevant properties. 464 // Extract relevant properties.
@@ -258,7 +478,7 @@ pub const State = struct {
258 self.bits |= 1; 478 self.bits |= 1;
259 } 479 }
/// Clears the Xpic flag (mask 1) without touching the other bits.
/// Uses AND-NOT rather than XOR so that calling it on an already-clear
/// flag leaves it clear instead of toggling it on.
fn unsetXpic(self: *State) void {
    self.bits &= ~@as(u3, 1);
}
263 483
264 // Regional Indicator (flags) 484 // Regional Indicator (flags)
@@ -269,7 +489,7 @@ pub const State = struct {
269 self.bits |= 2; 489 self.bits |= 2;
270 } 490 }
/// Clears the Regional Indicator flag (mask 2) without touching the other
/// bits. Uses AND-NOT rather than XOR so that calling it on an
/// already-clear flag leaves it clear instead of toggling it on.
fn unsetRegional(self: *State) void {
    self.bits &= ~@as(u3, 2);
}
274 494
275 // Indic Conjunct 495 // Indic Conjunct
@@ -280,7 +500,7 @@ pub const State = struct {
280 self.bits |= 4; 500 self.bits |= 4;
281 } 501 }
/// Clears the Indic Conjunct flag (mask 4) without touching the other bits.
/// Uses AND-NOT rather than XOR so that calling it on an already-clear
/// flag leaves it clear instead of toggling it on.
fn unsetIndic(self: *State) void {
    self.bits &= ~@as(u3, 4);
}
285}; 505};
286 506