diff options
Diffstat (limited to 'src/Graphemes.zig')
| -rw-r--r-- | src/Graphemes.zig | 226 |
1 files changed, 223 insertions, 3 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..3bff18d 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -7,6 +7,7 @@ const unicode = std.unicode; | |||
| 7 | 7 | ||
| 8 | const CodePoint = @import("code_point").CodePoint; | 8 | const CodePoint = @import("code_point").CodePoint; |
| 9 | const CodePointIterator = @import("code_point").Iterator; | 9 | const CodePointIterator = @import("code_point").Iterator; |
| 10 | const CodePointReverseIterator = @import("code_point").ReverseIterator; | ||
| 10 | 11 | ||
| 11 | s1: []u16 = undefined, | 12 | s1: []u16 = undefined, |
| 12 | s2: []u16 = undefined, | 13 | s2: []u16 = undefined, |
| @@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | |||
| 70 | return Iterator.init(string, graphemes); | 71 | return Iterator.init(string, graphemes); |
| 71 | } | 72 | } |
| 72 | 73 | ||
| 74 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | ||
| 75 | return ReverseIterator.init(string, graphemes); | ||
| 76 | } | ||
| 77 | |||
| 73 | /// Indic syllable type. | 78 | /// Indic syllable type. |
| 74 | pub const Indic = enum { | 79 | pub const Indic = enum { |
| 75 | none, | 80 | none, |
| @@ -239,6 +244,221 @@ pub const Iterator = struct { | |||
| 239 | } | 244 | } |
| 240 | }; | 245 | }; |
| 241 | 246 | ||
| 247 | pub const ReverseIterator = struct { | ||
| 248 | buf: [2]?CodePoint = .{ null, null }, | ||
| 249 | cp_iter: CodePointReverseIterator, | ||
| 250 | data: *const Graphemes, | ||
| 251 | /// Codepoint read from `cp_iter` but not returned by `prev` | ||
| 252 | pending: Pending = .{ .none = {} }, | ||
| 253 | |||
| 254 | const Pending = union(enum) { | ||
| 255 | none: void, | ||
| 256 | /// Count of pending RI (Regional Indicator) codepoints; always an even number | ||
| 257 | ri_count: usize, | ||
| 258 | /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji | ||
| 259 | extend_end: u32, | ||
| 260 | }; | ||
| 261 | |||
| 262 | const Self = @This(); | ||
| 263 | |||
| 264 | pub fn init(str: []const u8, data: *const Graphemes) Self { | ||
| 265 | var self: Self = .{ .cp_iter = .init(str), .data = data }; | ||
| 266 | self.advance(); | ||
| 267 | self.advance(); | ||
| 268 | return self; | ||
| 269 | } | ||
| 270 | |||
| 271 | fn advance(self: *Self) void { | ||
| 272 | self.buf[1] = self.buf[0]; | ||
| 273 | self.buf[0] = self.cp_iter.prev(); | ||
| 274 | } | ||
| 275 | |||
| 276 | pub fn prev(self: *Self) ?Grapheme { | ||
| 277 | if (self.buf[1] == null) return null; | ||
| 278 | |||
| 279 | const grapheme_end: u32 = end: { | ||
| 280 | const codepoint = self.buf[1].?; | ||
| 281 | |||
| 282 | switch (self.pending) { | ||
| 283 | // BUF: [?Any, Any] | ||
| 284 | .none => break :end codepoint.offset + codepoint.len, | ||
| 285 | .ri_count => |ri_count| { | ||
| 286 | std.debug.assert(ri_count > 0); | ||
| 287 | std.debug.assert(ri_count % 2 == 0); | ||
| 288 | |||
| 289 | if (ri_count > 2) { | ||
| 290 | self.pending.ri_count -= 2; | ||
| 291 | |||
| 292 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 293 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 294 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 295 | return Grapheme{ | ||
| 296 | .len = 8, | ||
| 297 | .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), | ||
| 298 | }; | ||
| 299 | } else { | ||
| 300 | self.pending = .{ .none = {} }; | ||
| 301 | break :end codepoint.offset + codepoint.len + 4; | ||
| 302 | } | ||
| 303 | }, | ||
| 304 | // BUF: [?Any, Extend] Extend* ZWJ | ||
| 305 | .extend_end => |extend_end| { | ||
| 306 | self.pending = .{ .none = {} }; | ||
| 307 | break :end extend_end; | ||
| 308 | }, | ||
| 309 | } | ||
| 310 | }; | ||
| 311 | |||
| 312 | while (self.buf[0] != null) { | ||
| 313 | var state: State = .{}; | ||
| 314 | state.setXpic(); | ||
| 315 | state.unsetRegional(); | ||
| 316 | state.setIndic(); | ||
| 317 | |||
| 318 | if (graphemeBreak( | ||
| 319 | self.buf[0].?.code, | ||
| 320 | self.buf[1].?.code, | ||
| 321 | self.data, | ||
| 322 | &state, | ||
| 323 | )) break; | ||
| 324 | |||
| 325 | self.advance(); | ||
| 326 | |||
| 327 | if (!state.hasIndic()) { | ||
| 328 | |||
| 329 | // BUF: [?Any, Extend | Linker] Consonant | ||
| 330 | var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 331 | |||
| 332 | indic: while (true) { | ||
| 333 | if (self.buf[0] == null) { | ||
| 334 | self.pending = .{ .extend_end = indic_offset }; | ||
| 335 | return .{ | ||
| 336 | .len = @intCast(grapheme_end - indic_offset), | ||
| 337 | .offset = indic_offset, | ||
| 338 | }; | ||
| 339 | } | ||
| 340 | |||
| 341 | const codepoint = self.buf[0].?; | ||
| 342 | |||
| 343 | switch (self.data.indic(codepoint.code)) { | ||
| 344 | .Extend, .Linker => { | ||
| 345 | self.advance(); | ||
| 346 | continue :indic; | ||
| 347 | }, | ||
| 348 | .Consonant => { | ||
| 349 | // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant | ||
| 350 | indic_offset = codepoint.offset; | ||
| 351 | self.advance(); | ||
| 352 | |||
| 353 | if (self.buf[0]) |cp1| { | ||
| 354 | state.setIndic(); | ||
| 355 | |||
| 356 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | ||
| 357 | |||
| 358 | if (!state.hasIndic()) { | ||
| 359 | continue :indic; | ||
| 360 | } else { | ||
| 361 | break :indic; | ||
| 362 | } | ||
| 363 | } else { | ||
| 364 | break :indic; | ||
| 365 | } | ||
| 366 | }, | ||
| 367 | .none => { | ||
| 368 | // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant | ||
| 369 | self.pending = .{ .extend_end = indic_offset }; | ||
| 370 | return .{ | ||
| 371 | .len = @intCast(grapheme_end - indic_offset), | ||
| 372 | .offset = indic_offset, | ||
| 373 | }; | ||
| 374 | }, | ||
| 375 | } | ||
| 376 | } | ||
| 377 | } | ||
| 378 | |||
| 379 | if (!state.hasXpic()) { | ||
| 380 | // BUF: [?Any, ZWJ] Emoji | ||
| 381 | var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 382 | |||
| 383 | // Look for previous Emoji | ||
| 384 | emoji: while (true) { | ||
| 385 | if (self.buf[0] == null) { | ||
| 386 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 387 | return .{ | ||
| 388 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 389 | .offset = emoji_offset, | ||
| 390 | }; | ||
| 391 | } | ||
| 392 | |||
| 393 | const codepoint = self.buf[0].?; | ||
| 394 | |||
| 395 | if (self.data.gbp(codepoint.code) == .Extend) { | ||
| 396 | self.advance(); | ||
| 397 | continue :emoji; | ||
| 398 | } | ||
| 399 | |||
| 400 | if (self.data.isEmoji(codepoint.code)) { | ||
| 401 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* | ||
| 402 | emoji_offset = codepoint.offset; | ||
| 403 | self.advance(); | ||
| 404 | |||
| 405 | if (self.buf[0] != null and | ||
| 406 | // ZWJ = 0x200d | ||
| 407 | self.buf[0].?.code == 0x200d) | ||
| 408 | { | ||
| 409 | // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* | ||
| 410 | // Back at the beginning of the loop, "recursively" look for emoji | ||
| 411 | self.advance(); | ||
| 412 | continue :emoji; | ||
| 413 | } else { | ||
| 414 | // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* | ||
| 415 | break :emoji; | ||
| 416 | } | ||
| 417 | } else { | ||
| 418 | // BUF: [Any, Extend] (Extend* ZWJ Emoji)* | ||
| 419 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 420 | return .{ | ||
| 421 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 422 | .offset = emoji_offset, | ||
| 423 | }; | ||
| 424 | } | ||
| 425 | } | ||
| 426 | } | ||
| 427 | |||
| 428 | if (state.hasRegional()) { | ||
| 429 | var ri_count: usize = 0; | ||
| 430 | while (self.buf[0] != null and | ||
| 431 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | ||
| 432 | { | ||
| 433 | ri_count += 1; | ||
| 434 | self.advance(); | ||
| 435 | } | ||
| 436 | |||
| 437 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 438 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 439 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 440 | if (ri_count == 0) { | ||
| 441 | // There are no pending RI codepoints | ||
| 442 | } else if (ri_count % 2 == 0) { | ||
| 443 | self.pending = .{ .ri_count = ri_count }; | ||
| 444 | return .{ .len = 8, .offset = grapheme_end - 8 }; | ||
| 445 | } else { | ||
| 446 | // Add one to count for the unused RI | ||
| 447 | self.pending = .{ .ri_count = ri_count + 1 }; | ||
| 448 | return .{ .len = 4, .offset = grapheme_end - 4 }; | ||
| 449 | } | ||
| 450 | } | ||
| 451 | } | ||
| 452 | |||
| 453 | const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; | ||
| 454 | self.advance(); | ||
| 455 | return .{ | ||
| 456 | .len = @intCast(grapheme_end - grapheme_start), | ||
| 457 | .offset = grapheme_start, | ||
| 458 | }; | ||
| 459 | } | ||
| 460 | }; | ||
| 461 | |||
| 242 | // Predicates | 462 | // Predicates |
| 243 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 463 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 244 | // Extract relevant properties. | 464 | // Extract relevant properties. |
| @@ -258,7 +478,7 @@ pub const State = struct { | |||
| 258 | self.bits |= 1; | 478 | self.bits |= 1; |
| 259 | } | 479 | } |
| 260 | fn unsetXpic(self: *State) void { | 480 | fn unsetXpic(self: *State) void { |
| 261 | self.bits ^= 1; | 481 | self.bits &= ~@as(u3, 1); |
| 262 | } | 482 | } |
| 263 | 483 | ||
| 264 | // Regional Indicator (flags) | 484 | // Regional Indicator (flags) |
| @@ -269,7 +489,7 @@ pub const State = struct { | |||
| 269 | self.bits |= 2; | 489 | self.bits |= 2; |
| 270 | } | 490 | } |
| 271 | fn unsetRegional(self: *State) void { | 491 | fn unsetRegional(self: *State) void { |
| 272 | self.bits ^= 2; | 492 | self.bits &= ~@as(u3, 2); |
| 273 | } | 493 | } |
| 274 | 494 | ||
| 275 | // Indic Conjunct | 495 | // Indic Conjunct |
| @@ -280,7 +500,7 @@ pub const State = struct { | |||
| 280 | self.bits |= 4; | 500 | self.bits |= 4; |
| 281 | } | 501 | } |
| 282 | fn unsetIndic(self: *State) void { | 502 | fn unsetIndic(self: *State) void { |
| 283 | self.bits ^= 4; | 503 | self.bits &= ~@as(u3, 4); |
| 284 | } | 504 | } |
| 285 | }; | 505 | }; |
| 286 | 506 | ||