diff options
Diffstat (limited to 'src/Graphemes.zig')
| -rw-r--r-- | src/Graphemes.zig | 479 |
1 files changed, 370 insertions, 109 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -1,12 +1,7 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Graphemes Module |
| 2 | const builtin = @import("builtin"); | 2 | //! |
| 3 | const mem = std.mem; | 3 | //! Code for handling graphemes: fragments of string which should be |
| 4 | const Allocator = mem.Allocator; | 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻‍🌾 |
| 5 | const compress = std.compress; | ||
| 6 | const unicode = std.unicode; | ||
| 7 | |||
| 8 | const CodePoint = @import("code_point").CodePoint; | ||
| 9 | const CodePointIterator = @import("code_point").Iterator; | ||
| 10 | 5 | ||
| 11 | s1: []u16 = undefined, | 6 | s1: []u16 = undefined, |
| 12 | s2: []u16 = undefined, | 7 | s2: []u16 = undefined, |
| @@ -66,10 +61,16 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | |||
| 66 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | 61 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; |
| 67 | } | 62 | } |
| 68 | 63 | ||
| 64 | /// Returns an iterator over the graphemes in `string`. | ||
| 69 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | 65 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { |
| 70 | return Iterator.init(string, graphemes); | 66 | return Iterator.init(string, graphemes); |
| 71 | } | 67 | } |
| 72 | 68 | ||
| 69 | /// Returns a reverse iterator over the graphemes in `string`. | ||
| 70 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | ||
| 71 | return ReverseIterator.init(string, graphemes); | ||
| 72 | } | ||
| 73 | |||
| 73 | /// Indic syllable type. | 74 | /// Indic syllable type. |
| 74 | pub const Indic = enum { | 75 | pub const Indic = enum { |
| 75 | none, | 76 | none, |
| @@ -99,8 +100,8 @@ pub const Gbp = enum { | |||
| 99 | 100 | ||
| 100 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 101 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 101 | pub const Grapheme = struct { | 102 | pub const Grapheme = struct { |
| 102 | len: u8, | 103 | len: uoffset, |
| 103 | offset: u32, | 104 | offset: uoffset, |
| 104 | 105 | ||
| 105 | /// `bytes` returns the slice of bytes that correspond to | 106 | /// `bytes` returns the slice of bytes that correspond to |
| 106 | /// this grapheme cluster in `src`. | 107 | /// this grapheme cluster in `src`. |
| @@ -109,6 +110,96 @@ pub const Grapheme = struct { | |||
| 109 | } | 110 | } |
| 110 | }; | 111 | }; |
| 111 | 112 | ||
| 113 | // NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage | ||
| 114 | // of being composed of other parts, but the constant factor can _probably_ be improved | ||
| 115 | // by a bespoke implementation using graphemes.graphemeBreak directly. There's a limit | ||
| 116 | // to how much cycle-bumming I'm willing to do at any given moment; that limit has been | ||
| 117 | // reached. Perhaps you, Dear Reader, might pick up the torch? | ||
| 118 | |||
| 119 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a | ||
| 120 | /// valid start of a codepoint. Asserts the string is not empty. Index must be | ||
| 121 | /// less than `string.len`. Always returns a `Grapheme`. | ||
| 122 | pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { | ||
| 123 | assert(string.len != 0); | ||
| 124 | if (index == 0 or (index > 0 and | ||
| 125 | string[index] < 0x80 and | ||
| 126 | string[index - 1] < 0x80) and | ||
| 127 | (string[index - 1] != '\r' and string[index] != '\n')) | ||
| 128 | { | ||
| 129 | // There's always a grapheme break between two ASCII code points (except CRLF) | ||
| 130 | var iter = graphemes.iterator(string[index..]); | ||
| 131 | const next = iter.next().?; | ||
| 132 | return Grapheme{ | ||
| 133 | .len = next.len, | ||
| 134 | .offset = @as(u32, @intCast(index)) + next.offset, | ||
| 135 | }; | ||
| 136 | } // Otherwise it gets hairy. | ||
| 137 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; | ||
| 138 | if (idx == string.len) { | ||
| 139 | var iter = graphemes.reverseIterator(string); | ||
| 140 | return iter.prev().?; | ||
| 141 | } | ||
| 142 | // We're on a valid codepoint boundary, we go back from here | ||
| 143 | var r_iter = graphemes.reverseIterAtIndex(string, idx); | ||
| 144 | if (r_iter.prev()) |g| { | ||
| 145 | if (g.offset == 0) { | ||
| 146 | var iter = graphemes.iterator(string); | ||
| 147 | while (iter.next()) |g2| { | ||
| 148 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; | ||
| 149 | } | ||
| 150 | } | ||
| 151 | } | ||
| 152 | // We need to toss one, because otherwise we might not be pending when | ||
| 153 | // we in fact need to be. | ||
| 154 | _ = r_iter.prev(); | ||
| 155 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} | ||
| 156 | var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); | ||
| 157 | while (iter.next()) |g| { | ||
| 158 | if (g.offset <= idx and idx < g.offset + g.len) return g; | ||
| 159 | } | ||
| 160 | unreachable; | ||
| 161 | } | ||
| 162 | |||
| 163 | /// Return a (forward) iterator of `string` after `grapheme`. | ||
| 164 | pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { | ||
| 165 | return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); | ||
| 166 | } | ||
| 167 | |||
| 168 | /// Return a reverse iterator of `string` before `grapheme`. | ||
| 169 | pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { | ||
| 170 | // This bit of weirdness is because reverse iterators are "advance last", | ||
| 171 | // while forward iterators are "advance first". This leaves some room for | ||
| 172 | // further optimization, if anyone dares. | ||
| 173 | var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); | ||
| 174 | _ = r_iter.prev(); | ||
| 175 | return r_iter; | ||
| 176 | } | ||
| 177 | |||
| 178 | fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { | ||
| 179 | var r_iter: ReverseIterator = undefined; | ||
| 180 | r_iter.data = graphemes; | ||
| 181 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | ||
| 182 | r_iter.buf[1] = rcp_iter.prev(); | ||
| 183 | r_iter.buf[0] = rcp_iter.prev(); | ||
| 184 | r_iter.pending = .none; | ||
| 185 | r_iter.cp_iter = rcp_iter; | ||
| 186 | return r_iter; | ||
| 187 | } | ||
| 188 | |||
| 189 | fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { | ||
| 190 | var iter: Iterator = undefined; | ||
| 191 | iter.data = graphemes; | ||
| 192 | iter.buf[0] = first: { | ||
| 193 | if (idx == string.len) break :first null; | ||
| 194 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | ||
| 195 | break :first r_cp_iter.prev(); | ||
| 196 | }; | ||
| 197 | var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; | ||
| 198 | iter.buf[1] = cp_iter.next(); | ||
| 199 | iter.cp_iter = cp_iter; | ||
| 200 | return iter; | ||
| 201 | } | ||
| 202 | |||
| 112 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at a time. | 203 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at a time. |
| 113 | pub const Iterator = struct { | 204 | pub const Iterator = struct { |
| 114 | buf: [2]?CodePoint = .{ null, null }, | 205 | buf: [2]?CodePoint = .{ null, null }, |
| @@ -143,7 +234,7 @@ pub const Iterator = struct { | |||
| 143 | 234 | ||
| 144 | const gc_start = self.buf[0].?.offset; | 235 | const gc_start = self.buf[0].?.offset; |
| 145 | var gc_len: u8 = self.buf[0].?.len; | 236 | var gc_len: u8 = self.buf[0].?.len; |
| 146 | var state = State{}; | 237 | var state = IterState{}; |
| 147 | 238 | ||
| 148 | if (graphemeBreak( | 239 | if (graphemeBreak( |
| 149 | self.buf[0].?.code, | 240 | self.buf[0].?.code, |
| @@ -173,72 +264,244 @@ pub const Iterator = struct { | |||
| 173 | const saved_cp_iter = self.cp_iter; | 264 | const saved_cp_iter = self.cp_iter; |
| 174 | const s0 = self.buf[0]; | 265 | const s0 = self.buf[0]; |
| 175 | const s1 = self.buf[1]; | 266 | const s1 = self.buf[1]; |
| 176 | 267 | defer { | |
| 177 | self.advance(); | ||
| 178 | |||
| 179 | // If no more | ||
| 180 | if (self.buf[0] == null) { | ||
| 181 | self.cp_iter = saved_cp_iter; | ||
| 182 | self.buf[0] = s0; | ||
| 183 | self.buf[1] = s1; | ||
| 184 | return null; | ||
| 185 | } | ||
| 186 | // If last one | ||
| 187 | if (self.buf[1] == null) { | ||
| 188 | const len = self.buf[0].?.len; | ||
| 189 | const offset = self.buf[0].?.offset; | ||
| 190 | self.cp_iter = saved_cp_iter; | ||
| 191 | self.buf[0] = s0; | ||
| 192 | self.buf[1] = s1; | ||
| 193 | return Grapheme{ .len = len, .offset = offset }; | ||
| 194 | } | ||
| 195 | // If ASCII | ||
| 196 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 197 | const len = self.buf[0].?.len; | ||
| 198 | const offset = self.buf[0].?.offset; | ||
| 199 | self.cp_iter = saved_cp_iter; | 268 | self.cp_iter = saved_cp_iter; |
| 200 | self.buf[0] = s0; | 269 | self.buf[0] = s0; |
| 201 | self.buf[1] = s1; | 270 | self.buf[1] = s1; |
| 202 | return Grapheme{ .len = len, .offset = offset }; | ||
| 203 | } | 271 | } |
| 272 | return self.next(); | ||
| 273 | } | ||
| 274 | }; | ||
| 204 | 275 | ||
| 205 | const gc_start = self.buf[0].?.offset; | 276 | /// Iterate a string backward by Grapheme. |
| 206 | var gc_len: u8 = self.buf[0].?.len; | 277 | pub const ReverseIterator = struct { |
| 207 | var state = State{}; | 278 | buf: [2]?CodePoint = .{ null, null }, |
| 279 | cp_iter: CodePointReverseIterator, | ||
| 280 | data: *const Graphemes, | ||
| 281 | /// Codepoint read from `cp_iter` but not returned by `prev` | ||
| 282 | pending: Pending = .none, | ||
| 208 | 283 | ||
| 209 | if (graphemeBreak( | 284 | const Pending = union(enum) { |
| 210 | self.buf[0].?.code, | 285 | none: void, |
| 211 | self.buf[1].?.code, | 286 | /// Count of pending RI codepoints; it is always an even number |
| 212 | self.data, | 287 | ri_count: usize, |
| 213 | &state, | 288 | /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji |
| 214 | )) { | 289 | extend_end: uoffset, |
| 215 | self.cp_iter = saved_cp_iter; | 290 | }; |
| 216 | self.buf[0] = s0; | ||
| 217 | self.buf[1] = s1; | ||
| 218 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 219 | } | ||
| 220 | 291 | ||
| 221 | while (true) { | 292 | const Self = @This(); |
| 222 | self.advance(); | ||
| 223 | if (self.buf[0] == null) break; | ||
| 224 | 293 | ||
| 225 | gc_len += self.buf[0].?.len; | 294 | pub fn init(str: []const u8, data: *const Graphemes) Self { |
| 295 | var self: Self = .{ .cp_iter = .init(str), .data = data }; | ||
| 296 | self.advance(); | ||
| 297 | self.advance(); | ||
| 298 | return self; | ||
| 299 | } | ||
| 300 | |||
| 301 | fn advance(self: *Self) void { | ||
| 302 | self.buf[1] = self.buf[0]; | ||
| 303 | self.buf[0] = self.cp_iter.prev(); | ||
| 304 | } | ||
| 305 | |||
| 306 | pub fn peek(self: *Self) ?Grapheme { | ||
| 307 | const cache = .{ self.buf, self.cp_iter, self.pending }; | ||
| 308 | defer self.buf, self.cp_iter, self.pending = cache; | ||
| 309 | return self.prev(); | ||
| 310 | } | ||
| 311 | |||
| 312 | pub fn prev(self: *Self) ?Grapheme { | ||
| 313 | if (self.buf[1] == null) return null; | ||
| 314 | |||
| 315 | const grapheme_end: uoffset = end: { | ||
| 316 | const codepoint = self.buf[1].?; | ||
| 317 | |||
| 318 | switch (self.pending) { | ||
| 319 | // BUF: [?Any, Any] | ||
| 320 | .none => break :end codepoint.offset + codepoint.len, | ||
| 321 | .ri_count => |ri_count| { | ||
| 322 | std.debug.assert(ri_count > 0); | ||
| 323 | std.debug.assert(ri_count % 2 == 0); | ||
| 324 | |||
| 325 | if (ri_count > 2) { | ||
| 326 | self.pending.ri_count -= 2; | ||
| 327 | |||
| 328 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 329 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 330 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 331 | return Grapheme{ | ||
| 332 | .len = 8, | ||
| 333 | .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), | ||
| 334 | }; | ||
| 335 | } else { | ||
| 336 | self.pending = .{ .none = {} }; | ||
| 337 | break :end codepoint.offset + codepoint.len + 4; | ||
| 338 | } | ||
| 339 | }, | ||
| 340 | // BUF: [?Any, Extend] Extend* ZWJ | ||
| 341 | .extend_end => |extend_end| { | ||
| 342 | self.pending = .{ .none = {} }; | ||
| 343 | break :end extend_end; | ||
| 344 | }, | ||
| 345 | } | ||
| 346 | }; | ||
| 347 | |||
| 348 | while (self.buf[0] != null) { | ||
| 349 | var state: IterState = .{}; | ||
| 350 | state.xpic = true; | ||
| 351 | state.regional = false; | ||
| 352 | state.indic = true; | ||
| 226 | 353 | ||
| 227 | if (graphemeBreak( | 354 | if (graphemeBreak( |
| 228 | self.buf[0].?.code, | 355 | self.buf[0].?.code, |
| 229 | if (self.buf[1]) |ncp| ncp.code else 0, | 356 | self.buf[1].?.code, |
| 230 | self.data, | 357 | self.data, |
| 231 | &state, | 358 | &state, |
| 232 | )) break; | 359 | )) break; |
| 360 | |||
| 361 | self.advance(); | ||
| 362 | |||
| 363 | if (!state.indic) { | ||
| 364 | |||
| 365 | // BUF: [?Any, Extend | Linker] Consonant | ||
| 366 | var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 367 | |||
| 368 | indic: while (true) { | ||
| 369 | if (self.buf[0] == null) { | ||
| 370 | self.pending = .{ .extend_end = indic_offset }; | ||
| 371 | return .{ | ||
| 372 | .len = @intCast(grapheme_end - indic_offset), | ||
| 373 | .offset = indic_offset, | ||
| 374 | }; | ||
| 375 | } | ||
| 376 | |||
| 377 | const codepoint = self.buf[0].?; | ||
| 378 | |||
| 379 | switch (self.data.indic(codepoint.code)) { | ||
| 380 | .Extend, .Linker => { | ||
| 381 | self.advance(); | ||
| 382 | continue :indic; | ||
| 383 | }, | ||
| 384 | .Consonant => { | ||
| 385 | // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant | ||
| 386 | indic_offset = codepoint.offset; | ||
| 387 | self.advance(); | ||
| 388 | |||
| 389 | if (self.buf[0]) |cp1| { | ||
| 390 | state.indic = true; | ||
| 391 | |||
| 392 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | ||
| 393 | |||
| 394 | if (!state.indic) { | ||
| 395 | continue :indic; | ||
| 396 | } else { | ||
| 397 | break :indic; | ||
| 398 | } | ||
| 399 | } else { | ||
| 400 | break :indic; | ||
| 401 | } | ||
| 402 | }, | ||
| 403 | .none => { | ||
| 404 | // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant | ||
| 405 | self.pending = .{ .extend_end = indic_offset }; | ||
| 406 | return .{ | ||
| 407 | .len = @intCast(grapheme_end - indic_offset), | ||
| 408 | .offset = indic_offset, | ||
| 409 | }; | ||
| 410 | }, | ||
| 411 | } | ||
| 412 | } | ||
| 413 | } | ||
| 414 | |||
| 415 | if (!state.xpic) { | ||
| 416 | // BUF: [?Any, ZWJ] Emoji | ||
| 417 | var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 418 | |||
| 419 | // Look for previous Emoji | ||
| 420 | emoji: while (true) { | ||
| 421 | if (self.buf[0] == null) { | ||
| 422 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 423 | return .{ | ||
| 424 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 425 | .offset = emoji_offset, | ||
| 426 | }; | ||
| 427 | } | ||
| 428 | |||
| 429 | const codepoint = self.buf[0].?; | ||
| 430 | |||
| 431 | if (self.data.gbp(codepoint.code) == .Extend) { | ||
| 432 | self.advance(); | ||
| 433 | continue :emoji; | ||
| 434 | } | ||
| 435 | |||
| 436 | if (self.data.isEmoji(codepoint.code)) { | ||
| 437 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* | ||
| 438 | emoji_offset = codepoint.offset; | ||
| 439 | self.advance(); | ||
| 440 | |||
| 441 | if (self.buf[0] != null and | ||
| 442 | // ZWJ = 0x200d | ||
| 443 | self.buf[0].?.code == 0x200d) | ||
| 444 | { | ||
| 445 | // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* | ||
| 446 | // Back at the beginning of the loop, "recursively" look for emoji | ||
| 447 | self.advance(); | ||
| 448 | continue :emoji; | ||
| 449 | } else { | ||
| 450 | // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* | ||
| 451 | break :emoji; | ||
| 452 | } | ||
| 453 | } else { | ||
| 454 | // BUF: [Any, Extend] (Extend* ZWJ Emoji)* | ||
| 455 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 456 | return .{ | ||
| 457 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 458 | .offset = emoji_offset, | ||
| 459 | }; | ||
| 460 | } | ||
| 461 | } | ||
| 462 | } | ||
| 463 | |||
| 464 | if (state.regional) { | ||
| 465 | var ri_count: usize = 0; | ||
| 466 | while (self.buf[0] != null and | ||
| 467 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | ||
| 468 | { | ||
| 469 | ri_count += 1; | ||
| 470 | self.advance(); | ||
| 471 | } | ||
| 472 | |||
| 473 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 474 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 475 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 476 | if (ri_count == 0) { | ||
| 477 | // There are no pending RI codepoints | ||
| 478 | } else if (ri_count % 2 == 0) { | ||
| 479 | self.pending = .{ .ri_count = ri_count }; | ||
| 480 | return .{ .len = 8, .offset = grapheme_end - 8 }; | ||
| 481 | } else { | ||
| 482 | // Add one to count for the unused RI | ||
| 483 | self.pending = .{ .ri_count = ri_count + 1 }; | ||
| 484 | return .{ .len = 4, .offset = grapheme_end - 4 }; | ||
| 485 | } | ||
| 486 | } | ||
| 233 | } | 487 | } |
| 234 | self.cp_iter = saved_cp_iter; | ||
| 235 | self.buf[0] = s0; | ||
| 236 | self.buf[1] = s1; | ||
| 237 | 488 | ||
| 238 | return Grapheme{ .len = gc_len, .offset = gc_start }; | 489 | const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; |
| 490 | self.advance(); | ||
| 491 | return .{ | ||
| 492 | .len = @intCast(grapheme_end - grapheme_start), | ||
| 493 | .offset = grapheme_start, | ||
| 494 | }; | ||
| 239 | } | 495 | } |
| 240 | }; | 496 | }; |
| 241 | 497 | ||
| 498 | /// Grapheme Iterator state. | ||
| 499 | pub const IterState = packed struct(u3) { | ||
| 500 | xpic: bool = false, | ||
| 501 | regional: bool = false, | ||
| 502 | indic: bool = false, | ||
| 503 | }; | ||
| 504 | |||
| 242 | // Predicates | 505 | // Predicates |
| 243 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 506 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 244 | // Extract relevant properties. | 507 | // Extract relevant properties. |
| @@ -246,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { | |||
| 246 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 509 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 247 | } | 510 | } |
| 248 | 511 | ||
| 249 | // Grapheme break state. | ||
| 250 | pub const State = struct { | ||
| 251 | bits: u3 = 0, | ||
| 252 | |||
| 253 | // Extended Pictographic (emoji) | ||
| 254 | fn hasXpic(self: State) bool { | ||
| 255 | return self.bits & 1 == 1; | ||
| 256 | } | ||
| 257 | fn setXpic(self: *State) void { | ||
| 258 | self.bits |= 1; | ||
| 259 | } | ||
| 260 | fn unsetXpic(self: *State) void { | ||
| 261 | self.bits ^= 1; | ||
| 262 | } | ||
| 263 | |||
| 264 | // Regional Indicator (flags) | ||
| 265 | fn hasRegional(self: State) bool { | ||
| 266 | return self.bits & 2 == 2; | ||
| 267 | } | ||
| 268 | fn setRegional(self: *State) void { | ||
| 269 | self.bits |= 2; | ||
| 270 | } | ||
| 271 | fn unsetRegional(self: *State) void { | ||
| 272 | self.bits ^= 2; | ||
| 273 | } | ||
| 274 | |||
| 275 | // Indic Conjunct | ||
| 276 | fn hasIndic(self: State) bool { | ||
| 277 | return self.bits & 4 == 4; | ||
| 278 | } | ||
| 279 | fn setIndic(self: *State) void { | ||
| 280 | self.bits |= 4; | ||
| 281 | } | ||
| 282 | fn unsetIndic(self: *State) void { | ||
| 283 | self.bits ^= 4; | ||
| 284 | } | ||
| 285 | }; | ||
| 286 | |||
| 287 | /// `graphemeBreak` returns true only if a grapheme break point is required | 512 | /// `graphemeBreak` returns true only if a grapheme break point is required |
| 288 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | 513 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling |
| 289 | /// iteratively over a sequence of code points, this function must be called | 514 | /// iteratively over a sequence of code points, this function must be called |
| @@ -294,7 +519,7 @@ pub fn graphemeBreak( | |||
| 294 | cp1: u21, | 519 | cp1: u21, |
| 295 | cp2: u21, | 520 | cp2: u21, |
| 296 | data: *const Graphemes, | 521 | data: *const Graphemes, |
| 297 | state: *State, | 522 | state: *IterState, |
| 298 | ) bool { | 523 | ) bool { |
| 299 | // Extract relevant properties. | 524 | // Extract relevant properties. |
| 300 | const cp1_gbp_prop = data.gbp(cp1); | 525 | const cp1_gbp_prop = data.gbp(cp1); |
| @@ -306,9 +531,9 @@ pub fn graphemeBreak( | |||
| 306 | const cp2_is_emoji = data.isEmoji(cp2); | 531 | const cp2_is_emoji = data.isEmoji(cp2); |
| 307 | 532 | ||
| 308 | // GB11: Emoji Extend* ZWJ x Emoji | 533 | // GB11: Emoji Extend* ZWJ x Emoji |
| 309 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | 534 | if (!state.xpic and cp1_is_emoji) state.xpic = true; |
| 310 | // GB9c: Indic Conjunct Break | 535 | // GB9c: Indic Conjunct Break |
| 311 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); | 536 | if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; |
| 312 | 537 | ||
| 313 | // GB3: CR x LF | 538 | // GB3: CR x LF |
| 314 | if (cp1 == '\r' and cp2 == '\n') return false; | 539 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -317,11 +542,11 @@ pub fn graphemeBreak( | |||
| 317 | if (isBreaker(cp1, data)) return true; | 542 | if (isBreaker(cp1, data)) return true; |
| 318 | 543 | ||
| 319 | // GB11: Emoji Extend* ZWJ x Emoji | 544 | // GB11: Emoji Extend* ZWJ x Emoji |
| 320 | if (state.hasXpic() and | 545 | if (state.xpic and |
| 321 | cp1_gbp_prop == .ZWJ and | 546 | cp1_gbp_prop == .ZWJ and |
| 322 | cp2_is_emoji) | 547 | cp2_is_emoji) |
| 323 | { | 548 | { |
| 324 | state.unsetXpic(); | 549 | state.xpic = false; |
| 325 | return false; | 550 | return false; |
| 326 | } | 551 | } |
| 327 | 552 | ||
| @@ -336,11 +561,11 @@ pub fn graphemeBreak( | |||
| 336 | 561 | ||
| 337 | // GB12, GB13: RI x RI | 562 | // GB12, GB13: RI x RI |
| 338 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 563 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| 339 | if (state.hasRegional()) { | 564 | if (state.regional) { |
| 340 | state.unsetRegional(); | 565 | state.regional = false; |
| 341 | return true; | 566 | return true; |
| 342 | } else { | 567 | } else { |
| 343 | state.setRegional(); | 568 | state.regional = true; |
| 344 | return false; | 569 | return false; |
| 345 | } | 570 | } |
| 346 | } | 571 | } |
| @@ -365,25 +590,25 @@ pub fn graphemeBreak( | |||
| 365 | } | 590 | } |
| 366 | 591 | ||
| 367 | // GB9c: Indic Conjunct Break | 592 | // GB9c: Indic Conjunct Break |
| 368 | if (state.hasIndic() and | 593 | if (state.indic and |
| 369 | cp1_indic_prop == .Consonant and | 594 | cp1_indic_prop == .Consonant and |
| 370 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | 595 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) |
| 371 | { | 596 | { |
| 372 | return false; | 597 | return false; |
| 373 | } | 598 | } |
| 374 | 599 | ||
| 375 | if (state.hasIndic() and | 600 | if (state.indic and |
| 376 | cp1_indic_prop == .Extend and | 601 | cp1_indic_prop == .Extend and |
| 377 | cp2_indic_prop == .Linker) | 602 | cp2_indic_prop == .Linker) |
| 378 | { | 603 | { |
| 379 | return false; | 604 | return false; |
| 380 | } | 605 | } |
| 381 | 606 | ||
| 382 | if (state.hasIndic() and | 607 | if (state.indic and |
| 383 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | 608 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and |
| 384 | cp2_indic_prop == .Consonant) | 609 | cp2_indic_prop == .Consonant) |
| 385 | { | 610 | { |
| 386 | state.unsetIndic(); | 611 | state.indic = false; |
| 387 | return false; | 612 | return false; |
| 388 | } | 613 | } |
| 389 | 614 | ||
| @@ -421,3 +646,39 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 421 | try std.testing.expectEqual(@as(usize, 2), i); | 646 | try std.testing.expectEqual(@as(usize, 2), i); |
| 422 | } | 647 | } |
| 423 | } | 648 | } |
| 649 | |||
| 650 | test "Iterator.peek" { | ||
| 651 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 652 | const data = try Graphemes.init(std.testing.allocator); | ||
| 653 | defer data.deinit(std.testing.allocator); | ||
| 654 | |||
| 655 | var iter = data.iterator(peek_seq); | ||
| 656 | const peek_a = iter.peek().?; | ||
| 657 | const next_a = iter.next().?; | ||
| 658 | try std.testing.expectEqual(peek_a, next_a); | ||
| 659 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 660 | const peek_d1 = iter.peek().?; | ||
| 661 | const peek_d2 = iter.peek().?; | ||
| 662 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 663 | const next_d = iter.next().?; | ||
| 664 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 665 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 666 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 667 | try std.testing.expectEqual(null, iter.peek()); | ||
| 668 | try std.testing.expectEqual(null, iter.peek()); | ||
| 669 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 670 | } | ||
| 671 | |||
| 672 | const std = @import("std"); | ||
| 673 | const builtin = @import("builtin"); | ||
| 674 | const assert = std.debug.assert; | ||
| 675 | const mem = std.mem; | ||
| 676 | const Allocator = mem.Allocator; | ||
| 677 | const compress = std.compress; | ||
| 678 | const unicode = std.unicode; | ||
| 679 | |||
| 680 | const code_point = @import("code_point"); | ||
| 681 | const CodePoint = code_point.CodePoint; | ||
| 682 | const CodePointIterator = code_point.Iterator; | ||
| 683 | const CodePointReverseIterator = code_point.ReverseIterator; | ||
| 684 | const uoffset = code_point.uoffset; | ||