diff options
| author | 2025-06-01 14:08:25 -0400 | |
|---|---|---|
| committer | 2025-06-01 14:08:25 -0400 | |
| commit | 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 (patch) | |
| tree | 4ec54815215a9a808be0ab9a2968159f144ba076 /src | |
| parent | Document "fat_offset" in README (diff) | |
| download | zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.gz zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.xz zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.zip | |
Add graphemeAtIndex + iterate before and after
That completes the set. I do think it's possible to bum a few more
cycles from the implementation, but, I'm not going to. It passes
the acceptance suite and that's what it needs to do.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Graphemes.zig | 220 | ||||
| -rw-r--r-- | src/Words.zig | 4 | ||||
| -rw-r--r-- | src/code_point.zig | 60 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 69 |
4 files changed, 266 insertions, 87 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 49fdbf3..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -1,15 +1,7 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Graphemes Module |
| 2 | const builtin = @import("builtin"); | 2 | //! |
| 3 | const mem = std.mem; | 3 | //! Code for handling graphemes: fragments of string which should be |
| 4 | const Allocator = mem.Allocator; | 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻🌾 |
| 5 | const compress = std.compress; | ||
| 6 | const unicode = std.unicode; | ||
| 7 | |||
| 8 | const code_point = @import("code_point"); | ||
| 9 | const CodePoint = code_point.CodePoint; | ||
| 10 | const CodePointIterator = code_point.Iterator; | ||
| 11 | const CodePointReverseIterator = code_point.ReverseIterator; | ||
| 12 | const uoffset = code_point.uoffset; | ||
| 13 | 5 | ||
| 14 | s1: []u16 = undefined, | 6 | s1: []u16 = undefined, |
| 15 | s2: []u16 = undefined, | 7 | s2: []u16 = undefined, |
| @@ -69,10 +61,12 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | |||
| 69 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | 61 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; |
| 70 | } | 62 | } |
| 71 | 63 | ||
| 64 | /// Returns an iterator over the graphemes in `string`. | ||
| 72 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | 65 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { |
| 73 | return Iterator.init(string, graphemes); | 66 | return Iterator.init(string, graphemes); |
| 74 | } | 67 | } |
| 75 | 68 | ||
| 69 | /// Returns a reverse iterator over the graphemes in `string`. | ||
| 76 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | 70 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { |
| 77 | return ReverseIterator.init(string, graphemes); | 71 | return ReverseIterator.init(string, graphemes); |
| 78 | } | 72 | } |
| @@ -116,6 +110,96 @@ pub const Grapheme = struct { | |||
| 116 | } | 110 | } |
| 117 | }; | 111 | }; |
| 118 | 112 | ||
| 113 | // NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage | ||
| 114 | // of being composed of other parts, but the constant factor can _probably_ be improved | ||
| 115 | // by a bespoke implementation using graphemes.graphemeBreak directly. There's a limit | |
| 116 | // to how much cycle-bumming I'm willing to do at any given moment; that limit has been | ||
| 117 | // reached. Perhaps you, Dear Reader, might pick up the torch? | ||
| 118 | |||
| 119 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a | ||
| 120 | /// valid start of a codepoint. Asserts the string is not empty. Index must be | ||
| 121 | /// less than `string.len`. Always returns a `Grapheme`. | ||
| 122 | pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { | ||
| 123 | assert(string.len != 0); | ||
| 124 | if (index == 0 or (index > 0 and | ||
| 125 | string[index] < 0x80 and | ||
| 126 | string[index - 1] < 0x80) and | ||
| 127 | (string[index - 1] != '\r' and string[index] != '\n')) | ||
| 128 | { | ||
| 129 | // There's always a grapheme break between two ASCII code points (except CRLF) | ||
| 130 | var iter = graphemes.iterator(string[index..]); | ||
| 131 | const next = iter.next().?; | ||
| 132 | return Grapheme{ | ||
| 133 | .len = next.len, | ||
| 134 | .offset = @as(u32, @intCast(index)) + next.offset, | ||
| 135 | }; | ||
| 136 | } // Otherwise it gets hairy. | ||
| 137 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; | ||
| 138 | if (idx == string.len) { | ||
| 139 | var iter = graphemes.reverseIterator(string); | ||
| 140 | return iter.prev().?; | ||
| 141 | } | ||
| 142 | // We're on a valid codepoint boundary, we go back from here | ||
| 143 | var r_iter = graphemes.reverseIterAtIndex(string, idx); | ||
| 144 | if (r_iter.prev()) |g| { | ||
| 145 | if (g.offset == 0) { | ||
| 146 | var iter = graphemes.iterator(string); | ||
| 147 | while (iter.next()) |g2| { | ||
| 148 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; | ||
| 149 | } | ||
| 150 | } | ||
| 151 | } | ||
| 152 | // We need to toss one, because otherwise we might not be pending when | ||
| 153 | // we in fact need to be. | ||
| 154 | _ = r_iter.prev(); | ||
| 155 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} | ||
| 156 | var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); | ||
| 157 | while (iter.next()) |g| { | ||
| 158 | if (g.offset <= idx and idx < g.offset + g.len) return g; | ||
| 159 | } | ||
| 160 | unreachable; | ||
| 161 | } | ||
| 162 | |||
| 163 | /// Return a (forward) iterator of `string` after `grapheme`. | ||
| 164 | pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { | ||
| 165 | return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); | ||
| 166 | } | ||
| 167 | |||
| 168 | /// Return a reverse iterator of `string` before `grapheme`. | ||
| 169 | pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { | ||
| 170 | // This bit of weirdness is because reverse iterators are "advance last", | ||
| 171 | // while forward iterators are "advance first". This leaves some room for | ||
| 172 | // further optimization, if anyone dares. | ||
| 173 | var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); | ||
| 174 | _ = r_iter.prev(); | ||
| 175 | return r_iter; | ||
| 176 | } | ||
| 177 | |||
| 178 | fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { | ||
| 179 | var r_iter: ReverseIterator = undefined; | ||
| 180 | r_iter.data = graphemes; | ||
| 181 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | ||
| 182 | r_iter.buf[1] = rcp_iter.prev(); | ||
| 183 | r_iter.buf[0] = rcp_iter.prev(); | ||
| 184 | r_iter.pending = .none; | ||
| 185 | r_iter.cp_iter = rcp_iter; | ||
| 186 | return r_iter; | ||
| 187 | } | ||
| 188 | |||
| 189 | fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { | ||
| 190 | var iter: Iterator = undefined; | ||
| 191 | iter.data = graphemes; | ||
| 192 | iter.buf[0] = first: { | ||
| 193 | if (idx == string.len) break :first null; | ||
| 194 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | ||
| 195 | break :first r_cp_iter.prev(); | ||
| 196 | }; | ||
| 197 | var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; | ||
| 198 | iter.buf[1] = cp_iter.next(); | ||
| 199 | iter.cp_iter = cp_iter; | ||
| 200 | return iter; | ||
| 201 | } | ||
| 202 | |||
| 119 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. | 203 | /// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time. |
| 120 | pub const Iterator = struct { | 204 | pub const Iterator = struct { |
| 121 | buf: [2]?CodePoint = .{ null, null }, | 205 | buf: [2]?CodePoint = .{ null, null }, |
| @@ -150,7 +234,7 @@ pub const Iterator = struct { | |||
| 150 | 234 | ||
| 151 | const gc_start = self.buf[0].?.offset; | 235 | const gc_start = self.buf[0].?.offset; |
| 152 | var gc_len: u8 = self.buf[0].?.len; | 236 | var gc_len: u8 = self.buf[0].?.len; |
| 153 | var state = State{}; | 237 | var state = IterState{}; |
| 154 | 238 | ||
| 155 | if (graphemeBreak( | 239 | if (graphemeBreak( |
| 156 | self.buf[0].?.code, | 240 | self.buf[0].?.code, |
| @@ -189,12 +273,13 @@ pub const Iterator = struct { | |||
| 189 | } | 273 | } |
| 190 | }; | 274 | }; |
| 191 | 275 | ||
| 276 | /// Iterate a string backward by Grapheme. | ||
| 192 | pub const ReverseIterator = struct { | 277 | pub const ReverseIterator = struct { |
| 193 | buf: [2]?CodePoint = .{ null, null }, | 278 | buf: [2]?CodePoint = .{ null, null }, |
| 194 | cp_iter: CodePointReverseIterator, | 279 | cp_iter: CodePointReverseIterator, |
| 195 | data: *const Graphemes, | 280 | data: *const Graphemes, |
| 196 | /// Codepoint read from `cp_iter` but not returned by `prev` | 281 | /// Codepoint read from `cp_iter` but not returned by `prev` |
| 197 | pending: Pending = .{ .none = {} }, | 282 | pending: Pending = .none, |
| 198 | 283 | ||
| 199 | const Pending = union(enum) { | 284 | const Pending = union(enum) { |
| 200 | none: void, | 285 | none: void, |
| @@ -218,6 +303,12 @@ pub const ReverseIterator = struct { | |||
| 218 | self.buf[0] = self.cp_iter.prev(); | 303 | self.buf[0] = self.cp_iter.prev(); |
| 219 | } | 304 | } |
| 220 | 305 | ||
| 306 | pub fn peek(self: *Self) ?Grapheme { | ||
| 307 | const cache = .{ self.buf, self.cp_iter, self.pending }; | ||
| 308 | defer self.buf, self.cp_iter, self.pending = cache; | ||
| 309 | return self.prev(); | ||
| 310 | } | ||
| 311 | |||
| 221 | pub fn prev(self: *Self) ?Grapheme { | 312 | pub fn prev(self: *Self) ?Grapheme { |
| 222 | if (self.buf[1] == null) return null; | 313 | if (self.buf[1] == null) return null; |
| 223 | 314 | ||
| @@ -255,10 +346,10 @@ pub const ReverseIterator = struct { | |||
| 255 | }; | 346 | }; |
| 256 | 347 | ||
| 257 | while (self.buf[0] != null) { | 348 | while (self.buf[0] != null) { |
| 258 | var state: State = .{}; | 349 | var state: IterState = .{}; |
| 259 | state.setXpic(); | 350 | state.xpic = true; |
| 260 | state.unsetRegional(); | 351 | state.regional = false; |
| 261 | state.setIndic(); | 352 | state.indic = true; |
| 262 | 353 | ||
| 263 | if (graphemeBreak( | 354 | if (graphemeBreak( |
| 264 | self.buf[0].?.code, | 355 | self.buf[0].?.code, |
| @@ -269,7 +360,7 @@ pub const ReverseIterator = struct { | |||
| 269 | 360 | ||
| 270 | self.advance(); | 361 | self.advance(); |
| 271 | 362 | ||
| 272 | if (!state.hasIndic()) { | 363 | if (!state.indic) { |
| 273 | 364 | ||
| 274 | // BUF: [?Any, Extend | Linker] Consonant | 365 | // BUF: [?Any, Extend | Linker] Consonant |
| 275 | var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; | 366 | var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; |
| @@ -296,11 +387,11 @@ pub const ReverseIterator = struct { | |||
| 296 | self.advance(); | 387 | self.advance(); |
| 297 | 388 | ||
| 298 | if (self.buf[0]) |cp1| { | 389 | if (self.buf[0]) |cp1| { |
| 299 | state.setIndic(); | 390 | state.indic = true; |
| 300 | 391 | ||
| 301 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | 392 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; |
| 302 | 393 | ||
| 303 | if (!state.hasIndic()) { | 394 | if (!state.indic) { |
| 304 | continue :indic; | 395 | continue :indic; |
| 305 | } else { | 396 | } else { |
| 306 | break :indic; | 397 | break :indic; |
| @@ -321,7 +412,7 @@ pub const ReverseIterator = struct { | |||
| 321 | } | 412 | } |
| 322 | } | 413 | } |
| 323 | 414 | ||
| 324 | if (!state.hasXpic()) { | 415 | if (!state.xpic) { |
| 325 | // BUF: [?Any, ZWJ] Emoji | 416 | // BUF: [?Any, ZWJ] Emoji |
| 326 | var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; | 417 | var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; |
| 327 | 418 | ||
| @@ -370,7 +461,7 @@ pub const ReverseIterator = struct { | |||
| 370 | } | 461 | } |
| 371 | } | 462 | } |
| 372 | 463 | ||
| 373 | if (state.hasRegional()) { | 464 | if (state.regional) { |
| 374 | var ri_count: usize = 0; | 465 | var ri_count: usize = 0; |
| 375 | while (self.buf[0] != null and | 466 | while (self.buf[0] != null and |
| 376 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | 467 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) |
| @@ -404,6 +495,13 @@ pub const ReverseIterator = struct { | |||
| 404 | } | 495 | } |
| 405 | }; | 496 | }; |
| 406 | 497 | ||
| 498 | /// Grapheme Iterator state. | ||
| 499 | pub const IterState = packed struct(u3) { | ||
| 500 | xpic: bool = false, | ||
| 501 | regional: bool = false, | ||
| 502 | indic: bool = false, | ||
| 503 | }; | ||
| 504 | |||
| 407 | // Predicates | 505 | // Predicates |
| 408 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 506 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 409 | // Extract relevant properties. | 507 | // Extract relevant properties. |
| @@ -411,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { | |||
| 411 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 509 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 412 | } | 510 | } |
| 413 | 511 | ||
| 414 | // Grapheme break state. | ||
| 415 | pub const State = struct { | ||
| 416 | bits: u3 = 0, | ||
| 417 | |||
| 418 | // Extended Pictographic (emoji) | ||
| 419 | fn hasXpic(self: State) bool { | ||
| 420 | return self.bits & 1 == 1; | ||
| 421 | } | ||
| 422 | fn setXpic(self: *State) void { | ||
| 423 | self.bits |= 1; | ||
| 424 | } | ||
| 425 | fn unsetXpic(self: *State) void { | ||
| 426 | self.bits &= ~@as(u3, 1); | ||
| 427 | } | ||
| 428 | |||
| 429 | // Regional Indicator (flags) | ||
| 430 | fn hasRegional(self: State) bool { | ||
| 431 | return self.bits & 2 == 2; | ||
| 432 | } | ||
| 433 | fn setRegional(self: *State) void { | ||
| 434 | self.bits |= 2; | ||
| 435 | } | ||
| 436 | fn unsetRegional(self: *State) void { | ||
| 437 | self.bits &= ~@as(u3, 2); | ||
| 438 | } | ||
| 439 | |||
| 440 | // Indic Conjunct | ||
| 441 | fn hasIndic(self: State) bool { | ||
| 442 | return self.bits & 4 == 4; | ||
| 443 | } | ||
| 444 | fn setIndic(self: *State) void { | ||
| 445 | self.bits |= 4; | ||
| 446 | } | ||
| 447 | fn unsetIndic(self: *State) void { | ||
| 448 | self.bits &= ~@as(u3, 4); | ||
| 449 | } | ||
| 450 | }; | ||
| 451 | |||
| 452 | /// `graphemeBreak` returns true only if a grapheme break point is required | 512 | /// `graphemeBreak` returns true only if a grapheme break point is required |
| 453 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | 513 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling |
| 454 | /// iteratively over a sequence of code points, this function must be called | 514 | /// iteratively over a sequence of code points, this function must be called |
| @@ -459,7 +519,7 @@ pub fn graphemeBreak( | |||
| 459 | cp1: u21, | 519 | cp1: u21, |
| 460 | cp2: u21, | 520 | cp2: u21, |
| 461 | data: *const Graphemes, | 521 | data: *const Graphemes, |
| 462 | state: *State, | 522 | state: *IterState, |
| 463 | ) bool { | 523 | ) bool { |
| 464 | // Extract relevant properties. | 524 | // Extract relevant properties. |
| 465 | const cp1_gbp_prop = data.gbp(cp1); | 525 | const cp1_gbp_prop = data.gbp(cp1); |
| @@ -471,9 +531,9 @@ pub fn graphemeBreak( | |||
| 471 | const cp2_is_emoji = data.isEmoji(cp2); | 531 | const cp2_is_emoji = data.isEmoji(cp2); |
| 472 | 532 | ||
| 473 | // GB11: Emoji Extend* ZWJ x Emoji | 533 | // GB11: Emoji Extend* ZWJ x Emoji |
| 474 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | 534 | if (!state.xpic and cp1_is_emoji) state.xpic = true; |
| 475 | // GB9c: Indic Conjunct Break | 535 | // GB9c: Indic Conjunct Break |
| 476 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); | 536 | if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; |
| 477 | 537 | ||
| 478 | // GB3: CR x LF | 538 | // GB3: CR x LF |
| 479 | if (cp1 == '\r' and cp2 == '\n') return false; | 539 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -482,11 +542,11 @@ pub fn graphemeBreak( | |||
| 482 | if (isBreaker(cp1, data)) return true; | 542 | if (isBreaker(cp1, data)) return true; |
| 483 | 543 | ||
| 484 | // GB11: Emoji Extend* ZWJ x Emoji | 544 | // GB11: Emoji Extend* ZWJ x Emoji |
| 485 | if (state.hasXpic() and | 545 | if (state.xpic and |
| 486 | cp1_gbp_prop == .ZWJ and | 546 | cp1_gbp_prop == .ZWJ and |
| 487 | cp2_is_emoji) | 547 | cp2_is_emoji) |
| 488 | { | 548 | { |
| 489 | state.unsetXpic(); | 549 | state.xpic = false; |
| 490 | return false; | 550 | return false; |
| 491 | } | 551 | } |
| 492 | 552 | ||
| @@ -501,11 +561,11 @@ pub fn graphemeBreak( | |||
| 501 | 561 | ||
| 502 | // GB12, GB13: RI x RI | 562 | // GB12, GB13: RI x RI |
| 503 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 563 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| 504 | if (state.hasRegional()) { | 564 | if (state.regional) { |
| 505 | state.unsetRegional(); | 565 | state.regional = false; |
| 506 | return true; | 566 | return true; |
| 507 | } else { | 567 | } else { |
| 508 | state.setRegional(); | 568 | state.regional = true; |
| 509 | return false; | 569 | return false; |
| 510 | } | 570 | } |
| 511 | } | 571 | } |
| @@ -530,25 +590,25 @@ pub fn graphemeBreak( | |||
| 530 | } | 590 | } |
| 531 | 591 | ||
| 532 | // GB9c: Indic Conjunct Break | 592 | // GB9c: Indic Conjunct Break |
| 533 | if (state.hasIndic() and | 593 | if (state.indic and |
| 534 | cp1_indic_prop == .Consonant and | 594 | cp1_indic_prop == .Consonant and |
| 535 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | 595 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) |
| 536 | { | 596 | { |
| 537 | return false; | 597 | return false; |
| 538 | } | 598 | } |
| 539 | 599 | ||
| 540 | if (state.hasIndic() and | 600 | if (state.indic and |
| 541 | cp1_indic_prop == .Extend and | 601 | cp1_indic_prop == .Extend and |
| 542 | cp2_indic_prop == .Linker) | 602 | cp2_indic_prop == .Linker) |
| 543 | { | 603 | { |
| 544 | return false; | 604 | return false; |
| 545 | } | 605 | } |
| 546 | 606 | ||
| 547 | if (state.hasIndic() and | 607 | if (state.indic and |
| 548 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | 608 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and |
| 549 | cp2_indic_prop == .Consonant) | 609 | cp2_indic_prop == .Consonant) |
| 550 | { | 610 | { |
| 551 | state.unsetIndic(); | 611 | state.indic = false; |
| 552 | return false; | 612 | return false; |
| 553 | } | 613 | } |
| 554 | 614 | ||
| @@ -608,3 +668,17 @@ test "Iterator.peek" { | |||
| 608 | try std.testing.expectEqual(null, iter.peek()); | 668 | try std.testing.expectEqual(null, iter.peek()); |
| 609 | try std.testing.expectEqual(iter.peek(), iter.next()); | 669 | try std.testing.expectEqual(iter.peek(), iter.next()); |
| 610 | } | 670 | } |
| 671 | |||
| 672 | const std = @import("std"); | ||
| 673 | const builtin = @import("builtin"); | ||
| 674 | const assert = std.debug.assert; | ||
| 675 | const mem = std.mem; | ||
| 676 | const Allocator = mem.Allocator; | ||
| 677 | const compress = std.compress; | ||
| 678 | const unicode = std.unicode; | ||
| 679 | |||
| 680 | const code_point = @import("code_point"); | ||
| 681 | const CodePoint = code_point.CodePoint; | ||
| 682 | const CodePointIterator = code_point.Iterator; | ||
| 683 | const CodePointReverseIterator = code_point.ReverseIterator; | ||
| 684 | const uoffset = code_point.uoffset; | ||
diff --git a/src/Words.zig b/src/Words.zig index 1707881..af82562 100644 --- a/src/Words.zig +++ b/src/Words.zig | |||
| @@ -124,12 +124,12 @@ pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { | |||
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | /// Returns an iterator after the `word` in `slice`. | 126 | /// Returns an iterator after the `word` in `slice`. |
| 127 | pub fn iterateAfter(words: *const Words, slice: []const u8, word: Word) Iterator { | 127 | pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { |
| 128 | return forwardFromIndex(words, slice, word.offset + word.len); | 128 | return forwardFromIndex(words, slice, word.offset + word.len); |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | /// Returns a reverse iterator before the `word` in `slice`. | 131 | /// Returns a reverse iterator before the `word` in `slice`. |
| 132 | pub fn iterateBefore(words: *const Words, slice: []const u8, word: Word) ReverseIterator { | 132 | pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { |
| 133 | return reverseFromIndex(words, slice, word.offset); | 133 | return reverseFromIndex(words, slice, word.offset); |
| 134 | } | 134 | } |
| 135 | 135 | ||
diff --git a/src/code_point.zig b/src/code_point.zig index 8bd3d5b..16648af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { | |||
| 39 | return null; | 39 | return null; |
| 40 | } | 40 | } |
| 41 | 41 | ||
| 42 | /// Return the codepoint at `index`, even if `index` is in the middle | ||
| 43 | /// of that codepoint. | ||
| 44 | pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { | ||
| 45 | var idx = index; | ||
| 46 | while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} | ||
| 47 | return decodeAtIndex(bytes, idx); | ||
| 48 | } | ||
| 49 | |||
| 42 | /// Decode the CodePoint, if any, at `bytes[idx]`. | 50 | /// Decode the CodePoint, if any, at `bytes[idx]`. |
| 43 | pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { | 51 | pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { |
| 44 | var off = idx; | 52 | var off = index; |
| 45 | return decodeAtCursor(bytes, &off); | 53 | return decodeAtCursor(bytes, &off); |
| 46 | } | 54 | } |
| 47 | 55 | ||
| @@ -329,6 +337,54 @@ test Iterator { | |||
| 329 | try expectEqual(@as(?CodePoint, null), iter.next()); | 337 | try expectEqual(@as(?CodePoint, null), iter.next()); |
| 330 | } | 338 | } |
| 331 | 339 | ||
| 340 | const code_point = @This(); | ||
| 341 | |||
| 342 | // Keep this in sync with the README | ||
| 343 | test "Code point iterator" { | ||
| 344 | const str = "Hi 😊"; | ||
| 345 | var iter: code_point.Iterator = .init(str); | ||
| 346 | var i: usize = 0; | ||
| 347 | |||
| 348 | while (iter.next()) |cp| : (i += 1) { | ||
| 349 | // The `code` field is the actual code point scalar as a `u21`. | ||
| 350 | if (i == 0) try expect(cp.code == 'H'); | ||
| 351 | if (i == 1) try expect(cp.code == 'i'); | ||
| 352 | if (i == 2) try expect(cp.code == ' '); | ||
| 353 | |||
| 354 | if (i == 3) { | ||
| 355 | try expect(cp.code == '😊'); | ||
| 356 | // The `offset` field is the byte offset in the | ||
| 357 | // source string. | ||
| 358 | try expect(cp.offset == 3); | ||
| 359 | try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); | ||
| 360 | // The `len` field is the length in bytes of the | ||
| 361 | // code point in the source string. | ||
| 362 | try expect(cp.len == 4); | ||
| 363 | // There is also a 'cursor' decode, like so: | ||
| 364 | { | ||
| 365 | var cursor = cp.offset; | ||
| 366 | try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); | ||
| 367 | // Which advances the cursor variable to the next possible | ||
| 368 | // offset, in this case, `str.len`. Don't forget to account | ||
| 369 | // for this possibility! | ||
| 370 | try expectEqual(cp.offset + cp.len, cursor); | ||
| 371 | } | ||
| 372 | // There's also this, for when you aren't sure if you have the | ||
| 373 | // correct start for a code point: | ||
| 374 | try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); | ||
| 375 | } | ||
| 376 | // Reverse iteration is also an option: | ||
| 377 | var r_iter: code_point.ReverseIterator = .init(str); | ||
| 378 | // Both iterators can be peeked: | ||
| 379 | try expectEqual('😊', r_iter.peek().?.code); | ||
| 380 | try expectEqual('😊', r_iter.prev().?.code); | ||
| 381 | // Both kinds of iterators can be reversed: | ||
| 382 | var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); | ||
| 383 | // This will always return the last codepoint from | ||
| 384 | // the prior iterator, _if_ it yielded one: | ||
| 385 | try expectEqual('😊', fwd_iter.next().?.code); | ||
| 386 | } | ||
| 387 | } | ||
| 332 | test "overlongs" { | 388 | test "overlongs" { |
| 333 | // None of these should equal `/`, all should be byte-for-byte | 389 | // None of these should equal `/`, all should be byte-for-byte |
| 334 | // handled as replacement characters. | 390 | // handled as replacement characters. |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index c463dcc..ae177a9 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -162,20 +162,51 @@ test "Segmentation GraphemeIterator" { | |||
| 162 | bytes_index += cp_index; | 162 | bytes_index += cp_index; |
| 163 | } | 163 | } |
| 164 | 164 | ||
| 165 | const this_str = all_bytes.items; | ||
| 166 | |||
| 165 | { | 167 | { |
| 166 | var iter = graph.iterator(all_bytes.items); | 168 | var iter = graph.iterator(this_str); |
| 167 | 169 | ||
| 168 | // Check. | 170 | // Check. |
| 169 | for (want.items) |want_gc| { | 171 | for (want.items, 1..) |want_gc, idx| { |
| 170 | const got_gc = (iter.next()).?; | 172 | const got_gc = (iter.next()).?; |
| 171 | try std.testing.expectEqualStrings( | 173 | try std.testing.expectEqualStrings( |
| 172 | want_gc.bytes(all_bytes.items), | 174 | want_gc.bytes(this_str), |
| 173 | got_gc.bytes(all_bytes.items), | 175 | got_gc.bytes(this_str), |
| 174 | ); | 176 | ); |
| 177 | for (got_gc.offset..got_gc.offset + got_gc.len) |i| { | ||
| 178 | const this_gc = graph.graphemeAtIndex(this_str, i); | ||
| 179 | std.testing.expectEqualSlices( | ||
| 180 | u8, | ||
| 181 | got_gc.bytes(this_str), | ||
| 182 | this_gc.bytes(this_str), | ||
| 183 | ) catch |err| { | ||
| 184 | debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); | ||
| 185 | return err; | ||
| 186 | }; | ||
| 187 | } | ||
| 188 | var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); | ||
| 189 | if (after_iter.next()) |next_gc| { | ||
| 190 | if (iter.peek()) |next_peek| { | ||
| 191 | std.testing.expectEqualSlices( | ||
| 192 | u8, | ||
| 193 | next_gc.bytes(this_str), | ||
| 194 | next_peek.bytes(this_str), | ||
| 195 | ) catch |err| { | ||
| 196 | debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx }); | ||
| 197 | return err; | ||
| 198 | }; | ||
| 199 | } else { | ||
| 200 | debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx }); | ||
| 201 | try testing.expect(false); | ||
| 202 | } | ||
| 203 | } else { | ||
| 204 | try testing.expectEqual(null, iter.peek()); | ||
| 205 | } | ||
| 175 | } | 206 | } |
| 176 | } | 207 | } |
| 177 | { | 208 | { |
| 178 | var iter = graph.reverseIterator(all_bytes.items); | 209 | var iter = graph.reverseIterator(this_str); |
| 179 | 210 | ||
| 180 | // Check. | 211 | // Check. |
| 181 | var i: usize = want.items.len; | 212 | var i: usize = want.items.len; |
| @@ -190,8 +221,8 @@ test "Segmentation GraphemeIterator" { | |||
| 190 | return error.TestExpectedEqual; | 221 | return error.TestExpectedEqual; |
| 191 | }; | 222 | }; |
| 192 | std.testing.expectEqualStrings( | 223 | std.testing.expectEqualStrings( |
| 193 | want_gc.bytes(all_bytes.items), | 224 | want_gc.bytes(this_str), |
| 194 | got_gc.bytes(all_bytes.items), | 225 | got_gc.bytes(this_str), |
| 195 | ) catch |err| { | 226 | ) catch |err| { |
| 196 | std.debug.print( | 227 | std.debug.print( |
| 197 | "line {d} grapheme {d}: expected {any} found {any}\n", | 228 | "line {d} grapheme {d}: expected {any} found {any}\n", |
| @@ -199,6 +230,24 @@ test "Segmentation GraphemeIterator" { | |||
| 199 | ); | 230 | ); |
| 200 | return err; | 231 | return err; |
| 201 | }; | 232 | }; |
| 233 | var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); | ||
| 234 | if (before_iter.prev()) |prev_gc| { | ||
| 235 | if (iter.peek()) |prev_peek| { | ||
| 236 | std.testing.expectEqualSlices( | ||
| 237 | u8, | ||
| 238 | prev_gc.bytes(this_str), | ||
| 239 | prev_peek.bytes(this_str), | ||
| 240 | ) catch |err| { | ||
| 241 | debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i }); | ||
| 242 | return err; | ||
| 243 | }; | ||
| 244 | } else { | ||
| 245 | debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i }); | ||
| 246 | try testing.expect(false); | ||
| 247 | } | ||
| 248 | } else { | ||
| 249 | try testing.expectEqual(null, iter.peek()); | ||
| 250 | } | ||
| 202 | } | 251 | } |
| 203 | } | 252 | } |
| 204 | } | 253 | } |
| @@ -287,7 +336,7 @@ test "Segmentation Word Iterator" { | |||
| 287 | } else { | 336 | } else { |
| 288 | try testing.expect(false); | 337 | try testing.expect(false); |
| 289 | } | 338 | } |
| 290 | var peek_iter = wb.iterateAfter(this_str, got_word); | 339 | var peek_iter = wb.iterateAfterWord(this_str, got_word); |
| 291 | const peek_1 = peek_iter.next(); | 340 | const peek_1 = peek_iter.next(); |
| 292 | if (peek_1) |p1| { | 341 | if (peek_1) |p1| { |
| 293 | const peek_2 = iter.peek(); | 342 | const peek_2 = iter.peek(); |
| @@ -313,7 +362,7 @@ test "Segmentation Word Iterator" { | |||
| 313 | got_word.bytes(this_str), | 362 | got_word.bytes(this_str), |
| 314 | this_word.bytes(this_str), | 363 | this_word.bytes(this_str), |
| 315 | ) catch |err| { | 364 | ) catch |err| { |
| 316 | debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); | 365 | debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); |
| 317 | return err; | 366 | return err; |
| 318 | }; | 367 | }; |
| 319 | } | 368 | } |
| @@ -356,7 +405,7 @@ test "Segmentation Word Iterator" { | |||
| 356 | } else { | 405 | } else { |
| 357 | try testing.expect(false); | 406 | try testing.expect(false); |
| 358 | } | 407 | } |
| 359 | var peek_iter = wb.iterateBefore(this_str, got_word); | 408 | var peek_iter = wb.iterateBeforeWord(this_str, got_word); |
| 360 | const peek_1 = peek_iter.prev(); | 409 | const peek_1 = peek_iter.prev(); |
| 361 | if (peek_1) |p1| { | 410 | if (peek_1) |p1| { |
| 362 | const peek_2 = r_iter.peek(); | 411 | const peek_2 = r_iter.peek(); |