diff options
| author | 2025-06-01 14:08:25 -0400 | |
|---|---|---|
| committer | 2025-06-01 14:08:25 -0400 | |
| commit | 8f5209fa095c2ed9114ce102b2f9b2cc90d66b13 (patch) | |
| tree | 4ec54815215a9a808be0ab9a2968159f144ba076 /src/code_point.zig | |
| parent | Document "fat_offset" in README (diff) | |
| download | zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.gz zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.tar.xz zg-8f5209fa095c2ed9114ce102b2f9b2cc90d66b13.zip | |
Add graphemeAtIndex + iterate before and after
That completes the set. I do think it's possible to bum a few more
cycles from the implementation, but I'm not going to. It passes
the acceptance suite, and that's what it needs to do.
Diffstat (limited to 'src/code_point.zig')
| -rw-r--r-- | src/code_point.zig | 60 |
1 files changed, 58 insertions, 2 deletions
diff --git a/src/code_point.zig b/src/code_point.zig index 8bd3d5b..16648af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -39,9 +39,17 @@ pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { | |||
| 39 | return null; | 39 | return null; |
| 40 | } | 40 | } |
| 41 | 41 | ||
| 42 | /// Return the codepoint at `index`, even if `index` is in the middle | ||
| 43 | /// of that codepoint. | ||
| 44 | pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { | ||
| 45 | var idx = index; | ||
| 46 | while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} | ||
| 47 | return decodeAtIndex(bytes, idx); | ||
| 48 | } | ||
| 49 | |||
| 42 | /// Decode the CodePoint, if any, at `bytes[idx]`. | 50 | /// Decode the CodePoint, if any, at `bytes[idx]`. |
| 43 | pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { | 51 | pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { |
| 44 | var off = idx; | 52 | var off = index; |
| 45 | return decodeAtCursor(bytes, &off); | 53 | return decodeAtCursor(bytes, &off); |
| 46 | } | 54 | } |
| 47 | 55 | ||
| @@ -329,6 +337,54 @@ test Iterator { | |||
| 329 | try expectEqual(@as(?CodePoint, null), iter.next()); | 337 | try expectEqual(@as(?CodePoint, null), iter.next()); |
| 330 | } | 338 | } |
| 331 | 339 | ||
| 340 | const code_point = @This(); | ||
| 341 | |||
| 342 | // Keep this in sync with the README | ||
| 343 | test "Code point iterator" { | ||
| 344 | const str = "Hi 😊"; | ||
| 345 | var iter: code_point.Iterator = .init(str); | ||
| 346 | var i: usize = 0; | ||
| 347 | |||
| 348 | while (iter.next()) |cp| : (i += 1) { | ||
| 349 | // The `code` field is the actual code point scalar as a `u21`. | ||
| 350 | if (i == 0) try expect(cp.code == 'H'); | ||
| 351 | if (i == 1) try expect(cp.code == 'i'); | ||
| 352 | if (i == 2) try expect(cp.code == ' '); | ||
| 353 | |||
| 354 | if (i == 3) { | ||
| 355 | try expect(cp.code == '😊'); | ||
| 356 | // The `offset` field is the byte offset in the | ||
| 357 | // source string. | ||
| 358 | try expect(cp.offset == 3); | ||
| 359 | try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); | ||
| 360 | // The `len` field is the length in bytes of the | ||
| 361 | // code point in the source string. | ||
| 362 | try expect(cp.len == 4); | ||
| 363 | // There is also a 'cursor' decode, like so: | ||
| 364 | { | ||
| 365 | var cursor = cp.offset; | ||
| 366 | try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); | ||
| 367 | // Which advances the cursor variable to the next possible | ||
| 368 | // offset, in this case, `str.len`. Don't forget to account | ||
| 369 | // for this possibility! | ||
| 370 | try expectEqual(cp.offset + cp.len, cursor); | ||
| 371 | } | ||
| 372 | // There's also this, for when you aren't sure if you have the | ||
| 373 | // correct start for a code point: | ||
| 374 | try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); | ||
| 375 | } | ||
| 376 | // Reverse iteration is also an option: | ||
| 377 | var r_iter: code_point.ReverseIterator = .init(str); | ||
| 378 | // Both iterators can be peeked: | ||
| 379 | try expectEqual('😊', r_iter.peek().?.code); | ||
| 380 | try expectEqual('😊', r_iter.prev().?.code); | ||
| 381 | // Both kinds of iterators can be reversed: | ||
| 382 | var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); | ||
| 383 | // This will always return the last codepoint from | ||
| 384 | // the prior iterator, _if_ it yielded one: | ||
| 385 | try expectEqual('😊', fwd_iter.next().?.code); | ||
| 386 | } | ||
| 387 | } | ||
| 332 | test "overlongs" { | 388 | test "overlongs" { |
| 333 | // None of these should equal `/`, all should be byte-for-byte | 389 | // None of these should equal `/`, all should be byte-for-byte |
| 334 | // handled as replacement characters. | 390 | // handled as replacement characters. |