From 9042273383de60f36a7938f0f0b49102117eef85 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 12:03:33 -0400 Subject: Proofread --- NEWS.md | 6 +++--- src/WordBreak.zig | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index a432c2f..8131878 100644 --- a/NEWS.md +++ b/NEWS.md @@ -52,9 +52,9 @@ UTF-8 into codepoints. Concerningly, this interpreted overlong sequences, which has been forbidden by Unicode for more than 20 years due to the security risks involved. -This has been replaced with a DFA decoder based on the work of [Björn -Höhrmann][UTF], which has proven itself fast[^1] and reliable. This is -a breaking change; sequences such as `"\xc0\xaf"` will no longer +This has been replaced with a DFA decoder based on the work of +[Björn Höhrmann][UTF], which has proven itself fast[^1] and reliable. +This is a breaking change; sequences such as `"\xc0\xaf"` will no longer produce the code `'/'`, nor will surrogates return their codepoint value. diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 6ada7e1..6a532f5 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig @@ -151,7 +151,8 @@ pub const Iterator = struct { } /// Returns a reverse iterator from the point this iterator is paused - /// at. Usually, calling `prev()` will return the word just seen. + /// at. Usually, and always when using the API to create iterators, + /// calling `prev()` will return the word just seen. pub fn reverseIterator(iter: *Iterator) ReverseIterator { var cp_it = iter.cp_iter.reverseIterator(); if (iter.that) |_| @@ -333,7 +334,8 @@ pub const ReverseIterator = struct { } /// Return a forward iterator from where this iterator paused. Usually, - /// calling `next()` will return the word just seen. + /// and always when using the API to create iterators, calling `next()` + /// will return the word just seen. pub fn forwardIterator(iter: *ReverseIterator) Iterator { var cp_it = iter.cp_iter.forwardIterator(); if (iter.before) |_| @@ -508,9 +510,10 @@ pub const ReverseIterator = struct { //| Implementation Details -/// Initialize a ReverseIterator at the provided index. Used in wordAtIndex. +/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { var idx: u32 = @intCast(index); + // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} if (idx == string.len) return wb.reverseIterator(string); var iter: ReverseIterator = undefined; @@ -630,8 +633,6 @@ test "Word Break Properties" { try testing.expectEqual(.LF, wb.breakProperty('\n')); try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); - var iter = wb.iterator("xxx"); - _ = iter.peek(); } test "ext_pict" { -- cgit v1.2.3