diff options
| author | 2025-07-08 12:15:32 -0400 | |
|---|---|---|
| committer | 2025-07-08 12:15:32 -0400 | |
| commit | 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch) | |
| tree | 2607c185fd8053b84d60041fadc35c05a0225d34 /src/Words.zig | |
| parent | Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff) | |
| parent | Add Words.zig example to README (diff) | |
| download | zg-master.tar.gz zg-master.tar.xz zg-master.zip | |
Diffstat (limited to 'src/Words.zig')
| -rw-r--r-- | src/Words.zig | 773 |
1 files changed, 773 insertions, 0 deletions
diff --git a/src/Words.zig b/src/Words.zig new file mode 100644 index 0000000..617c34d --- /dev/null +++ b/src/Words.zig | |||
| @@ -0,0 +1,773 @@ | |||
| 1 | //! Word Breaking Algorithm. | ||
| 2 | //! | ||
| 3 | //! https://www.unicode.org/reports/tr29/#Word_Boundaries | ||
| 4 | //! | ||
| 5 | |||
| 6 | const WordBreakProperty = enum(u5) { | ||
| 7 | none, | ||
| 8 | Double_Quote, | ||
| 9 | Single_Quote, | ||
| 10 | Hebrew_Letter, | ||
| 11 | CR, | ||
| 12 | LF, | ||
| 13 | Newline, | ||
| 14 | Extend, | ||
| 15 | Regional_Indicator, | ||
| 16 | Format, | ||
| 17 | Katakana, | ||
| 18 | ALetter, | ||
| 19 | MidLetter, | ||
| 20 | MidNum, | ||
| 21 | MidNumLet, | ||
| 22 | Numeric, | ||
| 23 | ExtendNumLet, | ||
| 24 | ZWJ, | ||
| 25 | WSegSpace, | ||
| 26 | }; | ||
| 27 | |||
| 28 | s1: []u16 = undefined, | ||
| 29 | s2: []u5 = undefined, | ||
| 30 | |||
| 31 | const Words = @This(); | ||
| 32 | |||
| 33 | pub fn init(allocator: Allocator) Allocator.Error!Words { | ||
| 34 | var wb: Words = undefined; | ||
| 35 | try wb.setup(allocator); | ||
| 36 | return wb; | ||
| 37 | } | ||
| 38 | |||
| 39 | pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { | ||
| 40 | wb.setupImpl(allocator) catch |err| { | ||
| 41 | switch (err) { | ||
| 42 | error.OutOfMemory => |e| return e, | ||
| 43 | else => unreachable, | ||
| 44 | } | ||
| 45 | }; | ||
| 46 | } | ||
| 47 | |||
| 48 | pub fn deinit(words: *const Words, allocator: mem.Allocator) void { | ||
| 49 | allocator.free(words.s1); | ||
| 50 | allocator.free(words.s2); | ||
| 51 | } | ||
| 52 | |||
| 53 | /// Represents a Unicode word span, as an offset into the source string | ||
| 54 | /// and the length of the word. | ||
| 55 | pub const Word = struct { | ||
| 56 | offset: uoffset, | ||
| 57 | len: uoffset, | ||
| 58 | |||
| 59 | /// Returns a slice of the word given the source string. | ||
| 60 | pub fn bytes(word: Word, src: []const u8) []const u8 { | ||
| 61 | return src[word.offset..][0..word.len]; | ||
| 62 | } | ||
| 63 | }; | ||
| 64 | |||
| 65 | /// Returns the word break property type for `cp`. | ||
| 66 | pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { | ||
| 67 | return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); | ||
| 68 | } | ||
| 69 | |||
| 70 | /// Convenience function for working with CodePoints | ||
| 71 | fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { | ||
| 72 | return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); | ||
| 73 | } | ||
| 74 | |||
| 75 | /// Returns the Word at the given index. Asserts that the index is less than | ||
| 76 | /// `string.len`, and that the string is not empty. Always returns a word. | ||
| 77 | /// The index does not have to be the start of a codepoint in the word. | ||
| 78 | pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { | ||
| 79 | assert(index < string.len and string.len > 0); | ||
| 80 | var iter_back: ReverseIterator = reverseFromIndex(words, string, index); | ||
| 81 | const first_back = iter_back.prev(); | ||
| 82 | if (first_back) |back| { | ||
| 83 | if (back.offset == 0) { | ||
| 84 | var iter_fwd = words.iterator(string); | ||
| 85 | while (iter_fwd.next()) |word| { | ||
| 86 | if (word.offset <= index and index < word.offset + word.len) | ||
| 87 | return word; | ||
| 88 | } | ||
| 89 | } | ||
| 90 | } else { | ||
| 91 | var iter_fwd = words.iterator(string); | ||
| 92 | while (iter_fwd.next()) |word| { | ||
| 93 | if (word.offset <= index and index < word.offset + word.len) | ||
| 94 | return word; | ||
| 95 | } | ||
| 96 | } | ||
| 97 | _ = iter_back.prev(); | ||
| 98 | // There's sometimes flags: | ||
| 99 | if (iter_back.flags > 0) { | ||
| 100 | while (iter_back.flags > 0) { | ||
| 101 | if (iter_back.prev()) |_| { | ||
| 102 | continue; | ||
| 103 | } else { | ||
| 104 | break; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | } | ||
| 108 | var iter_fwd = iter_back.forwardIterator(); | ||
| 109 | while (iter_fwd.next()) |word| { | ||
| 110 | if (word.offset <= index and index < word.offset + word.len) | ||
| 111 | return word; | ||
| 112 | } | ||
| 113 | unreachable; | ||
| 114 | } | ||
| 115 | |||
| 116 | /// Returns an iterator over words in `slice`. | ||
| 117 | pub fn iterator(words: *const Words, slice: []const u8) Iterator { | ||
| 118 | return Iterator.init(words, slice); | ||
| 119 | } | ||
| 120 | |||
| 121 | /// Returns a reverse iterator over the words in `slice`. | ||
| 122 | pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { | ||
| 123 | return ReverseIterator.init(words, slice); | ||
| 124 | } | ||
| 125 | |||
| 126 | /// Returns an iterator after the `word` in `slice`. | ||
| 127 | pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { | ||
| 128 | return forwardFromIndex(words, slice, word.offset + word.len); | ||
| 129 | } | ||
| 130 | |||
| 131 | /// Returns a reverse iterator before the `word` in `slice`. | ||
| 132 | pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { | ||
| 133 | return reverseFromIndex(words, slice, word.offset); | ||
| 134 | } | ||
| 135 | |||
| 136 | /// An iterator, forward, over all words in a provided string. | ||
| 137 | pub const Iterator = struct { | ||
| 138 | this: ?CodePoint = null, | ||
| 139 | that: ?CodePoint = null, | ||
| 140 | cp_iter: CodepointIterator, | ||
| 141 | wb: *const Words, | ||
| 142 | |||
| 143 | /// Assumes `str` is valid UTF-8. | ||
| 144 | pub fn init(words: *const Words, str: []const u8) Iterator { | ||
| 145 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; | ||
| 146 | wb_iter.advance(); | ||
| 147 | return wb_iter; | ||
| 148 | } | ||
| 149 | |||
| 150 | /// Returns the next word segment, without advancing. | ||
| 151 | pub fn peek(iter: *Iterator) ?Word { | ||
| 152 | const cache = .{ iter.this, iter.that, iter.cp_iter }; | ||
| 153 | defer { | ||
| 154 | iter.this, iter.that, iter.cp_iter = cache; | ||
| 155 | } | ||
| 156 | return iter.next(); | ||
| 157 | } | ||
| 158 | |||
| 159 | /// Returns a reverse iterator from the point this iterator is paused | ||
| 160 | /// at. Usually, and always when using the API to create iterators, | ||
| 161 | /// calling `prev()` will return the word just seen. | ||
| 162 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { | ||
| 163 | var cp_it = iter.cp_iter.reverseIterator(); | ||
| 164 | if (iter.that) |_| | ||
| 165 | _ = cp_it.prev(); | ||
| 166 | if (iter.cp_iter.peek()) |_| | ||
| 167 | _ = cp_it.prev(); | ||
| 168 | return .{ | ||
| 169 | .wb = iter.wb, | ||
| 170 | .before = cp_it.prev(), | ||
| 171 | .after = iter.that, | ||
| 172 | .cp_iter = cp_it, | ||
| 173 | }; | ||
| 174 | } | ||
| 175 | |||
| 176 | /// Returns the next word segment, if any. | ||
| 177 | pub fn next(iter: *Iterator) ?Word { | ||
| 178 | iter.advance(); | ||
| 179 | |||
| 180 | // Done? | ||
| 181 | if (iter.this == null) return null; | ||
| 182 | // Last? | ||
| 183 | if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; | ||
| 184 | |||
| 185 | const word_start = iter.this.?.offset; | ||
| 186 | var word_len: uoffset = 0; | ||
| 187 | |||
| 188 | // State variables. | ||
| 189 | var last_p: WordBreakProperty = .none; | ||
| 190 | var last_last_p: WordBreakProperty = .none; | ||
| 191 | var ri_count: usize = 0; | ||
| 192 | |||
| 193 | scan: while (true) : (iter.advance()) { | ||
| 194 | const this = iter.this.?; | ||
| 195 | word_len += this.len; | ||
| 196 | if (iter.that) |that| { | ||
| 197 | const this_p = iter.wb.breakProp(this); | ||
| 198 | const that_p = iter.wb.breakProp(that); | ||
| 199 | if (!isIgnorable(this_p)) { | ||
| 200 | last_last_p = last_p; | ||
| 201 | last_p = this_p; | ||
| 202 | } | ||
| 203 | // WB3 CR × LF | ||
| 204 | if (this_p == .CR and that_p == .LF) continue :scan; | ||
| 205 | // WB3a (Newline | CR | LF) ÷ | ||
| 206 | if (isNewline(this_p)) break :scan; | ||
| 207 | // WB3b ÷ (Newline | CR | LF) | ||
| 208 | if (isNewline(that_p)) break :scan; | ||
| 209 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 210 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { | ||
| 211 | continue :scan; | ||
| 212 | } | ||
| 213 | // WB3d WSegSpace × WSegSpace | ||
| 214 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | ||
| 215 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 216 | if (isIgnorable(that_p)) { | ||
| 217 | continue :scan; | ||
| 218 | } // Now we use last_p instead of this_p for ignorable's sake | ||
| 219 | if (isAHLetter(last_p)) { | ||
| 220 | // WB5 AHLetter × AHLetter | ||
| 221 | if (isAHLetter(that_p)) continue :scan; | ||
| 222 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 223 | if (isMidVal(that_p)) { | ||
| 224 | const next_val = iter.peekPast(); | ||
| 225 | if (next_val) |next_cp| { | ||
| 226 | const next_p = iter.wb.breakProp(next_cp); | ||
| 227 | if (isAHLetter(next_p)) { | ||
| 228 | continue :scan; | ||
| 229 | } | ||
| 230 | } | ||
| 231 | } | ||
| 232 | } | ||
| 233 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 234 | if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { | ||
| 235 | continue :scan; | ||
| 236 | } | ||
| 237 | if (last_p == .Hebrew_Letter) { | ||
| 238 | // WB7a Hebrew_Letter × Single_Quote | ||
| 239 | if (that_p == .Single_Quote) continue :scan; | ||
| 240 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 241 | if (that_p == .Double_Quote) { | ||
| 242 | const next_val = iter.peekPast(); | ||
| 243 | if (next_val) |next_cp| { | ||
| 244 | const next_p = iter.wb.breakProp(next_cp); | ||
| 245 | if (next_p == .Hebrew_Letter) { | ||
| 246 | continue :scan; | ||
| 247 | } | ||
| 248 | } | ||
| 249 | } | ||
| 250 | } | ||
| 251 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 252 | if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) | ||
| 253 | continue :scan; | ||
| 254 | // WB8 Numeric × Numeric | ||
| 255 | if (last_p == .Numeric and that_p == .Numeric) continue :scan; | ||
| 256 | // WB9 AHLetter × Numeric | ||
| 257 | if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; | ||
| 258 | // WB10 Numeric × AHLetter | ||
| 259 | if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; | ||
| 260 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 261 | if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) | ||
| 262 | continue :scan; | ||
| 263 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 264 | if (last_p == .Numeric and isMidNum(that_p)) { | ||
| 265 | const next_val = iter.peekPast(); | ||
| 266 | if (next_val) |next_cp| { | ||
| 267 | const next_p = iter.wb.breakProp(next_cp); | ||
| 268 | if (next_p == .Numeric) { | ||
| 269 | continue :scan; | ||
| 270 | } | ||
| 271 | } | ||
| 272 | } | ||
| 273 | // WB13 Katakana × Katakana | ||
| 274 | if (last_p == .Katakana and that_p == .Katakana) continue :scan; | ||
| 275 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 276 | if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; | ||
| 277 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 278 | if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | ||
| 279 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | ||
| 280 | const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; | ||
| 281 | if (maybe_flag) { | ||
| 282 | ri_count += 1; | ||
| 283 | if (ri_count % 2 == 1) continue :scan; | ||
| 284 | } | ||
| 285 | // WB999 Any ÷ Any | ||
| 286 | break :scan; | ||
| 287 | } else { // iter.that == null | ||
| 288 | break :scan; | ||
| 289 | } | ||
| 290 | } | ||
| 291 | |||
| 292 | return Word{ .len = word_len, .offset = word_start }; | ||
| 293 | } | ||
| 294 | |||
| 295 | pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 296 | try writer.print( | ||
| 297 | "Iterator {{ .this = {any}, .that = {any} }}", | ||
| 298 | .{ iter.this, iter.that }, | ||
| 299 | ); | ||
| 300 | } | ||
| 301 | |||
| 302 | fn advance(iter: *Iterator) void { | ||
| 303 | iter.this = iter.that; | ||
| 304 | iter.that = iter.cp_iter.next(); | ||
| 305 | } | ||
| 306 | |||
| 307 | fn peekPast(iter: *Iterator) ?CodePoint { | ||
| 308 | const save_cp = iter.cp_iter; | ||
| 309 | defer iter.cp_iter = save_cp; | ||
| 310 | while (iter.cp_iter.peek()) |peeked| { | ||
| 311 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 312 | _ = iter.cp_iter.next(); | ||
| 313 | } | ||
| 314 | return null; | ||
| 315 | } | ||
| 316 | }; | ||
| 317 | |||
| 318 | /// An iterator, backward, over all words in a provided string. | ||
| 319 | pub const ReverseIterator = struct { | ||
| 320 | after: ?CodePoint = null, | ||
| 321 | before: ?CodePoint = null, | ||
| 322 | cp_iter: ReverseCodepointIterator, | ||
| 323 | wb: *const Words, | ||
| 324 | flags: usize = 0, | ||
| 325 | |||
| 326 | /// Assumes `str` is valid UTF-8. | ||
| 327 | pub fn init(words: *const Words, str: []const u8) ReverseIterator { | ||
| 328 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; | ||
| 329 | wb_iter.advance(); | ||
| 330 | return wb_iter; | ||
| 331 | } | ||
| 332 | |||
| 333 | /// Returns the previous word segment, if any, without advancing. | ||
| 334 | pub fn peek(iter: *ReverseIterator) ?Word { | ||
| 335 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; | ||
| 336 | defer { | ||
| 337 | iter.before, iter.after, iter.cp_iter, iter.flags = cache; | ||
| 338 | } | ||
| 339 | return iter.prev(); | ||
| 340 | } | ||
| 341 | |||
| 342 | /// Return a forward iterator from where this iterator paused. Usually, | ||
| 343 | /// and always when using the API to create iterators, calling `next()` | ||
| 344 | /// will return the word just seen. | ||
| 345 | pub fn forwardIterator(iter: *ReverseIterator) Iterator { | ||
| 346 | var cp_it = iter.cp_iter.forwardIterator(); | ||
| 347 | if (iter.before) |_| | ||
| 348 | _ = cp_it.next(); | ||
| 349 | return .{ | ||
| 350 | .wb = iter.wb, | ||
| 351 | .this = cp_it.next(), | ||
| 352 | .that = iter.after, | ||
| 353 | .cp_iter = cp_it, | ||
| 354 | }; | ||
| 355 | } | ||
| 356 | |||
| 357 | /// Return the previous word, if any. | ||
| 358 | pub fn prev(iter: *ReverseIterator) ?Word { | ||
| 359 | iter.advance(); | ||
| 360 | |||
| 361 | // Done? | ||
| 362 | if (iter.after == null) return null; | ||
| 363 | // Last? | ||
| 364 | if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; | ||
| 365 | |||
| 366 | const word_end = iter.after.?.offset + iter.after.?.len; | ||
| 367 | var word_len: uoffset = 0; | ||
| 368 | |||
| 369 | // State variables. | ||
| 370 | var last_p: WordBreakProperty = .none; | ||
| 371 | var last_last_p: WordBreakProperty = .none; | ||
| 372 | |||
| 373 | scan: while (true) : (iter.advance()) { | ||
| 374 | const after = iter.after.?; | ||
| 375 | word_len += after.len; | ||
| 376 | if (iter.before) |before| { | ||
| 377 | var sneak = sneaky(iter); // 'sneaks' past ignorables | ||
| 378 | const after_p = iter.wb.breakProp(after); | ||
| 379 | var before_p = iter.wb.breakProp(before); | ||
| 380 | if (!isIgnorable(after_p)) { | ||
| 381 | last_last_p = last_p; | ||
| 382 | last_p = after_p; | ||
| 383 | } | ||
| 384 | // WB3 CR × LF | ||
| 385 | if (before_p == .CR and after_p == .LF) continue :scan; | ||
| 386 | // WB3a (Newline | CR | LF) ÷ | ||
| 387 | if (isNewline(before_p)) break :scan; | ||
| 388 | // WB3b ÷ (Newline | CR | LF) | ||
| 389 | if (isNewline(after_p)) break :scan; | ||
| 390 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 391 | if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { | ||
| 392 | continue :scan; | ||
| 393 | } | ||
| 394 | // WB3d WSegSpace × WSegSpace | ||
| 395 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; | ||
| 396 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 397 | if (isIgnorable(before_p)) { | ||
| 398 | const maybe_before = sneak.prev(); | ||
| 399 | if (maybe_before) |valid_before| { | ||
| 400 | before_p = iter.wb.breakProp(valid_before); | ||
| 401 | } else if (!isIgnorable(after_p)) { | ||
| 402 | // We're done | ||
| 403 | break :scan; | ||
| 404 | } | ||
| 405 | } | ||
| 406 | if (isIgnorable(after_p)) continue :scan; | ||
| 407 | // WB5 AHLetter × AHLetter | ||
| 408 | if (isAHLetter(last_p) and isAHLetter(before_p)) { | ||
| 409 | continue :scan; | ||
| 410 | } | ||
| 411 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 412 | if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { | ||
| 413 | continue :scan; | ||
| 414 | } | ||
| 415 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 416 | if (isMidVal(before_p) and isAHLetter(last_p)) { | ||
| 417 | const prev_val = sneak.peek(); | ||
| 418 | if (prev_val) |prev_cp| { | ||
| 419 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 420 | if (isAHLetter(prev_p)) { | ||
| 421 | continue :scan; | ||
| 422 | } | ||
| 423 | } | ||
| 424 | } | ||
| 425 | // WB7a Hebrew_Letter × Single_Quote | ||
| 426 | if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; | ||
| 427 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 428 | if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { | ||
| 429 | continue :scan; | ||
| 430 | } | ||
| 431 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 432 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { | ||
| 433 | const prev_val = sneak.peek(); | ||
| 434 | if (prev_val) |prev_cp| { | ||
| 435 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 436 | if (prev_p == .Hebrew_Letter) { | ||
| 437 | continue :scan; | ||
| 438 | } | ||
| 439 | } | ||
| 440 | } | ||
| 441 | // WB8 Numeric × Numeric | ||
| 442 | if (before_p == .Numeric and last_p == .Numeric) continue :scan; | ||
| 443 | // WB9 AHLetter × Numeric | ||
| 444 | if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; | ||
| 445 | // WB10 Numeric × AHLetter | ||
| 446 | if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; | ||
| 447 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 448 | if (isMidNum(before_p) and last_p == .Numeric) { | ||
| 449 | const prev_val = sneak.peek(); | ||
| 450 | if (prev_val) |prev_cp| { | ||
| 451 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 452 | if (prev_p == .Numeric) { | ||
| 453 | continue :scan; | ||
| 454 | } | ||
| 455 | } | ||
| 456 | } | ||
| 457 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 458 | if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { | ||
| 459 | continue :scan; | ||
| 460 | } | ||
| 461 | // WB13 Katakana × Katakana | ||
| 462 | if (before_p == .Katakana and last_p == .Katakana) continue :scan; | ||
| 463 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 464 | if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; | ||
| 465 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 466 | if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; | ||
| 467 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | ||
| 468 | // NOTE: | ||
| 469 | // So here we simply have to know whether a run of flags is even or odd. | ||
| 470 | // The whole run. To avoid quadratic behavior (and long flag runs are | ||
| 471 | // actually a thing in the wild), we have to count them once, store that | ||
| 472 | // on the iterator, and decrement each time we see two, possibly breaking | ||
| 473 | // once extra at the beginning. They break up one per flag, once we hit | ||
| 474 | // zero, that's all the flags. If we see another flag we do it again. | ||
| 475 | if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { | ||
| 476 | defer { | ||
| 477 | if (iter.flags > 0) iter.flags -= 1; | ||
| 478 | } | ||
| 479 | if (iter.flags == 0) { | ||
| 480 | iter.flags = sneak.countFlags(); | ||
| 481 | } | ||
| 482 | if (iter.flags % 2 == 0) { | ||
| 483 | continue :scan; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | // WB999 Any ÷ Any | ||
| 487 | break :scan; | ||
| 488 | } | ||
| 489 | break :scan; | ||
| 490 | } | ||
| 491 | return Word{ .len = word_len, .offset = word_end - word_len }; | ||
| 492 | } | ||
| 493 | |||
| 494 | pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 495 | try writer.print( | ||
| 496 | "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", | ||
| 497 | .{ iter.before, iter.after, iter.flags }, | ||
| 498 | ); | ||
| 499 | } | ||
| 500 | |||
| 501 | fn peekPast(iter: *ReverseIterator) ?CodePoint { | ||
| 502 | const save_cp = iter.cp_iter; | ||
| 503 | defer iter.cp_iter = save_cp; | ||
| 504 | while (iter.cp_iter.peek()) |peeked| { | ||
| 505 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 506 | _ = iter.cp_iter.prev(); | ||
| 507 | } | ||
| 508 | return null; | ||
| 509 | } | ||
| 510 | |||
| 511 | fn advance(iter: *ReverseIterator) void { | ||
| 512 | iter.after = iter.before; | ||
| 513 | iter.before = iter.cp_iter.prev(); | ||
| 514 | } | ||
| 515 | }; | ||
| 516 | |||
| 517 | //| Implementation Details | ||
| 518 | |||
| 519 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | ||
| 520 | fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { | ||
| 521 | var idx: uoffset = @intCast(index); | ||
| 522 | // Find the next lead byte: | ||
| 523 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | ||
| 524 | if (idx == string.len) return words.reverseIterator(string); | ||
| 525 | var iter: ReverseIterator = undefined; | ||
| 526 | iter.wb = words; | ||
| 527 | iter.flags = 0; | ||
| 528 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 529 | // Consider "abc| def" with the cursor as |. | ||
| 530 | // We need `before` to be `c` and `after` to be ' ', | ||
| 531 | // and `cp_iter.prev()` to be `b`. | ||
| 532 | var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 533 | iter.after = cp_iter.prev(); | ||
| 534 | iter.before = cp_iter.prev(); | ||
| 535 | iter.cp_iter = cp_iter; | ||
| 536 | return iter; | ||
| 537 | } | ||
| 538 | |||
| 539 | fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { | ||
| 540 | var idx: uoffset = @intCast(index); | ||
| 541 | if (idx == string.len) { | ||
| 542 | return .{ | ||
| 543 | .cp_iter = .{ .bytes = string, .i = idx }, | ||
| 544 | .this = null, | ||
| 545 | .that = null, | ||
| 546 | .wb = words, | ||
| 547 | }; | ||
| 548 | } | ||
| 549 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | ||
| 550 | if (idx == 0) return words.iterator(string); | ||
| 551 | var iter: Iterator = undefined; | ||
| 552 | iter.wb = words; | ||
| 553 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 554 | // Consider "abc |def" with the cursor as |. | ||
| 555 | // We need `this` to be ` ` and `that` to be 'd', | ||
| 556 | // and `cp_iter.next()` to be `d`. | ||
| 557 | idx -= 1; | ||
| 558 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | ||
| 559 | // "abc| def" | ||
| 560 | var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 561 | iter.this = cp_iter.next(); | ||
| 562 | iter.that = cp_iter.next(); | ||
| 563 | iter.cp_iter = cp_iter; | ||
| 564 | return iter; | ||
| 565 | } | ||
| 566 | |||
| 567 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | ||
| 568 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | ||
| 569 | } | ||
| 570 | |||
| 571 | const SneakIterator = struct { | ||
| 572 | cp_iter: ReverseCodepointIterator, | ||
| 573 | wb: *const Words, | ||
| 574 | |||
| 575 | fn peek(iter: *SneakIterator) ?CodePoint { | ||
| 576 | const save_cp = iter.cp_iter; | ||
| 577 | defer iter.cp_iter = save_cp; | ||
| 578 | while (iter.cp_iter.peek()) |peeked| { | ||
| 579 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 580 | _ = iter.cp_iter.prev(); | ||
| 581 | } | ||
| 582 | return null; | ||
| 583 | } | ||
| 584 | |||
| 585 | fn countFlags(iter: *SneakIterator) usize { | ||
| 586 | var flags: usize = 0; | ||
| 587 | const save_cp = iter.cp_iter; | ||
| 588 | defer iter.cp_iter = save_cp; | ||
| 589 | while (iter.cp_iter.prev()) |cp| { | ||
| 590 | const prop = iter.wb.breakProp(cp); | ||
| 591 | if (isIgnorable(prop)) continue; | ||
| 592 | if (prop == .Regional_Indicator) { | ||
| 593 | flags += 1; | ||
| 594 | } else break; | ||
| 595 | } | ||
| 596 | return flags; | ||
| 597 | } | ||
| 598 | |||
| 599 | fn prev(iter: *SneakIterator) ?CodePoint { | ||
| 600 | while (iter.cp_iter.prev()) |peeked| { | ||
| 601 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 602 | } | ||
| 603 | return null; | ||
| 604 | } | ||
| 605 | }; | ||
| 606 | |||
| 607 | inline fn setupImpl(wb: *Words, allocator: Allocator) !void { | ||
| 608 | const decompressor = compress.flate.inflate.decompressor; | ||
| 609 | const in_bytes = @embedFile("wbp"); | ||
| 610 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 611 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 612 | var reader = in_decomp.reader(); | ||
| 613 | |||
| 614 | const endian = builtin.cpu.arch.endian(); | ||
| 615 | |||
| 616 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 617 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 618 | errdefer allocator.free(wb.s1); | ||
| 619 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 620 | |||
| 621 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 622 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 623 | errdefer allocator.free(wb.s2); | ||
| 624 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 625 | var count_0: usize = 0; | ||
| 626 | for (wb.s2) |nyb| { | ||
| 627 | if (nyb == 0) count_0 += 1; | ||
| 628 | } | ||
| 629 | } | ||
| 630 | |||
| 631 | //| Predicates | ||
| 632 | |||
| 633 | inline fn isNewline(wbp: WordBreakProperty) bool { | ||
| 634 | return wbp == .CR or wbp == .LF or wbp == .Newline; | ||
| 635 | } | ||
| 636 | |||
| 637 | inline fn isIgnorable(wbp: WordBreakProperty) bool { | ||
| 638 | return switch (wbp) { | ||
| 639 | .Format, .Extend, .ZWJ => true, | ||
| 640 | else => false, | ||
| 641 | }; | ||
| 642 | } | ||
| 643 | |||
| 644 | inline fn isAHLetter(wbp: WordBreakProperty) bool { | ||
| 645 | return wbp == .ALetter or wbp == .Hebrew_Letter; | ||
| 646 | } | ||
| 647 | |||
| 648 | inline fn isMidVal(wbp: WordBreakProperty) bool { | ||
| 649 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 650 | } | ||
| 651 | |||
| 652 | inline fn isMidNum(wbp: WordBreakProperty) bool { | ||
| 653 | return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 654 | } | ||
| 655 | |||
| 656 | inline fn isExtensible(wbp: WordBreakProperty) bool { | ||
| 657 | return switch (wbp) { | ||
| 658 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | ||
| 659 | else => false, | ||
| 660 | }; | ||
| 661 | } | ||
| 662 | |||
| 663 | test "Word Break Properties" { | ||
| 664 | const wb = try Words.init(testing.allocator); | ||
| 665 | defer wb.deinit(testing.allocator); | ||
| 666 | try testing.expectEqual(.CR, wb.breakProperty('\r')); | ||
| 667 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | ||
| 668 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); | ||
| 669 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | ||
| 670 | } | ||
| 671 | |||
| 672 | test "ext_pict" { | ||
| 673 | try testing.expect(ext_pict.isMatch("👇")); | ||
| 674 | try testing.expect(ext_pict.isMatch("\u{2701}")); | ||
| 675 | } | ||
| 676 | |||
| 677 | test "Words" { | ||
| 678 | const wb = try Words.init(testing.allocator); | ||
| 679 | defer wb.deinit(testing.allocator); | ||
| 680 | const word_str = "Metonym Μετωνύμιο メトニム"; | ||
| 681 | var w_iter = wb.iterator(word_str); | ||
| 682 | try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); | ||
| 683 | // Spaces are "words" too! | ||
| 684 | try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); | ||
| 685 | const in_greek = w_iter.next().?; | ||
| 686 | for (in_greek.offset..in_greek.offset + in_greek.len) |i| { | ||
| 687 | const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); | ||
| 688 | try testing.expectEqualStrings("Μετωνύμιο", at_index); | ||
| 689 | } | ||
| 690 | _ = w_iter.next(); | ||
| 691 | try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); | ||
| 692 | } | ||
| 693 | |||
| 694 | test wordAtIndex { | ||
| 695 | const wb = try Words.init(testing.allocator); | ||
| 696 | defer wb.deinit(testing.allocator); | ||
| 697 | const t_string = "first second third"; | ||
| 698 | const second = wb.wordAtIndex(t_string, 8); | ||
| 699 | try testing.expectEqualStrings("second", second.bytes(t_string)); | ||
| 700 | const third = wb.wordAtIndex(t_string, 14); | ||
| 701 | try testing.expectEqualStrings("third", third.bytes(t_string)); | ||
| 702 | { | ||
| 703 | const first = wb.wordAtIndex(t_string, 3); | ||
| 704 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 705 | } | ||
| 706 | { | ||
| 707 | const first = wb.wordAtIndex(t_string, 0); | ||
| 708 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 709 | } | ||
| 710 | const last = wb.wordAtIndex(t_string, 14); | ||
| 711 | try testing.expectEqualStrings("third", last.bytes(t_string)); | ||
| 712 | } | ||
| 713 | |||
| 714 | const testr = "don't a:ka fin!"; | ||
| 715 | |||
| 716 | test "reversal" { | ||
| 717 | const wb = try Words.init(testing.allocator); | ||
| 718 | defer wb.deinit(testing.allocator); | ||
| 719 | { | ||
| 720 | var fwd = wb.iterator(testr); | ||
| 721 | var this_word: ?Word = fwd.next(); | ||
| 722 | |||
| 723 | while (this_word) |this| : (this_word = fwd.next()) { | ||
| 724 | var back = fwd.reverseIterator(); | ||
| 725 | const that_word = back.prev(); | ||
| 726 | if (that_word) |that| { | ||
| 727 | try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); | ||
| 728 | } else { | ||
| 729 | try testing.expect(false); | ||
| 730 | } | ||
| 731 | } | ||
| 732 | } | ||
| 733 | { | ||
| 734 | var back = wb.reverseIterator(testr); | ||
| 735 | var this_word: ?Word = back.prev(); | ||
| 736 | |||
| 737 | while (this_word) |this| : (this_word = back.prev()) { | ||
| 738 | var fwd = back.forwardIterator(); | ||
| 739 | const that_word = fwd.next(); | ||
| 740 | if (that_word) |that| { | ||
| 741 | try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); | ||
| 742 | } else { | ||
| 743 | try testing.expect(false); | ||
| 744 | } | ||
| 745 | } | ||
| 746 | } | ||
| 747 | } | ||
| 748 | |||
| 749 | fn testAllocations(allocator: Allocator) !void { | ||
| 750 | const wb = try Words.init(allocator); | ||
| 751 | wb.deinit(allocator); | ||
| 752 | } | ||
| 753 | |||
| 754 | test "allocation safety" { | ||
| 755 | try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); | ||
| 756 | } | ||
| 757 | |||
| 758 | const std = @import("std"); | ||
| 759 | const builtin = @import("builtin"); | ||
| 760 | const compress = std.compress; | ||
| 761 | const mem = std.mem; | ||
| 762 | const Allocator = mem.Allocator; | ||
| 763 | const assert = std.debug.assert; | ||
| 764 | const testing = std.testing; | ||
| 765 | |||
| 766 | const uoffset = code_point.uoffset; | ||
| 767 | |||
| 768 | const code_point = @import("code_point"); | ||
| 769 | const CodepointIterator = code_point.Iterator; | ||
| 770 | const ReverseCodepointIterator = code_point.ReverseIterator; | ||
| 771 | const CodePoint = code_point.CodePoint; | ||
| 772 | |||
| 773 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; | ||