diff options
| author | 2025-05-16 12:06:32 -0400 | |
|---|---|---|
| committer | 2025-05-16 12:06:32 -0400 | |
| commit | 21d55b730e25733c98fa58de42d256adf5f80d88 (patch) | |
| tree | eb4745e5e25b122dcc2b603e87f132eb3a17b02c /src/Words.zig | |
| parent | Proofread (diff) | |
| download | zg-21d55b730e25733c98fa58de42d256adf5f80d88.tar.gz zg-21d55b730e25733c98fa58de42d256adf5f80d88.tar.xz zg-21d55b730e25733c98fa58de42d256adf5f80d88.zip | |
Move WordBreak to Words
Diffstat (limited to 'src/Words.zig')
| -rw-r--r-- | src/Words.zig | 720 |
1 files changed, 720 insertions, 0 deletions
diff --git a/src/Words.zig b/src/Words.zig new file mode 100644 index 0000000..6a532f5 --- /dev/null +++ b/src/Words.zig | |||
| @@ -0,0 +1,720 @@ | |||
| 1 | //! Word Breaking Algorithm. | ||
| 2 | |||
| 3 | const WordBreakProperty = enum(u5) { | ||
| 4 | none, | ||
| 5 | Double_Quote, | ||
| 6 | Single_Quote, | ||
| 7 | Hebrew_Letter, | ||
| 8 | CR, | ||
| 9 | LF, | ||
| 10 | Newline, | ||
| 11 | Extend, | ||
| 12 | Regional_Indicator, | ||
| 13 | Format, | ||
| 14 | Katakana, | ||
| 15 | ALetter, | ||
| 16 | MidLetter, | ||
| 17 | MidNum, | ||
| 18 | MidNumLet, | ||
| 19 | Numeric, | ||
| 20 | ExtendNumLet, | ||
| 21 | ZWJ, | ||
| 22 | WSegSpace, | ||
| 23 | }; | ||
| 24 | |||
| 25 | s1: []u16 = undefined, | ||
| 26 | s2: []u5 = undefined, | ||
| 27 | |||
| 28 | const WordBreak = @This(); | ||
| 29 | |||
| 30 | pub fn init(allocator: Allocator) Allocator.Error!WordBreak { | ||
| 31 | var wb: WordBreak = undefined; | ||
| 32 | try wb.setup(allocator); | ||
| 33 | return wb; | ||
| 34 | } | ||
| 35 | |||
| 36 | pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | ||
| 37 | wb.setupImpl(allocator) catch |err| { | ||
| 38 | switch (err) { | ||
| 39 | error.OutOfMemory => |e| return e, | ||
| 40 | else => unreachable, | ||
| 41 | } | ||
| 42 | }; | ||
| 43 | } | ||
| 44 | |||
| 45 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | ||
| 46 | allocator.free(wordbreak.s1); | ||
| 47 | allocator.free(wordbreak.s2); | ||
| 48 | } | ||
| 49 | |||
| 50 | /// Represents a Unicode word span, as an offset into the source string | ||
| 51 | /// and the length of the word. | ||
| 52 | pub const Word = struct { | ||
| 53 | offset: u32, | ||
| 54 | len: u32, | ||
| 55 | |||
| 56 | /// Returns a slice of the word given the source string. | ||
| 57 | pub fn bytes(self: Word, src: []const u8) []const u8 { | ||
| 58 | return src[self.offset..][0..self.len]; | ||
| 59 | } | ||
| 60 | }; | ||
| 61 | |||
| 62 | /// Returns the word break property type for `cp`. | ||
| 63 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | ||
| 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | ||
| 65 | } | ||
| 66 | |||
| 67 | /// Convenience function for working with CodePoints | ||
| 68 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | ||
| 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | ||
| 70 | } | ||
| 71 | |||
| 72 | /// Returns the Word at the given index. Asserts that the index is less than | ||
| 73 | /// `string.len`, and that the string is not empty. Always returns a word. | ||
| 74 | /// The index does not have to be the start of a codepoint in the word. | ||
| 75 | pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | ||
| 76 | assert(index < string.len and string.len > 0); | ||
| 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); | ||
| 78 | const first_back = iter_back.prev(); | ||
| 79 | if (first_back) |back| { | ||
| 80 | if (back.offset == 0) { | ||
| 81 | var iter_fwd = wordbreak.iterator(string); | ||
| 82 | while (iter_fwd.next()) |word| { | ||
| 83 | if (word.offset <= index and index < word.offset + word.len) | ||
| 84 | return word; | ||
| 85 | } | ||
| 86 | } | ||
| 87 | } else { | ||
| 88 | var iter_fwd = wordbreak.iterator(string); | ||
| 89 | while (iter_fwd.next()) |word| { | ||
| 90 | if (word.offset <= index and index < word.offset + word.len) | ||
| 91 | return word; | ||
| 92 | } | ||
| 93 | } | ||
| 94 | const second_back = iter_back.prev(); | ||
| 95 | if (second_back) |back| if (back.offset == 0) { | ||
| 96 | var iter_fwd = wordbreak.iterator(string); | ||
| 97 | while (iter_fwd.next()) |word| { | ||
| 98 | if (word.offset <= index and index < word.offset + word.len) | ||
| 99 | return word; | ||
| 100 | } | ||
| 101 | }; | ||
| 102 | // There's sometimes flags: | ||
| 103 | if (iter_back.flags > 0) { | ||
| 104 | while (iter_back.flags > 0) { | ||
| 105 | if (iter_back.prev()) |_| { | ||
| 106 | continue; | ||
| 107 | } else { | ||
| 108 | break; | ||
| 109 | } | ||
| 110 | } | ||
| 111 | } | ||
| 112 | var iter_fwd = iter_back.forwardIterator(); | ||
| 113 | while (iter_fwd.next()) |word| { | ||
| 114 | if (word.offset <= index and index < word.offset + word.len) | ||
| 115 | return word; | ||
| 116 | } | ||
| 117 | unreachable; | ||
| 118 | } | ||
| 119 | |||
| 120 | /// Returns an iterator over words in `slice`. | ||
| 121 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | ||
| 122 | return Iterator.init(wordbreak, slice); | ||
| 123 | } | ||
| 124 | |||
| 125 | /// Returns a reverse iterator over the words in `slice`. | ||
| 126 | pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { | ||
| 127 | return ReverseIterator.init(wordbreak, slice); | ||
| 128 | } | ||
| 129 | |||
| 130 | /// An iterator, forward, over all words in a provided string. | ||
| 131 | pub const Iterator = struct { | ||
| 132 | this: ?CodePoint = null, | ||
| 133 | that: ?CodePoint = null, | ||
| 134 | cp_iter: CodepointIterator, | ||
| 135 | wb: *const WordBreak, | ||
| 136 | |||
| 137 | /// Assumes `str` is valid UTF-8. | ||
| 138 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | ||
| 139 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; | ||
| 140 | wb_iter.advance(); | ||
| 141 | return wb_iter; | ||
| 142 | } | ||
| 143 | |||
| 144 | /// Returns the next word segment, without advancing. | ||
| 145 | pub fn peek(iter: *Iterator) ?Word { | ||
| 146 | const cache = .{ iter.this, iter.that, iter.cp_iter }; | ||
| 147 | defer { | ||
| 148 | iter.this, iter.that, iter.cp_iter = cache; | ||
| 149 | } | ||
| 150 | return iter.next(); | ||
| 151 | } | ||
| 152 | |||
| 153 | /// Returns a reverse iterator from the point this iterator is paused | ||
| 154 | /// at. Usually, and always when using the API to create iterators, | ||
| 155 | /// calling `prev()` will return the word just seen. | ||
| 156 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { | ||
| 157 | var cp_it = iter.cp_iter.reverseIterator(); | ||
| 158 | if (iter.that) |_| | ||
| 159 | _ = cp_it.prev(); | ||
| 160 | if (iter.cp_iter.peek()) |_| | ||
| 161 | _ = cp_it.prev(); | ||
| 162 | return .{ | ||
| 163 | .wb = iter.wb, | ||
| 164 | .before = cp_it.prev(), | ||
| 165 | .after = iter.that, | ||
| 166 | .cp_iter = cp_it, | ||
| 167 | }; | ||
| 168 | } | ||
| 169 | |||
| 170 | /// Returns the next word segment, if any. | ||
| 171 | pub fn next(iter: *Iterator) ?Word { | ||
| 172 | iter.advance(); | ||
| 173 | |||
| 174 | // Done? | ||
| 175 | if (iter.this == null) return null; | ||
| 176 | // Last? | ||
| 177 | if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; | ||
| 178 | |||
| 179 | const word_start = iter.this.?.offset; | ||
| 180 | var word_len: u32 = 0; | ||
| 181 | |||
| 182 | // State variables. | ||
| 183 | var last_p: WordBreakProperty = .none; | ||
| 184 | var last_last_p: WordBreakProperty = .none; | ||
| 185 | var ri_count: usize = 0; | ||
| 186 | |||
| 187 | scan: while (true) : (iter.advance()) { | ||
| 188 | const this = iter.this.?; | ||
| 189 | word_len += this.len; | ||
| 190 | if (iter.that) |that| { | ||
| 191 | const this_p = iter.wb.breakProp(this); | ||
| 192 | const that_p = iter.wb.breakProp(that); | ||
| 193 | if (!isIgnorable(this_p)) { | ||
| 194 | last_last_p = last_p; | ||
| 195 | last_p = this_p; | ||
| 196 | } | ||
| 197 | // WB3 CR × LF | ||
| 198 | if (this_p == .CR and that_p == .LF) continue :scan; | ||
| 199 | // WB3a (Newline | CR | LF) ÷ | ||
| 200 | if (isNewline(this_p)) break :scan; | ||
| 201 | // WB3b ÷ (Newline | CR | LF) | ||
| 202 | if (isNewline(that_p)) break :scan; | ||
| 203 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 204 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { | ||
| 205 | continue :scan; | ||
| 206 | } | ||
| 207 | // WB3d WSegSpace × WSegSpace | ||
| 208 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | ||
| 209 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 210 | if (isIgnorable(that_p)) { | ||
| 211 | continue :scan; | ||
| 212 | } // Now we use last_p instead of this_p for ignorable's sake | ||
| 213 | if (isAHLetter(last_p)) { | ||
| 214 | // WB5 AHLetter × AHLetter | ||
| 215 | if (isAHLetter(that_p)) continue :scan; | ||
| 216 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 217 | if (isMidVal(that_p)) { | ||
| 218 | const next_val = iter.peekPast(); | ||
| 219 | if (next_val) |next_cp| { | ||
| 220 | const next_p = iter.wb.breakProp(next_cp); | ||
| 221 | if (isAHLetter(next_p)) { | ||
| 222 | continue :scan; | ||
| 223 | } | ||
| 224 | } | ||
| 225 | } | ||
| 226 | } | ||
| 227 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 228 | if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { | ||
| 229 | continue :scan; | ||
| 230 | } | ||
| 231 | if (last_p == .Hebrew_Letter) { | ||
| 232 | // WB7a Hebrew_Letter × Single_Quote | ||
| 233 | if (that_p == .Single_Quote) continue :scan; | ||
| 234 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 235 | if (that_p == .Double_Quote) { | ||
| 236 | const next_val = iter.peekPast(); | ||
| 237 | if (next_val) |next_cp| { | ||
| 238 | const next_p = iter.wb.breakProp(next_cp); | ||
| 239 | if (next_p == .Hebrew_Letter) { | ||
| 240 | continue :scan; | ||
| 241 | } | ||
| 242 | } | ||
| 243 | } | ||
| 244 | } | ||
| 245 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 246 | if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) | ||
| 247 | continue :scan; | ||
| 248 | // WB8 Numeric × Numeric | ||
| 249 | if (last_p == .Numeric and that_p == .Numeric) continue :scan; | ||
| 250 | // WB9 AHLetter × Numeric | ||
| 251 | if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; | ||
| 252 | // WB10 Numeric × AHLetter | ||
| 253 | if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; | ||
| 254 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 255 | if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) | ||
| 256 | continue :scan; | ||
| 257 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 258 | if (last_p == .Numeric and isMidNum(that_p)) { | ||
| 259 | const next_val = iter.peekPast(); | ||
| 260 | if (next_val) |next_cp| { | ||
| 261 | const next_p = iter.wb.breakProp(next_cp); | ||
| 262 | if (next_p == .Numeric) { | ||
| 263 | continue :scan; | ||
| 264 | } | ||
| 265 | } | ||
| 266 | } | ||
| 267 | // WB13 Katakana × Katakana | ||
| 268 | if (last_p == .Katakana and that_p == .Katakana) continue :scan; | ||
| 269 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 270 | if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; | ||
| 271 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 272 | if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | ||
| 273 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | ||
| 274 | const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; | ||
| 275 | if (maybe_flag) { | ||
| 276 | ri_count += 1; | ||
| 277 | if (ri_count % 2 == 1) continue :scan; | ||
| 278 | } | ||
| 279 | // WB999 Any ÷ Any | ||
| 280 | break :scan; | ||
| 281 | } else { // iter.that == null | ||
| 282 | break :scan; | ||
| 283 | } | ||
| 284 | } | ||
| 285 | |||
| 286 | return Word{ .len = word_len, .offset = word_start }; | ||
| 287 | } | ||
| 288 | |||
| 289 | pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 290 | try writer.print( | ||
| 291 | "Iterator {{ .this = {any}, .that = {any} }}", | ||
| 292 | .{ iter.this, iter.that }, | ||
| 293 | ); | ||
| 294 | } | ||
| 295 | |||
| 296 | fn advance(iter: *Iterator) void { | ||
| 297 | iter.this = iter.that; | ||
| 298 | iter.that = iter.cp_iter.next(); | ||
| 299 | } | ||
| 300 | |||
| 301 | fn peekPast(iter: *Iterator) ?CodePoint { | ||
| 302 | const save_cp = iter.cp_iter; | ||
| 303 | defer iter.cp_iter = save_cp; | ||
| 304 | while (iter.cp_iter.peek()) |peeked| { | ||
| 305 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 306 | _ = iter.cp_iter.next(); | ||
| 307 | } | ||
| 308 | return null; | ||
| 309 | } | ||
| 310 | }; | ||
| 311 | |||
| 312 | /// An iterator, backward, over all words in a provided string. | ||
| 313 | pub const ReverseIterator = struct { | ||
| 314 | after: ?CodePoint = null, | ||
| 315 | before: ?CodePoint = null, | ||
| 316 | cp_iter: ReverseCodepointIterator, | ||
| 317 | wb: *const WordBreak, | ||
| 318 | flags: usize = 0, | ||
| 319 | |||
| 320 | /// Assumes `str` is valid UTF-8. | ||
| 321 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { | ||
| 322 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; | ||
| 323 | wb_iter.advance(); | ||
| 324 | return wb_iter; | ||
| 325 | } | ||
| 326 | |||
| 327 | /// Returns the previous word segment, if any, without advancing. | ||
| 328 | pub fn peek(iter: *ReverseIterator) ?Word { | ||
| 329 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; | ||
| 330 | defer { | ||
| 331 | iter.before, iter.after, iter.cp_iter, iter.flags = cache; | ||
| 332 | } | ||
| 333 | return iter.prev(); | ||
| 334 | } | ||
| 335 | |||
| 336 | /// Return a forward iterator from where this iterator paused. Usually, | ||
| 337 | /// and always when using the API to create iterators, calling `next()` | ||
| 338 | /// will return the word just seen. | ||
| 339 | pub fn forwardIterator(iter: *ReverseIterator) Iterator { | ||
| 340 | var cp_it = iter.cp_iter.forwardIterator(); | ||
| 341 | if (iter.before) |_| | ||
| 342 | _ = cp_it.next(); | ||
| 343 | return .{ | ||
| 344 | .wb = iter.wb, | ||
| 345 | .this = cp_it.next(), | ||
| 346 | .that = iter.after, | ||
| 347 | .cp_iter = cp_it, | ||
| 348 | }; | ||
| 349 | } | ||
| 350 | |||
| 351 | /// Return the previous word, if any. | ||
| 352 | pub fn prev(iter: *ReverseIterator) ?Word { | ||
| 353 | iter.advance(); | ||
| 354 | |||
| 355 | // Done? | ||
| 356 | if (iter.after == null) return null; | ||
| 357 | // Last? | ||
| 358 | if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; | ||
| 359 | |||
| 360 | const word_end = iter.after.?.offset + iter.after.?.len; | ||
| 361 | var word_len: u32 = 0; | ||
| 362 | |||
| 363 | // State variables. | ||
| 364 | var last_p: WordBreakProperty = .none; | ||
| 365 | var last_last_p: WordBreakProperty = .none; | ||
| 366 | |||
| 367 | scan: while (true) : (iter.advance()) { | ||
| 368 | const after = iter.after.?; | ||
| 369 | word_len += after.len; | ||
| 370 | if (iter.before) |before| { | ||
| 371 | var sneak = sneaky(iter); // 'sneaks' past ignorables | ||
| 372 | const after_p = iter.wb.breakProp(after); | ||
| 373 | var before_p = iter.wb.breakProp(before); | ||
| 374 | if (!isIgnorable(after_p)) { | ||
| 375 | last_last_p = last_p; | ||
| 376 | last_p = after_p; | ||
| 377 | } | ||
| 378 | // WB3 CR × LF | ||
| 379 | if (before_p == .CR and after_p == .LF) continue :scan; | ||
| 380 | // WB3a (Newline | CR | LF) ÷ | ||
| 381 | if (isNewline(before_p)) break :scan; | ||
| 382 | // WB3b ÷ (Newline | CR | LF) | ||
| 383 | if (isNewline(after_p)) break :scan; | ||
| 384 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 385 | if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { | ||
| 386 | continue :scan; | ||
| 387 | } | ||
| 388 | // WB3d WSegSpace × WSegSpace | ||
| 389 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; | ||
| 390 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 391 | if (isIgnorable(before_p)) { | ||
| 392 | const maybe_before = sneak.prev(); | ||
| 393 | if (maybe_before) |valid_before| { | ||
| 394 | before_p = iter.wb.breakProp(valid_before); | ||
| 395 | } else if (!isIgnorable(after_p)) { | ||
| 396 | // We're done | ||
| 397 | break :scan; | ||
| 398 | } | ||
| 399 | } | ||
| 400 | if (isIgnorable(after_p)) continue :scan; | ||
| 401 | // WB5 AHLetter × AHLetter | ||
| 402 | if (isAHLetter(last_p) and isAHLetter(before_p)) { | ||
| 403 | continue :scan; | ||
| 404 | } | ||
| 405 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 406 | if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { | ||
| 407 | continue :scan; | ||
| 408 | } | ||
| 409 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 410 | if (isMidVal(before_p) and isAHLetter(last_p)) { | ||
| 411 | const prev_val = sneak.peek(); | ||
| 412 | if (prev_val) |prev_cp| { | ||
| 413 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 414 | if (isAHLetter(prev_p)) { | ||
| 415 | continue :scan; | ||
| 416 | } | ||
| 417 | } | ||
| 418 | } | ||
| 419 | // WB7a Hebrew_Letter × Single_Quote | ||
| 420 | if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; | ||
| 421 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 422 | if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { | ||
| 423 | continue :scan; | ||
| 424 | } | ||
| 425 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 426 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { | ||
| 427 | const prev_val = sneak.peek(); | ||
| 428 | if (prev_val) |prev_cp| { | ||
| 429 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 430 | if (prev_p == .Hebrew_Letter) { | ||
| 431 | continue :scan; | ||
| 432 | } | ||
| 433 | } | ||
| 434 | } | ||
| 435 | // WB8 Numeric × Numeric | ||
| 436 | if (before_p == .Numeric and last_p == .Numeric) continue :scan; | ||
| 437 | // WB9 AHLetter × Numeric | ||
| 438 | if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; | ||
| 439 | // WB10 Numeric × AHLetter | ||
| 440 | if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; | ||
| 441 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 442 | if (isMidNum(before_p) and last_p == .Numeric) { | ||
| 443 | const prev_val = sneak.peek(); | ||
| 444 | if (prev_val) |prev_cp| { | ||
| 445 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 446 | if (prev_p == .Numeric) { | ||
| 447 | continue :scan; | ||
| 448 | } | ||
| 449 | } | ||
| 450 | } | ||
| 451 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 452 | if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { | ||
| 453 | continue :scan; | ||
| 454 | } | ||
| 455 | // WB13 Katakana × Katakana | ||
| 456 | if (before_p == .Katakana and last_p == .Katakana) continue :scan; | ||
| 457 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 458 | if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; | ||
| 459 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 460 | if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; | ||
| 461 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | ||
| 462 | // NOTE: | ||
| 463 | // So here we simply have to know whether a run of flags is even or odd. | ||
| 464 | // The whole run. To avoid quadratic behavior (and long flag runs are | ||
| 465 | // actually a thing in the wild), we have to count them once, store that | ||
| 466 | // on the iterator, and decrement each time we see two, possibly breaking | ||
| 467 | // once extra at the beginning. They break up one per flag, once we hit | ||
| 468 | // zero, that's all the flags. If we see another flag we do it again. | ||
| 469 | if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { | ||
| 470 | defer { | ||
| 471 | if (iter.flags > 0) iter.flags -= 1; | ||
| 472 | } | ||
| 473 | if (iter.flags == 0) { | ||
| 474 | iter.flags = sneak.countFlags(); | ||
| 475 | } | ||
| 476 | if (iter.flags % 2 == 0) { | ||
| 477 | continue :scan; | ||
| 478 | } | ||
| 479 | } | ||
| 480 | // WB999 Any ÷ Any | ||
| 481 | break :scan; | ||
| 482 | } | ||
| 483 | break :scan; | ||
| 484 | } | ||
| 485 | return Word{ .len = word_len, .offset = word_end - word_len }; | ||
| 486 | } | ||
| 487 | |||
| 488 | pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 489 | try writer.print( | ||
| 490 | "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", | ||
| 491 | .{ iter.before, iter.after, iter.flags }, | ||
| 492 | ); | ||
| 493 | } | ||
| 494 | |||
| 495 | fn peekPast(iter: *ReverseIterator) ?CodePoint { | ||
| 496 | const save_cp = iter.cp_iter; | ||
| 497 | defer iter.cp_iter = save_cp; | ||
| 498 | while (iter.cp_iter.peek()) |peeked| { | ||
| 499 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 500 | _ = iter.cp_iter.prev(); | ||
| 501 | } | ||
| 502 | return null; | ||
| 503 | } | ||
| 504 | |||
| 505 | fn advance(iter: *ReverseIterator) void { | ||
| 506 | iter.after = iter.before; | ||
| 507 | iter.before = iter.cp_iter.prev(); | ||
| 508 | } | ||
| 509 | }; | ||
| 510 | |||
| 511 | //| Implementation Details | ||
| 512 | |||
| 513 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | ||
| 514 | fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { | ||
| 515 | var idx: u32 = @intCast(index); | ||
| 516 | // Find the next lead byte: | ||
| 517 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | ||
| 518 | if (idx == string.len) return wb.reverseIterator(string); | ||
| 519 | var iter: ReverseIterator = undefined; | ||
| 520 | iter.wb = wb; | ||
| 521 | iter.flags = 0; | ||
| 522 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 523 | // Consider "abc| def" with the cursor as |. | ||
| 524 | // We need `before` to be `c` and `after` to be ' ', | ||
| 525 | // and `cp_iter.prev()` to be `b`. | ||
| 526 | var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 527 | iter.after = cp_iter.prev(); | ||
| 528 | iter.before = cp_iter.prev(); | ||
| 529 | iter.cp_iter = cp_iter; | ||
| 530 | return iter; | ||
| 531 | } | ||
| 532 | |||
| 533 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | ||
| 534 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | ||
| 535 | } | ||
| 536 | |||
| 537 | const SneakIterator = struct { | ||
| 538 | cp_iter: ReverseCodepointIterator, | ||
| 539 | wb: *const WordBreak, | ||
| 540 | |||
| 541 | fn peek(iter: *SneakIterator) ?CodePoint { | ||
| 542 | const save_cp = iter.cp_iter; | ||
| 543 | defer iter.cp_iter = save_cp; | ||
| 544 | while (iter.cp_iter.peek()) |peeked| { | ||
| 545 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 546 | _ = iter.cp_iter.prev(); | ||
| 547 | } | ||
| 548 | return null; | ||
| 549 | } | ||
| 550 | |||
| 551 | fn countFlags(iter: *SneakIterator) usize { | ||
| 552 | var flags: usize = 0; | ||
| 553 | const save_cp = iter.cp_iter; | ||
| 554 | defer iter.cp_iter = save_cp; | ||
| 555 | while (iter.cp_iter.prev()) |cp| { | ||
| 556 | const prop = iter.wb.breakProp(cp); | ||
| 557 | if (isIgnorable(prop)) continue; | ||
| 558 | if (prop == .Regional_Indicator) { | ||
| 559 | flags += 1; | ||
| 560 | } else break; | ||
| 561 | } | ||
| 562 | return flags; | ||
| 563 | } | ||
| 564 | |||
| 565 | fn prev(iter: *SneakIterator) ?CodePoint { | ||
| 566 | while (iter.cp_iter.prev()) |peeked| { | ||
| 567 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 568 | } | ||
| 569 | return null; | ||
| 570 | } | ||
| 571 | }; | ||
| 572 | |||
| 573 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | ||
| 574 | const decompressor = compress.flate.inflate.decompressor; | ||
| 575 | const in_bytes = @embedFile("wbp"); | ||
| 576 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 577 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 578 | var reader = in_decomp.reader(); | ||
| 579 | |||
| 580 | const endian = builtin.cpu.arch.endian(); | ||
| 581 | |||
| 582 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 583 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 584 | errdefer allocator.free(wb.s1); | ||
| 585 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 586 | |||
| 587 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 588 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 589 | errdefer allocator.free(wb.s2); | ||
| 590 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 591 | var count_0: usize = 0; | ||
| 592 | for (wb.s2) |nyb| { | ||
| 593 | if (nyb == 0) count_0 += 1; | ||
| 594 | } | ||
| 595 | } | ||
| 596 | |||
| 597 | //| Predicates | ||
| 598 | |||
| 599 | inline fn isNewline(wbp: WordBreakProperty) bool { | ||
| 600 | return wbp == .CR or wbp == .LF or wbp == .Newline; | ||
| 601 | } | ||
| 602 | |||
| 603 | inline fn isIgnorable(wbp: WordBreakProperty) bool { | ||
| 604 | return switch (wbp) { | ||
| 605 | .Format, .Extend, .ZWJ => true, | ||
| 606 | else => false, | ||
| 607 | }; | ||
| 608 | } | ||
| 609 | |||
| 610 | inline fn isAHLetter(wbp: WordBreakProperty) bool { | ||
| 611 | return wbp == .ALetter or wbp == .Hebrew_Letter; | ||
| 612 | } | ||
| 613 | |||
| 614 | inline fn isMidVal(wbp: WordBreakProperty) bool { | ||
| 615 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 616 | } | ||
| 617 | |||
| 618 | inline fn isMidNum(wbp: WordBreakProperty) bool { | ||
| 619 | return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 620 | } | ||
| 621 | |||
| 622 | inline fn isExtensible(wbp: WordBreakProperty) bool { | ||
| 623 | return switch (wbp) { | ||
| 624 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | ||
| 625 | else => false, | ||
| 626 | }; | ||
| 627 | } | ||
| 628 | |||
| 629 | test "Word Break Properties" { | ||
| 630 | const wb = try WordBreak.init(testing.allocator); | ||
| 631 | defer wb.deinit(testing.allocator); | ||
| 632 | try testing.expectEqual(.CR, wb.breakProperty('\r')); | ||
| 633 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | ||
| 634 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); | ||
| 635 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | ||
| 636 | } | ||
| 637 | |||
| 638 | test "ext_pict" { | ||
| 639 | try testing.expect(ext_pict.isMatch("👇")); | ||
| 640 | try testing.expect(ext_pict.isMatch("\u{2701}")); | ||
| 641 | } | ||
| 642 | |||
| 643 | test wordAtIndex { | ||
| 644 | const wb = try WordBreak.init(testing.allocator); | ||
| 645 | defer wb.deinit(testing.allocator); | ||
| 646 | const t_string = "first second third"; | ||
| 647 | const second = wb.wordAtIndex(t_string, 8); | ||
| 648 | try testing.expectEqualStrings("second", second.bytes(t_string)); | ||
| 649 | const third = wb.wordAtIndex(t_string, 14); | ||
| 650 | try testing.expectEqualStrings("third", third.bytes(t_string)); | ||
| 651 | { | ||
| 652 | const first = wb.wordAtIndex(t_string, 3); | ||
| 653 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 654 | } | ||
| 655 | { | ||
| 656 | const first = wb.wordAtIndex(t_string, 0); | ||
| 657 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 658 | } | ||
| 659 | const last = wb.wordAtIndex(t_string, 14); | ||
| 660 | try testing.expectEqualStrings("third", last.bytes(t_string)); | ||
| 661 | } | ||
| 662 | |||
| 663 | const testr = "don't a:ka fin!"; | ||
| 664 | |||
| 665 | test "reversal" { | ||
| 666 | const wb = try WordBreak.init(testing.allocator); | ||
| 667 | defer wb.deinit(testing.allocator); | ||
| 668 | { | ||
| 669 | var fwd = wb.iterator(testr); | ||
| 670 | var this_word: ?Word = fwd.next(); | ||
| 671 | |||
| 672 | while (this_word) |this| : (this_word = fwd.next()) { | ||
| 673 | var back = fwd.reverseIterator(); | ||
| 674 | const that_word = back.prev(); | ||
| 675 | if (that_word) |that| { | ||
| 676 | try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); | ||
| 677 | } else { | ||
| 678 | try testing.expect(false); | ||
| 679 | } | ||
| 680 | } | ||
| 681 | } | ||
| 682 | { | ||
| 683 | var back = wb.reverseIterator(testr); | ||
| 684 | var this_word: ?Word = back.prev(); | ||
| 685 | |||
| 686 | while (this_word) |this| : (this_word = back.prev()) { | ||
| 687 | var fwd = back.forwardIterator(); | ||
| 688 | const that_word = fwd.next(); | ||
| 689 | if (that_word) |that| { | ||
| 690 | try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); | ||
| 691 | } else { | ||
| 692 | try testing.expect(false); | ||
| 693 | } | ||
| 694 | } | ||
| 695 | } | ||
| 696 | } | ||
| 697 | |||
| 698 | fn testAllocations(allocator: Allocator) !void { | ||
| 699 | const wb = try WordBreak.init(allocator); | ||
| 700 | wb.deinit(allocator); | ||
| 701 | } | ||
| 702 | |||
| 703 | test "allocation safety" { | ||
| 704 | try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); | ||
| 705 | } | ||
| 706 | |||
| 707 | const std = @import("std"); | ||
| 708 | const builtin = @import("builtin"); | ||
| 709 | const compress = std.compress; | ||
| 710 | const mem = std.mem; | ||
| 711 | const Allocator = mem.Allocator; | ||
| 712 | const assert = std.debug.assert; | ||
| 713 | const testing = std.testing; | ||
| 714 | |||
| 715 | const code_point = @import("code_point"); | ||
| 716 | const CodepointIterator = code_point.Iterator; | ||
| 717 | const ReverseCodepointIterator = code_point.ReverseIterator; | ||
| 718 | const CodePoint = code_point.CodePoint; | ||
| 719 | |||
| 720 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; | ||