summaryrefslogtreecommitdiff
path: root/src/Words.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/Words.zig')
-rw-r--r--src/Words.zig773
1 files changed, 773 insertions, 0 deletions
diff --git a/src/Words.zig b/src/Words.zig
new file mode 100644
index 0000000..617c34d
--- /dev/null
+++ b/src/Words.zig
@@ -0,0 +1,773 @@
1//! Word Breaking Algorithm.
2//!
3//! https://www.unicode.org/reports/tr29/#Word_Boundaries
4//!
5
6const WordBreakProperty = enum(u5) {
7 none,
8 Double_Quote,
9 Single_Quote,
10 Hebrew_Letter,
11 CR,
12 LF,
13 Newline,
14 Extend,
15 Regional_Indicator,
16 Format,
17 Katakana,
18 ALetter,
19 MidLetter,
20 MidNum,
21 MidNumLet,
22 Numeric,
23 ExtendNumLet,
24 ZWJ,
25 WSegSpace,
26};
27
28s1: []u16 = undefined,
29s2: []u5 = undefined,
30
31const Words = @This();
32
33pub fn init(allocator: Allocator) Allocator.Error!Words {
34 var wb: Words = undefined;
35 try wb.setup(allocator);
36 return wb;
37}
38
39pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
40 wb.setupImpl(allocator) catch |err| {
41 switch (err) {
42 error.OutOfMemory => |e| return e,
43 else => unreachable,
44 }
45 };
46}
47
48pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
49 allocator.free(words.s1);
50 allocator.free(words.s2);
51}
52
53/// Represents a Unicode word span, as an offset into the source string
54/// and the length of the word.
55pub const Word = struct {
56 offset: uoffset,
57 len: uoffset,
58
59 /// Returns a slice of the word given the source string.
60 pub fn bytes(word: Word, src: []const u8) []const u8 {
61 return src[word.offset..][0..word.len];
62 }
63};
64
65/// Returns the word break property type for `cp`.
66pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty {
67 return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]);
68}
69
70/// Convenience function for working with CodePoints
71fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty {
72 return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]);
73}
74
75/// Returns the Word at the given index. Asserts that the index is less than
76/// `string.len`, and that the string is not empty. Always returns a word.
77/// The index does not have to be the start of a codepoint in the word.
78pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
79 assert(index < string.len and string.len > 0);
80 var iter_back: ReverseIterator = reverseFromIndex(words, string, index);
81 const first_back = iter_back.prev();
82 if (first_back) |back| {
83 if (back.offset == 0) {
84 var iter_fwd = words.iterator(string);
85 while (iter_fwd.next()) |word| {
86 if (word.offset <= index and index < word.offset + word.len)
87 return word;
88 }
89 }
90 } else {
91 var iter_fwd = words.iterator(string);
92 while (iter_fwd.next()) |word| {
93 if (word.offset <= index and index < word.offset + word.len)
94 return word;
95 }
96 }
97 _ = iter_back.prev();
98 // There's sometimes flags:
99 if (iter_back.flags > 0) {
100 while (iter_back.flags > 0) {
101 if (iter_back.prev()) |_| {
102 continue;
103 } else {
104 break;
105 }
106 }
107 }
108 var iter_fwd = iter_back.forwardIterator();
109 while (iter_fwd.next()) |word| {
110 if (word.offset <= index and index < word.offset + word.len)
111 return word;
112 }
113 unreachable;
114}
115
116/// Returns an iterator over words in `slice`.
117pub fn iterator(words: *const Words, slice: []const u8) Iterator {
118 return Iterator.init(words, slice);
119}
120
121/// Returns a reverse iterator over the words in `slice`.
122pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator {
123 return ReverseIterator.init(words, slice);
124}
125
126/// Returns an iterator after the `word` in `slice`.
127pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator {
128 return forwardFromIndex(words, slice, word.offset + word.len);
129}
130
131/// Returns a reverse iterator before the `word` in `slice`.
132pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator {
133 return reverseFromIndex(words, slice, word.offset);
134}
135
136/// An iterator, forward, over all words in a provided string.
137pub const Iterator = struct {
138 this: ?CodePoint = null,
139 that: ?CodePoint = null,
140 cp_iter: CodepointIterator,
141 wb: *const Words,
142
143 /// Assumes `str` is valid UTF-8.
144 pub fn init(words: *const Words, str: []const u8) Iterator {
145 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words };
146 wb_iter.advance();
147 return wb_iter;
148 }
149
150 /// Returns the next word segment, without advancing.
151 pub fn peek(iter: *Iterator) ?Word {
152 const cache = .{ iter.this, iter.that, iter.cp_iter };
153 defer {
154 iter.this, iter.that, iter.cp_iter = cache;
155 }
156 return iter.next();
157 }
158
159 /// Returns a reverse iterator from the point this iterator is paused
160 /// at. Usually, and always when using the API to create iterators,
161 /// calling `prev()` will return the word just seen.
162 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
163 var cp_it = iter.cp_iter.reverseIterator();
164 if (iter.that) |_|
165 _ = cp_it.prev();
166 if (iter.cp_iter.peek()) |_|
167 _ = cp_it.prev();
168 return .{
169 .wb = iter.wb,
170 .before = cp_it.prev(),
171 .after = iter.that,
172 .cp_iter = cp_it,
173 };
174 }
175
176 /// Returns the next word segment, if any.
177 pub fn next(iter: *Iterator) ?Word {
178 iter.advance();
179
180 // Done?
181 if (iter.this == null) return null;
182 // Last?
183 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
184
185 const word_start = iter.this.?.offset;
186 var word_len: uoffset = 0;
187
188 // State variables.
189 var last_p: WordBreakProperty = .none;
190 var last_last_p: WordBreakProperty = .none;
191 var ri_count: usize = 0;
192
193 scan: while (true) : (iter.advance()) {
194 const this = iter.this.?;
195 word_len += this.len;
196 if (iter.that) |that| {
197 const this_p = iter.wb.breakProp(this);
198 const that_p = iter.wb.breakProp(that);
199 if (!isIgnorable(this_p)) {
200 last_last_p = last_p;
201 last_p = this_p;
202 }
203 // WB3 CR × LF
204 if (this_p == .CR and that_p == .LF) continue :scan;
205 // WB3a (Newline | CR | LF) ÷
206 if (isNewline(this_p)) break :scan;
207 // WB3b ÷ (Newline | CR | LF)
208 if (isNewline(that_p)) break :scan;
209 // WB3c ZWJ × \p{Extended_Pictographic}
210 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
211 continue :scan;
212 }
213 // WB3d WSegSpace × WSegSpace
214 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
215 // WB4 X (Extend | Format | ZWJ)* → X
216 if (isIgnorable(that_p)) {
217 continue :scan;
218 } // Now we use last_p instead of this_p for ignorable's sake
219 if (isAHLetter(last_p)) {
220 // WB5 AHLetter × AHLetter
221 if (isAHLetter(that_p)) continue :scan;
222 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
223 if (isMidVal(that_p)) {
224 const next_val = iter.peekPast();
225 if (next_val) |next_cp| {
226 const next_p = iter.wb.breakProp(next_cp);
227 if (isAHLetter(next_p)) {
228 continue :scan;
229 }
230 }
231 }
232 }
233 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
234 if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
235 continue :scan;
236 }
237 if (last_p == .Hebrew_Letter) {
238 // WB7a Hebrew_Letter × Single_Quote
239 if (that_p == .Single_Quote) continue :scan;
240 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
241 if (that_p == .Double_Quote) {
242 const next_val = iter.peekPast();
243 if (next_val) |next_cp| {
244 const next_p = iter.wb.breakProp(next_cp);
245 if (next_p == .Hebrew_Letter) {
246 continue :scan;
247 }
248 }
249 }
250 }
251 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
252 if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
253 continue :scan;
254 // WB8 Numeric × Numeric
255 if (last_p == .Numeric and that_p == .Numeric) continue :scan;
256 // WB9 AHLetter × Numeric
257 if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
258 // WB10 Numeric × AHLetter
259 if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
260 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
261 if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
262 continue :scan;
263 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
264 if (last_p == .Numeric and isMidNum(that_p)) {
265 const next_val = iter.peekPast();
266 if (next_val) |next_cp| {
267 const next_p = iter.wb.breakProp(next_cp);
268 if (next_p == .Numeric) {
269 continue :scan;
270 }
271 }
272 }
273 // WB13 Katakana × Katakana
274 if (last_p == .Katakana and that_p == .Katakana) continue :scan;
275 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
276 if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
277 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
278 if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
279 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
280 const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
281 if (maybe_flag) {
282 ri_count += 1;
283 if (ri_count % 2 == 1) continue :scan;
284 }
285 // WB999 Any ÷ Any
286 break :scan;
287 } else { // iter.that == null
288 break :scan;
289 }
290 }
291
292 return Word{ .len = word_len, .offset = word_start };
293 }
294
295 pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
296 try writer.print(
297 "Iterator {{ .this = {any}, .that = {any} }}",
298 .{ iter.this, iter.that },
299 );
300 }
301
302 fn advance(iter: *Iterator) void {
303 iter.this = iter.that;
304 iter.that = iter.cp_iter.next();
305 }
306
307 fn peekPast(iter: *Iterator) ?CodePoint {
308 const save_cp = iter.cp_iter;
309 defer iter.cp_iter = save_cp;
310 while (iter.cp_iter.peek()) |peeked| {
311 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
312 _ = iter.cp_iter.next();
313 }
314 return null;
315 }
316};
317
318/// An iterator, backward, over all words in a provided string.
319pub const ReverseIterator = struct {
320 after: ?CodePoint = null,
321 before: ?CodePoint = null,
322 cp_iter: ReverseCodepointIterator,
323 wb: *const Words,
324 flags: usize = 0,
325
326 /// Assumes `str` is valid UTF-8.
327 pub fn init(words: *const Words, str: []const u8) ReverseIterator {
328 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words };
329 wb_iter.advance();
330 return wb_iter;
331 }
332
333 /// Returns the previous word segment, if any, without advancing.
334 pub fn peek(iter: *ReverseIterator) ?Word {
335 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
336 defer {
337 iter.before, iter.after, iter.cp_iter, iter.flags = cache;
338 }
339 return iter.prev();
340 }
341
342 /// Return a forward iterator from where this iterator paused. Usually,
343 /// and always when using the API to create iterators, calling `next()`
344 /// will return the word just seen.
345 pub fn forwardIterator(iter: *ReverseIterator) Iterator {
346 var cp_it = iter.cp_iter.forwardIterator();
347 if (iter.before) |_|
348 _ = cp_it.next();
349 return .{
350 .wb = iter.wb,
351 .this = cp_it.next(),
352 .that = iter.after,
353 .cp_iter = cp_it,
354 };
355 }
356
357 /// Return the previous word, if any.
358 pub fn prev(iter: *ReverseIterator) ?Word {
359 iter.advance();
360
361 // Done?
362 if (iter.after == null) return null;
363 // Last?
364 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
365
366 const word_end = iter.after.?.offset + iter.after.?.len;
367 var word_len: uoffset = 0;
368
369 // State variables.
370 var last_p: WordBreakProperty = .none;
371 var last_last_p: WordBreakProperty = .none;
372
373 scan: while (true) : (iter.advance()) {
374 const after = iter.after.?;
375 word_len += after.len;
376 if (iter.before) |before| {
377 var sneak = sneaky(iter); // 'sneaks' past ignorables
378 const after_p = iter.wb.breakProp(after);
379 var before_p = iter.wb.breakProp(before);
380 if (!isIgnorable(after_p)) {
381 last_last_p = last_p;
382 last_p = after_p;
383 }
384 // WB3 CR × LF
385 if (before_p == .CR and after_p == .LF) continue :scan;
386 // WB3a (Newline | CR | LF) ÷
387 if (isNewline(before_p)) break :scan;
388 // WB3b ÷ (Newline | CR | LF)
389 if (isNewline(after_p)) break :scan;
390 // WB3c ZWJ × \p{Extended_Pictographic}
391 if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
392 continue :scan;
393 }
394 // WB3d WSegSpace × WSegSpace
395 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
396 // WB4 X (Extend | Format | ZWJ)* → X
397 if (isIgnorable(before_p)) {
398 const maybe_before = sneak.prev();
399 if (maybe_before) |valid_before| {
400 before_p = iter.wb.breakProp(valid_before);
401 } else if (!isIgnorable(after_p)) {
402 // We're done
403 break :scan;
404 }
405 }
406 if (isIgnorable(after_p)) continue :scan;
407 // WB5 AHLetter × AHLetter
408 if (isAHLetter(last_p) and isAHLetter(before_p)) {
409 continue :scan;
410 }
411 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
412 if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
413 continue :scan;
414 }
415 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
416 if (isMidVal(before_p) and isAHLetter(last_p)) {
417 const prev_val = sneak.peek();
418 if (prev_val) |prev_cp| {
419 const prev_p = iter.wb.breakProp(prev_cp);
420 if (isAHLetter(prev_p)) {
421 continue :scan;
422 }
423 }
424 }
425 // WB7a Hebrew_Letter × Single_Quote
426 if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
427 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
428 if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
429 continue :scan;
430 }
431 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
432 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
433 const prev_val = sneak.peek();
434 if (prev_val) |prev_cp| {
435 const prev_p = iter.wb.breakProp(prev_cp);
436 if (prev_p == .Hebrew_Letter) {
437 continue :scan;
438 }
439 }
440 }
441 // WB8 Numeric × Numeric
442 if (before_p == .Numeric and last_p == .Numeric) continue :scan;
443 // WB9 AHLetter × Numeric
444 if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
445 // WB10 Numeric × AHLetter
446 if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
447 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
448 if (isMidNum(before_p) and last_p == .Numeric) {
449 const prev_val = sneak.peek();
450 if (prev_val) |prev_cp| {
451 const prev_p = iter.wb.breakProp(prev_cp);
452 if (prev_p == .Numeric) {
453 continue :scan;
454 }
455 }
456 }
457 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
458 if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
459 continue :scan;
460 }
461 // WB13 Katakana × Katakana
462 if (before_p == .Katakana and last_p == .Katakana) continue :scan;
463 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
464 if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
465 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
466 if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
467 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
468 // NOTE:
469 // So here we simply have to know whether a run of flags is even or odd.
470 // The whole run. To avoid quadratic behavior (and long flag runs are
471 // actually a thing in the wild), we have to count them once, store that
472 // on the iterator, and decrement each time we see two, possibly breaking
473 // once extra at the beginning. They break up one per flag, once we hit
474 // zero, that's all the flags. If we see another flag we do it again.
475 if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
476 defer {
477 if (iter.flags > 0) iter.flags -= 1;
478 }
479 if (iter.flags == 0) {
480 iter.flags = sneak.countFlags();
481 }
482 if (iter.flags % 2 == 0) {
483 continue :scan;
484 }
485 }
486 // WB999 Any ÷ Any
487 break :scan;
488 }
489 break :scan;
490 }
491 return Word{ .len = word_len, .offset = word_end - word_len };
492 }
493
494 pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
495 try writer.print(
496 "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
497 .{ iter.before, iter.after, iter.flags },
498 );
499 }
500
501 fn peekPast(iter: *ReverseIterator) ?CodePoint {
502 const save_cp = iter.cp_iter;
503 defer iter.cp_iter = save_cp;
504 while (iter.cp_iter.peek()) |peeked| {
505 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
506 _ = iter.cp_iter.prev();
507 }
508 return null;
509 }
510
511 fn advance(iter: *ReverseIterator) void {
512 iter.after = iter.before;
513 iter.before = iter.cp_iter.prev();
514 }
515};
516
517//| Implementation Details
518
519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
521 var idx: uoffset = @intCast(index);
522 // Find the next lead byte:
523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
524 if (idx == string.len) return words.reverseIterator(string);
525 var iter: ReverseIterator = undefined;
526 iter.wb = words;
527 iter.flags = 0;
528 // We need to populate the CodePoints, and the codepoint iterator.
529 // Consider "abc| def" with the cursor as |.
530 // We need `before` to be `c` and `after` to be ' ',
531 // and `cp_iter.prev()` to be `b`.
532 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
533 iter.after = cp_iter.prev();
534 iter.before = cp_iter.prev();
535 iter.cp_iter = cp_iter;
536 return iter;
537}
538
539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
540 var idx: uoffset = @intCast(index);
541 if (idx == string.len) {
542 return .{
543 .cp_iter = .{ .bytes = string, .i = idx },
544 .this = null,
545 .that = null,
546 .wb = words,
547 };
548 }
549 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
550 if (idx == 0) return words.iterator(string);
551 var iter: Iterator = undefined;
552 iter.wb = words;
553 // We need to populate the CodePoints, and the codepoint iterator.
554 // Consider "abc |def" with the cursor as |.
555 // We need `this` to be ` ` and `that` to be 'd',
556 // and `cp_iter.next()` to be `d`.
557 idx -= 1;
558 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
559 // "abc| def"
560 var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx };
561 iter.this = cp_iter.next();
562 iter.that = cp_iter.next();
563 iter.cp_iter = cp_iter;
564 return iter;
565}
566
567fn sneaky(iter: *const ReverseIterator) SneakIterator {
568 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
569}
570
571const SneakIterator = struct {
572 cp_iter: ReverseCodepointIterator,
573 wb: *const Words,
574
575 fn peek(iter: *SneakIterator) ?CodePoint {
576 const save_cp = iter.cp_iter;
577 defer iter.cp_iter = save_cp;
578 while (iter.cp_iter.peek()) |peeked| {
579 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
580 _ = iter.cp_iter.prev();
581 }
582 return null;
583 }
584
585 fn countFlags(iter: *SneakIterator) usize {
586 var flags: usize = 0;
587 const save_cp = iter.cp_iter;
588 defer iter.cp_iter = save_cp;
589 while (iter.cp_iter.prev()) |cp| {
590 const prop = iter.wb.breakProp(cp);
591 if (isIgnorable(prop)) continue;
592 if (prop == .Regional_Indicator) {
593 flags += 1;
594 } else break;
595 }
596 return flags;
597 }
598
599 fn prev(iter: *SneakIterator) ?CodePoint {
600 while (iter.cp_iter.prev()) |peeked| {
601 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
602 }
603 return null;
604 }
605};
606
607inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
608 const decompressor = compress.flate.inflate.decompressor;
609 const in_bytes = @embedFile("wbp");
610 var in_fbs = std.io.fixedBufferStream(in_bytes);
611 var in_decomp = decompressor(.raw, in_fbs.reader());
612 var reader = in_decomp.reader();
613
614 const endian = builtin.cpu.arch.endian();
615
616 const stage_1_len: u16 = try reader.readInt(u16, endian);
617 wb.s1 = try allocator.alloc(u16, stage_1_len);
618 errdefer allocator.free(wb.s1);
619 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
620
621 const stage_2_len: u16 = try reader.readInt(u16, endian);
622 wb.s2 = try allocator.alloc(u5, stage_2_len);
623 errdefer allocator.free(wb.s2);
624 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
625 var count_0: usize = 0;
626 for (wb.s2) |nyb| {
627 if (nyb == 0) count_0 += 1;
628 }
629}
630
631//| Predicates
632
633inline fn isNewline(wbp: WordBreakProperty) bool {
634 return wbp == .CR or wbp == .LF or wbp == .Newline;
635}
636
637inline fn isIgnorable(wbp: WordBreakProperty) bool {
638 return switch (wbp) {
639 .Format, .Extend, .ZWJ => true,
640 else => false,
641 };
642}
643
644inline fn isAHLetter(wbp: WordBreakProperty) bool {
645 return wbp == .ALetter or wbp == .Hebrew_Letter;
646}
647
648inline fn isMidVal(wbp: WordBreakProperty) bool {
649 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
650}
651
652inline fn isMidNum(wbp: WordBreakProperty) bool {
653 return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
654}
655
656inline fn isExtensible(wbp: WordBreakProperty) bool {
657 return switch (wbp) {
658 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
659 else => false,
660 };
661}
662
663test "Word Break Properties" {
664 const wb = try Words.init(testing.allocator);
665 defer wb.deinit(testing.allocator);
666 try testing.expectEqual(.CR, wb.breakProperty('\r'));
667 try testing.expectEqual(.LF, wb.breakProperty('\n'));
668 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
669 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
670}
671
672test "ext_pict" {
673 try testing.expect(ext_pict.isMatch("👇"));
674 try testing.expect(ext_pict.isMatch("\u{2701}"));
675}
676
677test "Words" {
678 const wb = try Words.init(testing.allocator);
679 defer wb.deinit(testing.allocator);
680 const word_str = "Metonym Μετωνύμιο メトニム";
681 var w_iter = wb.iterator(word_str);
682 try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
683 // Spaces are "words" too!
684 try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str));
685 const in_greek = w_iter.next().?;
686 for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
687 const at_index = wb.wordAtIndex(word_str, i).bytes(word_str);
688 try testing.expectEqualStrings("Μετωνύμιο", at_index);
689 }
690 _ = w_iter.next();
691 try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str));
692}
693
694test wordAtIndex {
695 const wb = try Words.init(testing.allocator);
696 defer wb.deinit(testing.allocator);
697 const t_string = "first second third";
698 const second = wb.wordAtIndex(t_string, 8);
699 try testing.expectEqualStrings("second", second.bytes(t_string));
700 const third = wb.wordAtIndex(t_string, 14);
701 try testing.expectEqualStrings("third", third.bytes(t_string));
702 {
703 const first = wb.wordAtIndex(t_string, 3);
704 try testing.expectEqualStrings("first", first.bytes(t_string));
705 }
706 {
707 const first = wb.wordAtIndex(t_string, 0);
708 try testing.expectEqualStrings("first", first.bytes(t_string));
709 }
710 const last = wb.wordAtIndex(t_string, 14);
711 try testing.expectEqualStrings("third", last.bytes(t_string));
712}
713
714const testr = "don't a:ka fin!";
715
716test "reversal" {
717 const wb = try Words.init(testing.allocator);
718 defer wb.deinit(testing.allocator);
719 {
720 var fwd = wb.iterator(testr);
721 var this_word: ?Word = fwd.next();
722
723 while (this_word) |this| : (this_word = fwd.next()) {
724 var back = fwd.reverseIterator();
725 const that_word = back.prev();
726 if (that_word) |that| {
727 try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
728 } else {
729 try testing.expect(false);
730 }
731 }
732 }
733 {
734 var back = wb.reverseIterator(testr);
735 var this_word: ?Word = back.prev();
736
737 while (this_word) |this| : (this_word = back.prev()) {
738 var fwd = back.forwardIterator();
739 const that_word = fwd.next();
740 if (that_word) |that| {
741 try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
742 } else {
743 try testing.expect(false);
744 }
745 }
746 }
747}
748
749fn testAllocations(allocator: Allocator) !void {
750 const wb = try Words.init(allocator);
751 wb.deinit(allocator);
752}
753
754test "allocation safety" {
755 try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
756}
757
758const std = @import("std");
759const builtin = @import("builtin");
760const compress = std.compress;
761const mem = std.mem;
762const Allocator = mem.Allocator;
763const assert = std.debug.assert;
764const testing = std.testing;
765
766const uoffset = code_point.uoffset;
767
768const code_point = @import("code_point");
769const CodepointIterator = code_point.Iterator;
770const ReverseCodepointIterator = code_point.ReverseIterator;
771const CodePoint = code_point.CodePoint;
772
773const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;