summaryrefslogtreecommitdiff
path: root/src/Words.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-16 12:06:32 -0400
committerGravatar Sam Atman2025-05-16 12:06:32 -0400
commit21d55b730e25733c98fa58de42d256adf5f80d88 (patch)
treeeb4745e5e25b122dcc2b603e87f132eb3a17b02c /src/Words.zig
parentProofread (diff)
downloadzg-21d55b730e25733c98fa58de42d256adf5f80d88.tar.gz
zg-21d55b730e25733c98fa58de42d256adf5f80d88.tar.xz
zg-21d55b730e25733c98fa58de42d256adf5f80d88.zip
Move WordBreak to Words
Diffstat (limited to 'src/Words.zig')
-rw-r--r--src/Words.zig720
1 files changed, 720 insertions, 0 deletions
diff --git a/src/Words.zig b/src/Words.zig
new file mode 100644
index 0000000..6a532f5
--- /dev/null
+++ b/src/Words.zig
@@ -0,0 +1,720 @@
1//! Word Breaking Algorithm.
2
3const WordBreakProperty = enum(u5) {
4 none,
5 Double_Quote,
6 Single_Quote,
7 Hebrew_Letter,
8 CR,
9 LF,
10 Newline,
11 Extend,
12 Regional_Indicator,
13 Format,
14 Katakana,
15 ALetter,
16 MidLetter,
17 MidNum,
18 MidNumLet,
19 Numeric,
20 ExtendNumLet,
21 ZWJ,
22 WSegSpace,
23};
24
25s1: []u16 = undefined,
26s2: []u5 = undefined,
27
28const WordBreak = @This();
29
30pub fn init(allocator: Allocator) Allocator.Error!WordBreak {
31 var wb: WordBreak = undefined;
32 try wb.setup(allocator);
33 return wb;
34}
35
36pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
37 wb.setupImpl(allocator) catch |err| {
38 switch (err) {
39 error.OutOfMemory => |e| return e,
40 else => unreachable,
41 }
42 };
43}
44
45pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
46 allocator.free(wordbreak.s1);
47 allocator.free(wordbreak.s2);
48}
49
50/// Represents a Unicode word span, as an offset into the source string
51/// and the length of the word.
52pub const Word = struct {
53 offset: u32,
54 len: u32,
55
56 /// Returns a slice of the word given the source string.
57 pub fn bytes(self: Word, src: []const u8) []const u8 {
58 return src[self.offset..][0..self.len];
59 }
60};
61
62/// Returns the word break property type for `cp`.
63pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65}
66
67/// Convenience function for working with CodePoints
68fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
70}
71
72/// Returns the Word at the given index. Asserts that the index is less than
73/// `string.len`, and that the string is not empty. Always returns a word.
74/// The index does not have to be the start of a codepoint in the word.
75pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
76 assert(index < string.len and string.len > 0);
77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
78 const first_back = iter_back.prev();
79 if (first_back) |back| {
80 if (back.offset == 0) {
81 var iter_fwd = wordbreak.iterator(string);
82 while (iter_fwd.next()) |word| {
83 if (word.offset <= index and index < word.offset + word.len)
84 return word;
85 }
86 }
87 } else {
88 var iter_fwd = wordbreak.iterator(string);
89 while (iter_fwd.next()) |word| {
90 if (word.offset <= index and index < word.offset + word.len)
91 return word;
92 }
93 }
94 const second_back = iter_back.prev();
95 if (second_back) |back| if (back.offset == 0) {
96 var iter_fwd = wordbreak.iterator(string);
97 while (iter_fwd.next()) |word| {
98 if (word.offset <= index and index < word.offset + word.len)
99 return word;
100 }
101 };
102 // There's sometimes flags:
103 if (iter_back.flags > 0) {
104 while (iter_back.flags > 0) {
105 if (iter_back.prev()) |_| {
106 continue;
107 } else {
108 break;
109 }
110 }
111 }
112 var iter_fwd = iter_back.forwardIterator();
113 while (iter_fwd.next()) |word| {
114 if (word.offset <= index and index < word.offset + word.len)
115 return word;
116 }
117 unreachable;
118}
119
120/// Returns an iterator over words in `slice`.
121pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
122 return Iterator.init(wordbreak, slice);
123}
124
125/// Returns a reverse iterator over the words in `slice`.
126pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {
127 return ReverseIterator.init(wordbreak, slice);
128}
129
130/// An iterator, forward, over all words in a provided string.
131pub const Iterator = struct {
132 this: ?CodePoint = null,
133 that: ?CodePoint = null,
134 cp_iter: CodepointIterator,
135 wb: *const WordBreak,
136
137 /// Assumes `str` is valid UTF-8.
138 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
139 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
140 wb_iter.advance();
141 return wb_iter;
142 }
143
144 /// Returns the next word segment, without advancing.
145 pub fn peek(iter: *Iterator) ?Word {
146 const cache = .{ iter.this, iter.that, iter.cp_iter };
147 defer {
148 iter.this, iter.that, iter.cp_iter = cache;
149 }
150 return iter.next();
151 }
152
153 /// Returns a reverse iterator from the point this iterator is paused
154 /// at. Usually, and always when using the API to create iterators,
155 /// calling `prev()` will return the word just seen.
156 pub fn reverseIterator(iter: *Iterator) ReverseIterator {
157 var cp_it = iter.cp_iter.reverseIterator();
158 if (iter.that) |_|
159 _ = cp_it.prev();
160 if (iter.cp_iter.peek()) |_|
161 _ = cp_it.prev();
162 return .{
163 .wb = iter.wb,
164 .before = cp_it.prev(),
165 .after = iter.that,
166 .cp_iter = cp_it,
167 };
168 }
169
170 /// Returns the next word segment, if any.
171 pub fn next(iter: *Iterator) ?Word {
172 iter.advance();
173
174 // Done?
175 if (iter.this == null) return null;
176 // Last?
177 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
178
179 const word_start = iter.this.?.offset;
180 var word_len: u32 = 0;
181
182 // State variables.
183 var last_p: WordBreakProperty = .none;
184 var last_last_p: WordBreakProperty = .none;
185 var ri_count: usize = 0;
186
187 scan: while (true) : (iter.advance()) {
188 const this = iter.this.?;
189 word_len += this.len;
190 if (iter.that) |that| {
191 const this_p = iter.wb.breakProp(this);
192 const that_p = iter.wb.breakProp(that);
193 if (!isIgnorable(this_p)) {
194 last_last_p = last_p;
195 last_p = this_p;
196 }
197 // WB3 CR × LF
198 if (this_p == .CR and that_p == .LF) continue :scan;
199 // WB3a (Newline | CR | LF) ÷
200 if (isNewline(this_p)) break :scan;
201 // WB3b ÷ (Newline | CR | LF)
202 if (isNewline(that_p)) break :scan;
203 // WB3c ZWJ × \p{Extended_Pictographic}
204 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
205 continue :scan;
206 }
207 // WB3d WSegSpace × WSegSpace
208 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
209 // WB4 X (Extend | Format | ZWJ)* → X
210 if (isIgnorable(that_p)) {
211 continue :scan;
212 } // Now we use last_p instead of this_p for ignorable's sake
213 if (isAHLetter(last_p)) {
214 // WB5 AHLetter × AHLetter
215 if (isAHLetter(that_p)) continue :scan;
216 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
217 if (isMidVal(that_p)) {
218 const next_val = iter.peekPast();
219 if (next_val) |next_cp| {
220 const next_p = iter.wb.breakProp(next_cp);
221 if (isAHLetter(next_p)) {
222 continue :scan;
223 }
224 }
225 }
226 }
227 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
228 if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
229 continue :scan;
230 }
231 if (last_p == .Hebrew_Letter) {
232 // WB7a Hebrew_Letter × Single_Quote
233 if (that_p == .Single_Quote) continue :scan;
234 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
235 if (that_p == .Double_Quote) {
236 const next_val = iter.peekPast();
237 if (next_val) |next_cp| {
238 const next_p = iter.wb.breakProp(next_cp);
239 if (next_p == .Hebrew_Letter) {
240 continue :scan;
241 }
242 }
243 }
244 }
245 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
246 if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
247 continue :scan;
248 // WB8 Numeric × Numeric
249 if (last_p == .Numeric and that_p == .Numeric) continue :scan;
250 // WB9 AHLetter × Numeric
251 if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
252 // WB10 Numeric × AHLetter
253 if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
254 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
255 if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
256 continue :scan;
257 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
258 if (last_p == .Numeric and isMidNum(that_p)) {
259 const next_val = iter.peekPast();
260 if (next_val) |next_cp| {
261 const next_p = iter.wb.breakProp(next_cp);
262 if (next_p == .Numeric) {
263 continue :scan;
264 }
265 }
266 }
267 // WB13 Katakana × Katakana
268 if (last_p == .Katakana and that_p == .Katakana) continue :scan;
269 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
270 if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
271 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
272 if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
273 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
274 const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
275 if (maybe_flag) {
276 ri_count += 1;
277 if (ri_count % 2 == 1) continue :scan;
278 }
279 // WB999 Any ÷ Any
280 break :scan;
281 } else { // iter.that == null
282 break :scan;
283 }
284 }
285
286 return Word{ .len = word_len, .offset = word_start };
287 }
288
289 pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
290 try writer.print(
291 "Iterator {{ .this = {any}, .that = {any} }}",
292 .{ iter.this, iter.that },
293 );
294 }
295
296 fn advance(iter: *Iterator) void {
297 iter.this = iter.that;
298 iter.that = iter.cp_iter.next();
299 }
300
301 fn peekPast(iter: *Iterator) ?CodePoint {
302 const save_cp = iter.cp_iter;
303 defer iter.cp_iter = save_cp;
304 while (iter.cp_iter.peek()) |peeked| {
305 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
306 _ = iter.cp_iter.next();
307 }
308 return null;
309 }
310};
311
312/// An iterator, backward, over all words in a provided string.
313pub const ReverseIterator = struct {
314 after: ?CodePoint = null,
315 before: ?CodePoint = null,
316 cp_iter: ReverseCodepointIterator,
317 wb: *const WordBreak,
318 flags: usize = 0,
319
320 /// Assumes `str` is valid UTF-8.
321 pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
322 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
323 wb_iter.advance();
324 return wb_iter;
325 }
326
327 /// Returns the previous word segment, if any, without advancing.
328 pub fn peek(iter: *ReverseIterator) ?Word {
329 const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags };
330 defer {
331 iter.before, iter.after, iter.cp_iter, iter.flags = cache;
332 }
333 return iter.prev();
334 }
335
336 /// Return a forward iterator from where this iterator paused. Usually,
337 /// and always when using the API to create iterators, calling `next()`
338 /// will return the word just seen.
339 pub fn forwardIterator(iter: *ReverseIterator) Iterator {
340 var cp_it = iter.cp_iter.forwardIterator();
341 if (iter.before) |_|
342 _ = cp_it.next();
343 return .{
344 .wb = iter.wb,
345 .this = cp_it.next(),
346 .that = iter.after,
347 .cp_iter = cp_it,
348 };
349 }
350
351 /// Return the previous word, if any.
352 pub fn prev(iter: *ReverseIterator) ?Word {
353 iter.advance();
354
355 // Done?
356 if (iter.after == null) return null;
357 // Last?
358 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
359
360 const word_end = iter.after.?.offset + iter.after.?.len;
361 var word_len: u32 = 0;
362
363 // State variables.
364 var last_p: WordBreakProperty = .none;
365 var last_last_p: WordBreakProperty = .none;
366
367 scan: while (true) : (iter.advance()) {
368 const after = iter.after.?;
369 word_len += after.len;
370 if (iter.before) |before| {
371 var sneak = sneaky(iter); // 'sneaks' past ignorables
372 const after_p = iter.wb.breakProp(after);
373 var before_p = iter.wb.breakProp(before);
374 if (!isIgnorable(after_p)) {
375 last_last_p = last_p;
376 last_p = after_p;
377 }
378 // WB3 CR × LF
379 if (before_p == .CR and after_p == .LF) continue :scan;
380 // WB3a (Newline | CR | LF) ÷
381 if (isNewline(before_p)) break :scan;
382 // WB3b ÷ (Newline | CR | LF)
383 if (isNewline(after_p)) break :scan;
384 // WB3c ZWJ × \p{Extended_Pictographic}
385 if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) {
386 continue :scan;
387 }
388 // WB3d WSegSpace × WSegSpace
389 if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan;
390 // WB4 X (Extend | Format | ZWJ)* → X
391 if (isIgnorable(before_p)) {
392 const maybe_before = sneak.prev();
393 if (maybe_before) |valid_before| {
394 before_p = iter.wb.breakProp(valid_before);
395 } else if (!isIgnorable(after_p)) {
396 // We're done
397 break :scan;
398 }
399 }
400 if (isIgnorable(after_p)) continue :scan;
401 // WB5 AHLetter × AHLetter
402 if (isAHLetter(last_p) and isAHLetter(before_p)) {
403 continue :scan;
404 }
405 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
406 if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) {
407 continue :scan;
408 }
409 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
410 if (isMidVal(before_p) and isAHLetter(last_p)) {
411 const prev_val = sneak.peek();
412 if (prev_val) |prev_cp| {
413 const prev_p = iter.wb.breakProp(prev_cp);
414 if (isAHLetter(prev_p)) {
415 continue :scan;
416 }
417 }
418 }
419 // WB7a Hebrew_Letter × Single_Quote
420 if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan;
421 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
422 if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) {
423 continue :scan;
424 }
425 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
426 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
427 const prev_val = sneak.peek();
428 if (prev_val) |prev_cp| {
429 const prev_p = iter.wb.breakProp(prev_cp);
430 if (prev_p == .Hebrew_Letter) {
431 continue :scan;
432 }
433 }
434 }
435 // WB8 Numeric × Numeric
436 if (before_p == .Numeric and last_p == .Numeric) continue :scan;
437 // WB9 AHLetter × Numeric
438 if (isAHLetter(before_p) and last_p == .Numeric) continue :scan;
439 // WB10 Numeric × AHLetter
440 if (before_p == .Numeric and isAHLetter(last_p)) continue :scan;
441 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
442 if (isMidNum(before_p) and last_p == .Numeric) {
443 const prev_val = sneak.peek();
444 if (prev_val) |prev_cp| {
445 const prev_p = iter.wb.breakProp(prev_cp);
446 if (prev_p == .Numeric) {
447 continue :scan;
448 }
449 }
450 }
451 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
452 if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) {
453 continue :scan;
454 }
455 // WB13 Katakana × Katakana
456 if (before_p == .Katakana and last_p == .Katakana) continue :scan;
457 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
458 if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan;
459 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
460 if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan;
461 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
462 // NOTE:
463 // So here we simply have to know whether a run of flags is even or odd.
464 // The whole run. To avoid quadratic behavior (and long flag runs are
465 // actually a thing in the wild), we have to count them once, store that
466 // on the iterator, and decrement each time we see two, possibly breaking
467 // once extra at the beginning. They break up one per flag, once we hit
468 // zero, that's all the flags. If we see another flag we do it again.
469 if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) {
470 defer {
471 if (iter.flags > 0) iter.flags -= 1;
472 }
473 if (iter.flags == 0) {
474 iter.flags = sneak.countFlags();
475 }
476 if (iter.flags % 2 == 0) {
477 continue :scan;
478 }
479 }
480 // WB999 Any ÷ Any
481 break :scan;
482 }
483 break :scan;
484 }
485 return Word{ .len = word_len, .offset = word_end - word_len };
486 }
487
488 pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
489 try writer.print(
490 "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
491 .{ iter.before, iter.after, iter.flags },
492 );
493 }
494
495 fn peekPast(iter: *ReverseIterator) ?CodePoint {
496 const save_cp = iter.cp_iter;
497 defer iter.cp_iter = save_cp;
498 while (iter.cp_iter.peek()) |peeked| {
499 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
500 _ = iter.cp_iter.prev();
501 }
502 return null;
503 }
504
505 fn advance(iter: *ReverseIterator) void {
506 iter.after = iter.before;
507 iter.before = iter.cp_iter.prev();
508 }
509};
510
511//| Implementation Details
512
513/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
514fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
515 var idx: u32 = @intCast(index);
516 // Find the next lead byte:
517 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
518 if (idx == string.len) return wb.reverseIterator(string);
519 var iter: ReverseIterator = undefined;
520 iter.wb = wb;
521 iter.flags = 0;
522 // We need to populate the CodePoints, and the codepoint iterator.
523 // Consider "abc| def" with the cursor as |.
524 // We need `before` to be `c` and `after` to be ' ',
525 // and `cp_iter.prev()` to be `b`.
526 var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx };
527 iter.after = cp_iter.prev();
528 iter.before = cp_iter.prev();
529 iter.cp_iter = cp_iter;
530 return iter;
531}
532
533fn sneaky(iter: *const ReverseIterator) SneakIterator {
534 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb };
535}
536
537const SneakIterator = struct {
538 cp_iter: ReverseCodepointIterator,
539 wb: *const WordBreak,
540
541 fn peek(iter: *SneakIterator) ?CodePoint {
542 const save_cp = iter.cp_iter;
543 defer iter.cp_iter = save_cp;
544 while (iter.cp_iter.peek()) |peeked| {
545 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
546 _ = iter.cp_iter.prev();
547 }
548 return null;
549 }
550
551 fn countFlags(iter: *SneakIterator) usize {
552 var flags: usize = 0;
553 const save_cp = iter.cp_iter;
554 defer iter.cp_iter = save_cp;
555 while (iter.cp_iter.prev()) |cp| {
556 const prop = iter.wb.breakProp(cp);
557 if (isIgnorable(prop)) continue;
558 if (prop == .Regional_Indicator) {
559 flags += 1;
560 } else break;
561 }
562 return flags;
563 }
564
565 fn prev(iter: *SneakIterator) ?CodePoint {
566 while (iter.cp_iter.prev()) |peeked| {
567 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked;
568 }
569 return null;
570 }
571};
572
573inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
574 const decompressor = compress.flate.inflate.decompressor;
575 const in_bytes = @embedFile("wbp");
576 var in_fbs = std.io.fixedBufferStream(in_bytes);
577 var in_decomp = decompressor(.raw, in_fbs.reader());
578 var reader = in_decomp.reader();
579
580 const endian = builtin.cpu.arch.endian();
581
582 const stage_1_len: u16 = try reader.readInt(u16, endian);
583 wb.s1 = try allocator.alloc(u16, stage_1_len);
584 errdefer allocator.free(wb.s1);
585 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
586
587 const stage_2_len: u16 = try reader.readInt(u16, endian);
588 wb.s2 = try allocator.alloc(u5, stage_2_len);
589 errdefer allocator.free(wb.s2);
590 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
591 var count_0: usize = 0;
592 for (wb.s2) |nyb| {
593 if (nyb == 0) count_0 += 1;
594 }
595}
596
597//| Predicates
598
599inline fn isNewline(wbp: WordBreakProperty) bool {
600 return wbp == .CR or wbp == .LF or wbp == .Newline;
601}
602
603inline fn isIgnorable(wbp: WordBreakProperty) bool {
604 return switch (wbp) {
605 .Format, .Extend, .ZWJ => true,
606 else => false,
607 };
608}
609
610inline fn isAHLetter(wbp: WordBreakProperty) bool {
611 return wbp == .ALetter or wbp == .Hebrew_Letter;
612}
613
614inline fn isMidVal(wbp: WordBreakProperty) bool {
615 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
616}
617
618inline fn isMidNum(wbp: WordBreakProperty) bool {
619 return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
620}
621
622inline fn isExtensible(wbp: WordBreakProperty) bool {
623 return switch (wbp) {
624 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
625 else => false,
626 };
627}
628
629test "Word Break Properties" {
630 const wb = try WordBreak.init(testing.allocator);
631 defer wb.deinit(testing.allocator);
632 try testing.expectEqual(.CR, wb.breakProperty('\r'));
633 try testing.expectEqual(.LF, wb.breakProperty('\n'));
634 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
635 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
636}
637
638test "ext_pict" {
639 try testing.expect(ext_pict.isMatch("👇"));
640 try testing.expect(ext_pict.isMatch("\u{2701}"));
641}
642
643test wordAtIndex {
644 const wb = try WordBreak.init(testing.allocator);
645 defer wb.deinit(testing.allocator);
646 const t_string = "first second third";
647 const second = wb.wordAtIndex(t_string, 8);
648 try testing.expectEqualStrings("second", second.bytes(t_string));
649 const third = wb.wordAtIndex(t_string, 14);
650 try testing.expectEqualStrings("third", third.bytes(t_string));
651 {
652 const first = wb.wordAtIndex(t_string, 3);
653 try testing.expectEqualStrings("first", first.bytes(t_string));
654 }
655 {
656 const first = wb.wordAtIndex(t_string, 0);
657 try testing.expectEqualStrings("first", first.bytes(t_string));
658 }
659 const last = wb.wordAtIndex(t_string, 14);
660 try testing.expectEqualStrings("third", last.bytes(t_string));
661}
662
663const testr = "don't a:ka fin!";
664
665test "reversal" {
666 const wb = try WordBreak.init(testing.allocator);
667 defer wb.deinit(testing.allocator);
668 {
669 var fwd = wb.iterator(testr);
670 var this_word: ?Word = fwd.next();
671
672 while (this_word) |this| : (this_word = fwd.next()) {
673 var back = fwd.reverseIterator();
674 const that_word = back.prev();
675 if (that_word) |that| {
676 try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
677 } else {
678 try testing.expect(false);
679 }
680 }
681 }
682 {
683 var back = wb.reverseIterator(testr);
684 var this_word: ?Word = back.prev();
685
686 while (this_word) |this| : (this_word = back.prev()) {
687 var fwd = back.forwardIterator();
688 const that_word = fwd.next();
689 if (that_word) |that| {
690 try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr));
691 } else {
692 try testing.expect(false);
693 }
694 }
695 }
696}
697
698fn testAllocations(allocator: Allocator) !void {
699 const wb = try WordBreak.init(allocator);
700 wb.deinit(allocator);
701}
702
703test "allocation safety" {
704 try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
705}
706
707const std = @import("std");
708const builtin = @import("builtin");
709const compress = std.compress;
710const mem = std.mem;
711const Allocator = mem.Allocator;
712const assert = std.debug.assert;
713const testing = std.testing;
714
715const code_point = @import("code_point");
716const CodepointIterator = code_point.Iterator;
717const ReverseCodepointIterator = code_point.ReverseIterator;
718const CodePoint = code_point.CodePoint;
719
720const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;