summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Words.zig215
-rw-r--r--src/unicode_tests.zig14
2 files changed, 85 insertions, 144 deletions
diff --git a/src/Words.zig b/src/Words.zig
index ce3203f..aeb25d1 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -3,6 +3,8 @@
3//! https://www.unicode.org/reports/tr29/#Word_Boundaries 3//! https://www.unicode.org/reports/tr29/#Word_Boundaries
4//! 4//!
5 5
6const Words = @This();
7
6const WordBreakProperty = enum(u5) { 8const WordBreakProperty = enum(u5) {
7 none, 9 none,
8 Double_Quote, 10 Double_Quote,
@@ -25,30 +27,18 @@ const WordBreakProperty = enum(u5) {
25 WSegSpace, 27 WSegSpace,
26}; 28};
27 29
28s1: []u16 = undefined, 30const Data = struct {
29s2: []u5 = undefined, 31 s1: []const u16 = undefined,
30 32 s2: []const u5 = undefined,
31const Words = @This(); 33};
32
33pub fn init(allocator: Allocator) Allocator.Error!Words {
34 var wb: Words = undefined;
35 try wb.setup(allocator);
36 return wb;
37}
38 34
39pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { 35const wbp = display_width: {
40 wb.setupImpl(allocator) catch |err| { 36 const data = @import("wbp");
41 switch (err) { 37 break :display_width Data{
42 error.OutOfMemory => |e| return e, 38 .s1 = &data.s1,
43 else => unreachable, 39 .s2 = &data.s2,
44 }
45 }; 40 };
46} 41};
47
48pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
49 allocator.free(words.s1);
50 allocator.free(words.s2);
51}
52 42
53/// Represents a Unicode word span, as an offset into the source string 43/// Represents a Unicode word span, as an offset into the source string
54/// and the length of the word. 44/// and the length of the word.
@@ -63,32 +53,32 @@ pub const Word = struct {
63}; 53};
64 54
65/// Returns the word break property type for `cp`. 55/// Returns the word break property type for `cp`.
66pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { 56pub fn breakProperty(cp: u21) WordBreakProperty {
67 return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); 57 return @enumFromInt(wbp.s2[wbp.s1[cp >> 8] + (cp & 0xff)]);
68} 58}
69 59
70/// Convenience function for working with CodePoints 60/// Convenience function for working with CodePoints
71fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { 61fn breakProp(point: CodePoint) WordBreakProperty {
72 return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); 62 return @enumFromInt(wbp.s2[wbp.s1[point.code >> 8] + (point.code & 0xff)]);
73} 63}
74 64
75/// Returns the Word at the given index. Asserts that the index is less than 65/// Returns the Word at the given index. Asserts that the index is less than
76/// `string.len`, and that the string is not empty. Always returns a word. 66/// `string.len`, and that the string is not empty. Always returns a word.
77/// The index does not have to be the start of a codepoint in the word. 67/// The index does not have to be the start of a codepoint in the word.
78pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { 68pub fn wordAtIndex(string: []const u8, index: usize) Word {
79 assert(index < string.len and string.len > 0); 69 assert(index < string.len and string.len > 0);
80 var iter_back: ReverseIterator = reverseFromIndex(words, string, index); 70 var iter_back: ReverseIterator = reverseFromIndex(string, index);
81 const first_back = iter_back.prev(); 71 const first_back = iter_back.prev();
82 if (first_back) |back| { 72 if (first_back) |back| {
83 if (back.offset == 0) { 73 if (back.offset == 0) {
84 var iter_fwd = words.iterator(string); 74 var iter_fwd = Words.iterator(string);
85 while (iter_fwd.next()) |word| { 75 while (iter_fwd.next()) |word| {
86 if (word.offset <= index and index < word.offset + word.len) 76 if (word.offset <= index and index < word.offset + word.len)
87 return word; 77 return word;
88 } 78 }
89 } 79 }
90 } else { 80 } else {
91 var iter_fwd = words.iterator(string); 81 var iter_fwd = Words.iterator(string);
92 while (iter_fwd.next()) |word| { 82 while (iter_fwd.next()) |word| {
93 if (word.offset <= index and index < word.offset + word.len) 83 if (word.offset <= index and index < word.offset + word.len)
94 return word; 84 return word;
@@ -114,23 +104,23 @@ pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word {
114} 104}
115 105
116/// Returns an iterator over words in `slice`. 106/// Returns an iterator over words in `slice`.
117pub fn iterator(words: *const Words, slice: []const u8) Iterator { 107pub fn iterator(slice: []const u8) Iterator {
118 return Iterator.init(words, slice); 108 return Iterator.init(slice);
119} 109}
120 110
121/// Returns a reverse iterator over the words in `slice`. 111/// Returns a reverse iterator over the words in `slice`.
122pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { 112pub fn reverseIterator(slice: []const u8) ReverseIterator {
123 return ReverseIterator.init(words, slice); 113 return ReverseIterator.init(slice);
124} 114}
125 115
126/// Returns an iterator after the `word` in `slice`. 116/// Returns an iterator after the `word` in `slice`.
127pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { 117pub fn iterateAfterWord(slice: []const u8, word: Word) Iterator {
128 return forwardFromIndex(words, slice, word.offset + word.len); 118 return forwardFromIndex(slice, word.offset + word.len);
129} 119}
130 120
131/// Returns a reverse iterator before the `word` in `slice`. 121/// Returns a reverse iterator before the `word` in `slice`.
132pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { 122pub fn iterateBeforeWord(slice: []const u8, word: Word) ReverseIterator {
133 return reverseFromIndex(words, slice, word.offset); 123 return reverseFromIndex(slice, word.offset);
134} 124}
135 125
136/// An iterator, forward, over all words in a provided string. 126/// An iterator, forward, over all words in a provided string.
@@ -138,11 +128,10 @@ pub const Iterator = struct {
138 this: ?CodePoint = null, 128 this: ?CodePoint = null,
139 that: ?CodePoint = null, 129 that: ?CodePoint = null,
140 cp_iter: CodepointIterator, 130 cp_iter: CodepointIterator,
141 wb: *const Words,
142 131
143 /// Assumes `str` is valid UTF-8. 132 /// Assumes `str` is valid UTF-8.
144 pub fn init(words: *const Words, str: []const u8) Iterator { 133 pub fn init(str: []const u8) Iterator {
145 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; 134 var wb_iter: Iterator = .{ .cp_iter = .init(str) };
146 wb_iter.advance(); 135 wb_iter.advance();
147 return wb_iter; 136 return wb_iter;
148 } 137 }
@@ -166,7 +155,6 @@ pub const Iterator = struct {
166 if (iter.cp_iter.peek()) |_| 155 if (iter.cp_iter.peek()) |_|
167 _ = cp_it.prev(); 156 _ = cp_it.prev();
168 return .{ 157 return .{
169 .wb = iter.wb,
170 .before = cp_it.prev(), 158 .before = cp_it.prev(),
171 .after = iter.that, 159 .after = iter.that,
172 .cp_iter = cp_it, 160 .cp_iter = cp_it,
@@ -194,8 +182,8 @@ pub const Iterator = struct {
194 const this = iter.this.?; 182 const this = iter.this.?;
195 word_len += this.len; 183 word_len += this.len;
196 if (iter.that) |that| { 184 if (iter.that) |that| {
197 const this_p = iter.wb.breakProp(this); 185 const this_p = Words.breakProp(this);
198 const that_p = iter.wb.breakProp(that); 186 const that_p = Words.breakProp(that);
199 if (!isIgnorable(this_p)) { 187 if (!isIgnorable(this_p)) {
200 last_last_p = last_p; 188 last_last_p = last_p;
201 last_p = this_p; 189 last_p = this_p;
@@ -223,7 +211,7 @@ pub const Iterator = struct {
223 if (isMidVal(that_p)) { 211 if (isMidVal(that_p)) {
224 const next_val = iter.peekPast(); 212 const next_val = iter.peekPast();
225 if (next_val) |next_cp| { 213 if (next_val) |next_cp| {
226 const next_p = iter.wb.breakProp(next_cp); 214 const next_p = Words.breakProp(next_cp);
227 if (isAHLetter(next_p)) { 215 if (isAHLetter(next_p)) {
228 continue :scan; 216 continue :scan;
229 } 217 }
@@ -241,7 +229,7 @@ pub const Iterator = struct {
241 if (that_p == .Double_Quote) { 229 if (that_p == .Double_Quote) {
242 const next_val = iter.peekPast(); 230 const next_val = iter.peekPast();
243 if (next_val) |next_cp| { 231 if (next_val) |next_cp| {
244 const next_p = iter.wb.breakProp(next_cp); 232 const next_p = Words.breakProp(next_cp);
245 if (next_p == .Hebrew_Letter) { 233 if (next_p == .Hebrew_Letter) {
246 continue :scan; 234 continue :scan;
247 } 235 }
@@ -264,7 +252,7 @@ pub const Iterator = struct {
264 if (last_p == .Numeric and isMidNum(that_p)) { 252 if (last_p == .Numeric and isMidNum(that_p)) {
265 const next_val = iter.peekPast(); 253 const next_val = iter.peekPast();
266 if (next_val) |next_cp| { 254 if (next_val) |next_cp| {
267 const next_p = iter.wb.breakProp(next_cp); 255 const next_p = Words.breakProp(next_cp);
268 if (next_p == .Numeric) { 256 if (next_p == .Numeric) {
269 continue :scan; 257 continue :scan;
270 } 258 }
@@ -308,7 +296,7 @@ pub const Iterator = struct {
308 const save_cp = iter.cp_iter; 296 const save_cp = iter.cp_iter;
309 defer iter.cp_iter = save_cp; 297 defer iter.cp_iter = save_cp;
310 while (iter.cp_iter.peek()) |peeked| { 298 while (iter.cp_iter.peek()) |peeked| {
311 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; 299 if (!isIgnorable(Words.breakProp(peeked))) return peeked;
312 _ = iter.cp_iter.next(); 300 _ = iter.cp_iter.next();
313 } 301 }
314 return null; 302 return null;
@@ -320,12 +308,11 @@ pub const ReverseIterator = struct {
320 after: ?CodePoint = null, 308 after: ?CodePoint = null,
321 before: ?CodePoint = null, 309 before: ?CodePoint = null,
322 cp_iter: ReverseCodepointIterator, 310 cp_iter: ReverseCodepointIterator,
323 wb: *const Words,
324 flags: usize = 0, 311 flags: usize = 0,
325 312
326 /// Assumes `str` is valid UTF-8. 313 /// Assumes `str` is valid UTF-8.
327 pub fn init(words: *const Words, str: []const u8) ReverseIterator { 314 pub fn init(str: []const u8) ReverseIterator {
328 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; 315 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str) };
329 wb_iter.advance(); 316 wb_iter.advance();
330 return wb_iter; 317 return wb_iter;
331 } 318 }
@@ -347,7 +334,6 @@ pub const ReverseIterator = struct {
347 if (iter.before) |_| 334 if (iter.before) |_|
348 _ = cp_it.next(); 335 _ = cp_it.next();
349 return .{ 336 return .{
350 .wb = iter.wb,
351 .this = cp_it.next(), 337 .this = cp_it.next(),
352 .that = iter.after, 338 .that = iter.after,
353 .cp_iter = cp_it, 339 .cp_iter = cp_it,
@@ -375,8 +361,8 @@ pub const ReverseIterator = struct {
375 word_len += after.len; 361 word_len += after.len;
376 if (iter.before) |before| { 362 if (iter.before) |before| {
377 var sneak = sneaky(iter); // 'sneaks' past ignorables 363 var sneak = sneaky(iter); // 'sneaks' past ignorables
378 const after_p = iter.wb.breakProp(after); 364 const after_p = Words.breakProp(after);
379 var before_p = iter.wb.breakProp(before); 365 var before_p = Words.breakProp(before);
380 if (!isIgnorable(after_p)) { 366 if (!isIgnorable(after_p)) {
381 last_last_p = last_p; 367 last_last_p = last_p;
382 last_p = after_p; 368 last_p = after_p;
@@ -397,7 +383,7 @@ pub const ReverseIterator = struct {
397 if (isIgnorable(before_p)) { 383 if (isIgnorable(before_p)) {
398 const maybe_before = sneak.prev(); 384 const maybe_before = sneak.prev();
399 if (maybe_before) |valid_before| { 385 if (maybe_before) |valid_before| {
400 before_p = iter.wb.breakProp(valid_before); 386 before_p = Words.breakProp(valid_before);
401 } else if (!isIgnorable(after_p)) { 387 } else if (!isIgnorable(after_p)) {
402 // We're done 388 // We're done
403 break :scan; 389 break :scan;
@@ -416,7 +402,7 @@ pub const ReverseIterator = struct {
416 if (isMidVal(before_p) and isAHLetter(last_p)) { 402 if (isMidVal(before_p) and isAHLetter(last_p)) {
417 const prev_val = sneak.peek(); 403 const prev_val = sneak.peek();
418 if (prev_val) |prev_cp| { 404 if (prev_val) |prev_cp| {
419 const prev_p = iter.wb.breakProp(prev_cp); 405 const prev_p = Words.breakProp(prev_cp);
420 if (isAHLetter(prev_p)) { 406 if (isAHLetter(prev_p)) {
421 continue :scan; 407 continue :scan;
422 } 408 }
@@ -432,7 +418,7 @@ pub const ReverseIterator = struct {
432 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { 418 if (before_p == .Double_Quote and last_p == .Hebrew_Letter) {
433 const prev_val = sneak.peek(); 419 const prev_val = sneak.peek();
434 if (prev_val) |prev_cp| { 420 if (prev_val) |prev_cp| {
435 const prev_p = iter.wb.breakProp(prev_cp); 421 const prev_p = Words.breakProp(prev_cp);
436 if (prev_p == .Hebrew_Letter) { 422 if (prev_p == .Hebrew_Letter) {
437 continue :scan; 423 continue :scan;
438 } 424 }
@@ -448,7 +434,7 @@ pub const ReverseIterator = struct {
448 if (isMidNum(before_p) and last_p == .Numeric) { 434 if (isMidNum(before_p) and last_p == .Numeric) {
449 const prev_val = sneak.peek(); 435 const prev_val = sneak.peek();
450 if (prev_val) |prev_cp| { 436 if (prev_val) |prev_cp| {
451 const prev_p = iter.wb.breakProp(prev_cp); 437 const prev_p = Words.breakProp(prev_cp);
452 if (prev_p == .Numeric) { 438 if (prev_p == .Numeric) {
453 continue :scan; 439 continue :scan;
454 } 440 }
@@ -491,7 +477,7 @@ pub const ReverseIterator = struct {
491 return Word{ .len = word_len, .offset = word_end - word_len }; 477 return Word{ .len = word_len, .offset = word_end - word_len };
492 } 478 }
493 479
494 pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { 480 pub fn format(iter: ReverseIterator, writer: anytype) !void {
495 try writer.print( 481 try writer.print(
496 "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", 482 "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}",
497 .{ iter.before, iter.after, iter.flags }, 483 .{ iter.before, iter.after, iter.flags },
@@ -502,7 +488,7 @@ pub const ReverseIterator = struct {
502 const save_cp = iter.cp_iter; 488 const save_cp = iter.cp_iter;
503 defer iter.cp_iter = save_cp; 489 defer iter.cp_iter = save_cp;
504 while (iter.cp_iter.peek()) |peeked| { 490 while (iter.cp_iter.peek()) |peeked| {
505 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; 491 if (!isIgnorable(Words.breakProp(peeked))) return peeked;
506 _ = iter.cp_iter.prev(); 492 _ = iter.cp_iter.prev();
507 } 493 }
508 return null; 494 return null;
@@ -517,13 +503,12 @@ pub const ReverseIterator = struct {
517//| Implementation Details 503//| Implementation Details
518 504
519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. 505/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { 506fn reverseFromIndex(string: []const u8, index: usize) ReverseIterator {
521 var idx: uoffset = @intCast(index); 507 var idx: uoffset = @intCast(index);
522 // Find the next lead byte: 508 // Find the next lead byte:
523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} 509 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
524 if (idx == string.len) return words.reverseIterator(string); 510 if (idx == string.len) return Words.reverseIterator(string);
525 var iter: ReverseIterator = undefined; 511 var iter: ReverseIterator = undefined;
526 iter.wb = words;
527 iter.flags = 0; 512 iter.flags = 0;
528 // We need to populate the CodePoints, and the codepoint iterator. 513 // We need to populate the CodePoints, and the codepoint iterator.
529 // Consider "abc| def" with the cursor as |. 514 // Consider "abc| def" with the cursor as |.
@@ -536,20 +521,18 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever
536 return iter; 521 return iter;
537} 522}
538 523
539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { 524fn forwardFromIndex(string: []const u8, index: usize) Iterator {
540 var idx: uoffset = @intCast(index); 525 var idx: uoffset = @intCast(index);
541 if (idx == string.len) { 526 if (idx == string.len) {
542 return .{ 527 return .{
543 .cp_iter = .{ .bytes = string, .i = idx }, 528 .cp_iter = .{ .bytes = string, .i = idx },
544 .this = null, 529 .this = null,
545 .that = null, 530 .that = null,
546 .wb = words,
547 }; 531 };
548 } 532 }
549 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} 533 while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {}
550 if (idx == 0) return words.iterator(string); 534 if (idx == 0) return Words.iterator(string);
551 var iter: Iterator = undefined; 535 var iter: Iterator = undefined;
552 iter.wb = words;
553 // We need to populate the CodePoints, and the codepoint iterator. 536 // We need to populate the CodePoints, and the codepoint iterator.
554 // Consider "abc |def" with the cursor as |. 537 // Consider "abc |def" with the cursor as |.
555 // We need `this` to be ` ` and `that` to be 'd', 538 // We need `this` to be ` ` and `that` to be 'd',
@@ -565,18 +548,17 @@ fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Itera
565} 548}
566 549
567fn sneaky(iter: *const ReverseIterator) SneakIterator { 550fn sneaky(iter: *const ReverseIterator) SneakIterator {
568 return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; 551 return .{ .cp_iter = iter.cp_iter };
569} 552}
570 553
571const SneakIterator = struct { 554const SneakIterator = struct {
572 cp_iter: ReverseCodepointIterator, 555 cp_iter: ReverseCodepointIterator,
573 wb: *const Words,
574 556
575 fn peek(iter: *SneakIterator) ?CodePoint { 557 fn peek(iter: *SneakIterator) ?CodePoint {
576 const save_cp = iter.cp_iter; 558 const save_cp = iter.cp_iter;
577 defer iter.cp_iter = save_cp; 559 defer iter.cp_iter = save_cp;
578 while (iter.cp_iter.peek()) |peeked| { 560 while (iter.cp_iter.peek()) |peeked| {
579 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; 561 if (!isIgnorable(Words.breakProp(peeked))) return peeked;
580 _ = iter.cp_iter.prev(); 562 _ = iter.cp_iter.prev();
581 } 563 }
582 return null; 564 return null;
@@ -587,7 +569,7 @@ const SneakIterator = struct {
587 const save_cp = iter.cp_iter; 569 const save_cp = iter.cp_iter;
588 defer iter.cp_iter = save_cp; 570 defer iter.cp_iter = save_cp;
589 while (iter.cp_iter.prev()) |cp| { 571 while (iter.cp_iter.prev()) |cp| {
590 const prop = iter.wb.breakProp(cp); 572 const prop = Words.breakProp(cp);
591 if (isIgnorable(prop)) continue; 573 if (isIgnorable(prop)) continue;
592 if (prop == .Regional_Indicator) { 574 if (prop == .Regional_Indicator) {
593 flags += 1; 575 flags += 1;
@@ -598,73 +580,49 @@ const SneakIterator = struct {
598 580
599 fn prev(iter: *SneakIterator) ?CodePoint { 581 fn prev(iter: *SneakIterator) ?CodePoint {
600 while (iter.cp_iter.prev()) |peeked| { 582 while (iter.cp_iter.prev()) |peeked| {
601 if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; 583 if (!isIgnorable(Words.breakProp(peeked))) return peeked;
602 } 584 }
603 return null; 585 return null;
604 } 586 }
605}; 587};
606 588
607inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
608 const in_bytes = @embedFile("wbp");
609 var in_fbs = std.io.fixedBufferStream(in_bytes);
610 var reader = in_fbs.reader();
611
612 const endian = builtin.cpu.arch.endian();
613
614 const stage_1_len: u16 = try reader.readInt(u16, endian);
615 wb.s1 = try allocator.alloc(u16, stage_1_len);
616 errdefer allocator.free(wb.s1);
617 for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian);
618
619 const stage_2_len: u16 = try reader.readInt(u16, endian);
620 wb.s2 = try allocator.alloc(u5, stage_2_len);
621 errdefer allocator.free(wb.s2);
622 for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian));
623 var count_0: usize = 0;
624 for (wb.s2) |nyb| {
625 if (nyb == 0) count_0 += 1;
626 }
627}
628
629//| Predicates 589//| Predicates
630 590
631inline fn isNewline(wbp: WordBreakProperty) bool { 591inline fn isNewline(w_prop: WordBreakProperty) bool {
632 return wbp == .CR or wbp == .LF or wbp == .Newline; 592 return w_prop == .CR or w_prop == .LF or w_prop == .Newline;
633} 593}
634 594
635inline fn isIgnorable(wbp: WordBreakProperty) bool { 595inline fn isIgnorable(w_prop: WordBreakProperty) bool {
636 return switch (wbp) { 596 return switch (w_prop) {
637 .Format, .Extend, .ZWJ => true, 597 .Format, .Extend, .ZWJ => true,
638 else => false, 598 else => false,
639 }; 599 };
640} 600}
641 601
642inline fn isAHLetter(wbp: WordBreakProperty) bool { 602inline fn isAHLetter(w_prop: WordBreakProperty) bool {
643 return wbp == .ALetter or wbp == .Hebrew_Letter; 603 return w_prop == .ALetter or w_prop == .Hebrew_Letter;
644} 604}
645 605
646inline fn isMidVal(wbp: WordBreakProperty) bool { 606inline fn isMidVal(w_prop: WordBreakProperty) bool {
647 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; 607 return w_prop == .MidLetter or w_prop == .MidNumLet or w_prop == .Single_Quote;
648} 608}
649 609
650inline fn isMidNum(wbp: WordBreakProperty) bool { 610inline fn isMidNum(w_prop: WordBreakProperty) bool {
651 return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; 611 return w_prop == .MidNum or w_prop == .MidNumLet or w_prop == .Single_Quote;
652} 612}
653 613
654inline fn isExtensible(wbp: WordBreakProperty) bool { 614inline fn isExtensible(w_prop: WordBreakProperty) bool {
655 return switch (wbp) { 615 return switch (w_prop) {
656 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, 616 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
657 else => false, 617 else => false,
658 }; 618 };
659} 619}
660 620
661test "Word Break Properties" { 621test "Word Break Properties" {
662 const wb = try Words.init(testing.allocator); 622 try testing.expectEqual(.CR, Words.breakProperty('\r'));
663 defer wb.deinit(testing.allocator); 623 try testing.expectEqual(.LF, Words.breakProperty('\n'));
664 try testing.expectEqual(.CR, wb.breakProperty('\r')); 624 try testing.expectEqual(.Hebrew_Letter, Words.breakProperty('ש'));
665 try testing.expectEqual(.LF, wb.breakProperty('\n')); 625 try testing.expectEqual(.Katakana, Words.breakProperty('\u{30ff}'));
666 try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש'));
667 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
668} 626}
669 627
670test "ext_pict" { 628test "ext_pict" {
@@ -673,16 +631,14 @@ test "ext_pict" {
673} 631}
674 632
675test "Words" { 633test "Words" {
676 const wb = try Words.init(testing.allocator);
677 defer wb.deinit(testing.allocator);
678 const word_str = "Metonym Μετωνύμιο メトニム"; 634 const word_str = "Metonym Μετωνύμιο メトニム";
679 var w_iter = wb.iterator(word_str); 635 var w_iter = Words.iterator(word_str);
680 try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); 636 try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
681 // Spaces are "words" too! 637 // Spaces are "words" too!
682 try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); 638 try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str));
683 const in_greek = w_iter.next().?; 639 const in_greek = w_iter.next().?;
684 for (in_greek.offset..in_greek.offset + in_greek.len) |i| { 640 for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
685 const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); 641 const at_index = Words.wordAtIndex(word_str, i).bytes(word_str);
686 try testing.expectEqualStrings("Μετωνύμιο", at_index); 642 try testing.expectEqualStrings("Μετωνύμιο", at_index);
687 } 643 }
688 _ = w_iter.next(); 644 _ = w_iter.next();
@@ -690,32 +646,28 @@ test "Words" {
690} 646}
691 647
692test wordAtIndex { 648test wordAtIndex {
693 const wb = try Words.init(testing.allocator);
694 defer wb.deinit(testing.allocator);
695 const t_string = "first second third"; 649 const t_string = "first second third";
696 const second = wb.wordAtIndex(t_string, 8); 650 const second = Words.wordAtIndex(t_string, 8);
697 try testing.expectEqualStrings("second", second.bytes(t_string)); 651 try testing.expectEqualStrings("second", second.bytes(t_string));
698 const third = wb.wordAtIndex(t_string, 14); 652 const third = Words.wordAtIndex(t_string, 14);
699 try testing.expectEqualStrings("third", third.bytes(t_string)); 653 try testing.expectEqualStrings("third", third.bytes(t_string));
700 { 654 {
701 const first = wb.wordAtIndex(t_string, 3); 655 const first = Words.wordAtIndex(t_string, 3);
702 try testing.expectEqualStrings("first", first.bytes(t_string)); 656 try testing.expectEqualStrings("first", first.bytes(t_string));
703 } 657 }
704 { 658 {
705 const first = wb.wordAtIndex(t_string, 0); 659 const first = Words.wordAtIndex(t_string, 0);
706 try testing.expectEqualStrings("first", first.bytes(t_string)); 660 try testing.expectEqualStrings("first", first.bytes(t_string));
707 } 661 }
708 const last = wb.wordAtIndex(t_string, 14); 662 const last = Words.wordAtIndex(t_string, 14);
709 try testing.expectEqualStrings("third", last.bytes(t_string)); 663 try testing.expectEqualStrings("third", last.bytes(t_string));
710} 664}
711 665
712const testr = "don't a:ka fin!"; 666const testr = "don't a:ka fin!";
713 667
714test "reversal" { 668test "reversal" {
715 const wb = try Words.init(testing.allocator);
716 defer wb.deinit(testing.allocator);
717 { 669 {
718 var fwd = wb.iterator(testr); 670 var fwd = Words.iterator(testr);
719 var this_word: ?Word = fwd.next(); 671 var this_word: ?Word = fwd.next();
720 672
721 while (this_word) |this| : (this_word = fwd.next()) { 673 while (this_word) |this| : (this_word = fwd.next()) {
@@ -729,7 +681,7 @@ test "reversal" {
729 } 681 }
730 } 682 }
731 { 683 {
732 var back = wb.reverseIterator(testr); 684 var back = Words.reverseIterator(testr);
733 var this_word: ?Word = back.prev(); 685 var this_word: ?Word = back.prev();
734 686
735 while (this_word) |this| : (this_word = back.prev()) { 687 while (this_word) |this| : (this_word = back.prev()) {
@@ -744,15 +696,6 @@ test "reversal" {
744 } 696 }
745} 697}
746 698
747fn testAllocations(allocator: Allocator) !void {
748 const wb = try Words.init(allocator);
749 wb.deinit(allocator);
750}
751
752test "allocation safety" {
753 try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{});
754}
755
756const std = @import("std"); 699const std = @import("std");
757const builtin = @import("builtin"); 700const builtin = @import("builtin");
758const compress = std.compress; 701const compress = std.compress;
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 946c197..50b8824 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -255,8 +255,6 @@ test "Segmentation GraphemeIterator" {
255test "Segmentation Word Iterator" { 255test "Segmentation Word Iterator" {
256 const allocator = std.testing.allocator; 256 const allocator = std.testing.allocator;
257 var reader = std.io.Reader.fixed(@embedFile("WordBreakTest.txt")); 257 var reader = std.io.Reader.fixed(@embedFile("WordBreakTest.txt"));
258 const wb = try Words.init(allocator);
259 defer wb.deinit(allocator);
260 258
261 var line_iter: IterRead = .{ .read = &reader }; 259 var line_iter: IterRead = .{ .read = &reader };
262 260
@@ -297,7 +295,7 @@ test "Segmentation Word Iterator" {
297 const this_str = all_bytes.items; 295 const this_str = all_bytes.items;
298 296
299 { 297 {
300 var iter = wb.iterator(this_str); 298 var iter = Words.iterator(this_str);
301 var peeked: ?Word = iter.peek(); 299 var peeked: ?Word = iter.peek();
302 300
303 // Check. 301 // Check.
@@ -330,7 +328,7 @@ test "Segmentation Word Iterator" {
330 } else { 328 } else {
331 try testing.expect(false); 329 try testing.expect(false);
332 } 330 }
333 var peek_iter = wb.iterateAfterWord(this_str, got_word); 331 var peek_iter = Words.iterateAfterWord(this_str, got_word);
334 const peek_1 = peek_iter.next(); 332 const peek_1 = peek_iter.next();
335 if (peek_1) |p1| { 333 if (peek_1) |p1| {
336 const peek_2 = iter.peek(); 334 const peek_2 = iter.peek();
@@ -350,7 +348,7 @@ test "Segmentation Word Iterator" {
350 try testing.expectEqual(null, iter.peek()); 348 try testing.expectEqual(null, iter.peek());
351 } 349 }
352 for (got_word.offset..got_word.offset + got_word.len) |i| { 350 for (got_word.offset..got_word.offset + got_word.len) |i| {
353 const this_word = wb.wordAtIndex(this_str, i); 351 const this_word = Words.wordAtIndex(this_str, i);
354 std.testing.expectEqualSlices( 352 std.testing.expectEqualSlices(
355 u8, 353 u8,
356 got_word.bytes(this_str), 354 got_word.bytes(this_str),
@@ -364,7 +362,7 @@ test "Segmentation Word Iterator" {
364 } 362 }
365 } 363 }
366 { 364 {
367 var r_iter = wb.reverseIterator(this_str); 365 var r_iter = Words.reverseIterator(this_str);
368 var peeked: ?Word = r_iter.peek(); 366 var peeked: ?Word = r_iter.peek();
369 var idx = want.items.len - 1; 367 var idx = want.items.len - 1;
370 368
@@ -399,7 +397,7 @@ test "Segmentation Word Iterator" {
399 } else { 397 } else {
400 try testing.expect(false); 398 try testing.expect(false);
401 } 399 }
402 var peek_iter = wb.iterateBeforeWord(this_str, got_word); 400 var peek_iter = Words.iterateBeforeWord(this_str, got_word);
403 const peek_1 = peek_iter.prev(); 401 const peek_1 = peek_iter.prev();
404 if (peek_1) |p1| { 402 if (peek_1) |p1| {
405 const peek_2 = r_iter.peek(); 403 const peek_2 = r_iter.peek();
@@ -419,7 +417,7 @@ test "Segmentation Word Iterator" {
419 try testing.expectEqual(null, r_iter.peek()); 417 try testing.expectEqual(null, r_iter.peek());
420 } 418 }
421 for (got_word.offset..got_word.offset + got_word.len) |i| { 419 for (got_word.offset..got_word.offset + got_word.len) |i| {
422 const this_word = wb.wordAtIndex(this_str, i); 420 const this_word = Words.wordAtIndex(this_str, i);
423 std.testing.expectEqualSlices( 421 std.testing.expectEqualSlices(
424 u8, 422 u8,
425 got_word.bytes(this_str), 423 got_word.bytes(this_str),