summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-16 12:06:36 -0400
committerGravatar Sam Atman2025-05-16 12:06:36 -0400
commitaa20bebade8eeb3ca75199dc252feb3edb203fb1 (patch)
tree2e832616bbf554ca3a20588d050c0dc764f4cf65 /src
parentMove WordBreak to Words (diff)
downloadzg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.gz
zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.xz
zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.zip
Words module
In keeping with the new nomenclature, we're calling the module "Words", not "WordBreak". The latter is Unicode jargon; the module provides word iterators. Words are the figure, word breaks are the ground.
Diffstat (limited to 'src')
-rw-r--r--src/Words.zig42
-rw-r--r--src/unicode_tests.zig6
2 files changed, 24 insertions, 24 deletions
diff --git a/src/Words.zig b/src/Words.zig
index 6a532f5..565a2fb 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) {
25s1: []u16 = undefined, 25s1: []u16 = undefined,
26s2: []u5 = undefined, 26s2: []u5 = undefined,
27 27
28const WordBreak = @This(); 28const Words = @This();
29 29
30pub fn init(allocator: Allocator) Allocator.Error!WordBreak { 30pub fn init(allocator: Allocator) Allocator.Error!Words {
31 var wb: WordBreak = undefined; 31 var wb: Words = undefined;
32 try wb.setup(allocator); 32 try wb.setup(allocator);
33 return wb; 33 return wb;
34} 34}
35 35
36pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { 36pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
37 wb.setupImpl(allocator) catch |err| { 37 wb.setupImpl(allocator) catch |err| {
38 switch (err) { 38 switch (err) {
39 error.OutOfMemory => |e| return e, 39 error.OutOfMemory => |e| return e,
@@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
42 }; 42 };
43} 43}
44 44
45pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { 45pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void {
46 allocator.free(wordbreak.s1); 46 allocator.free(wordbreak.s1);
47 allocator.free(wordbreak.s2); 47 allocator.free(wordbreak.s2);
48} 48}
@@ -60,19 +60,19 @@ pub const Word = struct {
60}; 60};
61 61
62/// Returns the word break property type for `cp`. 62/// Returns the word break property type for `cp`.
63pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { 63pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty {
64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 64 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65} 65}
66 66
67/// Convenience function for working with CodePoints 67/// Convenience function for working with CodePoints
68fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { 68fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty {
69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); 69 return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
70} 70}
71 71
72/// Returns the Word at the given index. Asserts that the index is less than 72/// Returns the Word at the given index. Asserts that the index is less than
73/// `string.len`, and that the string is not empty. Always returns a word. 73/// `string.len`, and that the string is not empty. Always returns a word.
74/// The index does not have to be the start of a codepoint in the word. 74/// The index does not have to be the start of a codepoint in the word.
75pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { 75pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word {
76 assert(index < string.len and string.len > 0); 76 assert(index < string.len and string.len > 0);
77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); 77 var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
78 const first_back = iter_back.prev(); 78 const first_back = iter_back.prev();
@@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize
118} 118}
119 119
120/// Returns an iterator over words in `slice`. 120/// Returns an iterator over words in `slice`.
121pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { 121pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator {
122 return Iterator.init(wordbreak, slice); 122 return Iterator.init(wordbreak, slice);
123} 123}
124 124
125/// Returns a reverse iterator over the words in `slice`. 125/// Returns a reverse iterator over the words in `slice`.
126pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { 126pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator {
127 return ReverseIterator.init(wordbreak, slice); 127 return ReverseIterator.init(wordbreak, slice);
128} 128}
129 129
@@ -132,10 +132,10 @@ pub const Iterator = struct {
132 this: ?CodePoint = null, 132 this: ?CodePoint = null,
133 that: ?CodePoint = null, 133 that: ?CodePoint = null,
134 cp_iter: CodepointIterator, 134 cp_iter: CodepointIterator,
135 wb: *const WordBreak, 135 wb: *const Words,
136 136
137 /// Assumes `str` is valid UTF-8. 137 /// Assumes `str` is valid UTF-8.
138 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 138 pub fn init(wb: *const Words, str: []const u8) Iterator {
139 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; 139 var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
140 wb_iter.advance(); 140 wb_iter.advance();
141 return wb_iter; 141 return wb_iter;
@@ -314,11 +314,11 @@ pub const ReverseIterator = struct {
314 after: ?CodePoint = null, 314 after: ?CodePoint = null,
315 before: ?CodePoint = null, 315 before: ?CodePoint = null,
316 cp_iter: ReverseCodepointIterator, 316 cp_iter: ReverseCodepointIterator,
317 wb: *const WordBreak, 317 wb: *const Words,
318 flags: usize = 0, 318 flags: usize = 0,
319 319
320 /// Assumes `str` is valid UTF-8. 320 /// Assumes `str` is valid UTF-8.
321 pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { 321 pub fn init(wb: *const Words, str: []const u8) ReverseIterator {
322 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; 322 var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
323 wb_iter.advance(); 323 wb_iter.advance();
324 return wb_iter; 324 return wb_iter;
@@ -511,7 +511,7 @@ pub const ReverseIterator = struct {
511//| Implementation Details 511//| Implementation Details
512 512
513/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. 513/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
514fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { 514fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator {
515 var idx: u32 = @intCast(index); 515 var idx: u32 = @intCast(index);
516 // Find the next lead byte: 516 // Find the next lead byte:
517 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} 517 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
@@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator {
536 536
537const SneakIterator = struct { 537const SneakIterator = struct {
538 cp_iter: ReverseCodepointIterator, 538 cp_iter: ReverseCodepointIterator,
539 wb: *const WordBreak, 539 wb: *const Words,
540 540
541 fn peek(iter: *SneakIterator) ?CodePoint { 541 fn peek(iter: *SneakIterator) ?CodePoint {
542 const save_cp = iter.cp_iter; 542 const save_cp = iter.cp_iter;
@@ -570,7 +570,7 @@ const SneakIterator = struct {
570 } 570 }
571}; 571};
572 572
573inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { 573inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
574 const decompressor = compress.flate.inflate.decompressor; 574 const decompressor = compress.flate.inflate.decompressor;
575 const in_bytes = @embedFile("wbp"); 575 const in_bytes = @embedFile("wbp");
576 var in_fbs = std.io.fixedBufferStream(in_bytes); 576 var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool {
627} 627}
628 628
629test "Word Break Properties" { 629test "Word Break Properties" {
630 const wb = try WordBreak.init(testing.allocator); 630 const wb = try Words.init(testing.allocator);
631 defer wb.deinit(testing.allocator); 631 defer wb.deinit(testing.allocator);
632 try testing.expectEqual(.CR, wb.breakProperty('\r')); 632 try testing.expectEqual(.CR, wb.breakProperty('\r'));
633 try testing.expectEqual(.LF, wb.breakProperty('\n')); 633 try testing.expectEqual(.LF, wb.breakProperty('\n'));
@@ -641,7 +641,7 @@ test "ext_pict" {
641} 641}
642 642
643test wordAtIndex { 643test wordAtIndex {
644 const wb = try WordBreak.init(testing.allocator); 644 const wb = try Words.init(testing.allocator);
645 defer wb.deinit(testing.allocator); 645 defer wb.deinit(testing.allocator);
646 const t_string = "first second third"; 646 const t_string = "first second third";
647 const second = wb.wordAtIndex(t_string, 8); 647 const second = wb.wordAtIndex(t_string, 8);
@@ -663,7 +663,7 @@ test wordAtIndex {
663const testr = "don't a:ka fin!"; 663const testr = "don't a:ka fin!";
664 664
665test "reversal" { 665test "reversal" {
666 const wb = try WordBreak.init(testing.allocator); 666 const wb = try Words.init(testing.allocator);
667 defer wb.deinit(testing.allocator); 667 defer wb.deinit(testing.allocator);
668 { 668 {
669 var fwd = wb.iterator(testr); 669 var fwd = wb.iterator(testr);
@@ -696,7 +696,7 @@ test "reversal" {
696} 696}
697 697
698fn testAllocations(allocator: Allocator) !void { 698fn testAllocations(allocator: Allocator) !void {
699 const wb = try WordBreak.init(allocator); 699 const wb = try Words.init(allocator);
700 wb.deinit(allocator); 700 wb.deinit(allocator);
701} 701}
702 702
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 7139d4c..18f1814 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -211,7 +211,7 @@ test "Segmentation Word Iterator" {
211 var buf_reader = std.io.bufferedReader(file.reader()); 211 var buf_reader = std.io.bufferedReader(file.reader());
212 var input_stream = buf_reader.reader(); 212 var input_stream = buf_reader.reader();
213 213
214 const wb = try WordBreak.init(allocator); 214 const wb = try Words.init(allocator);
215 defer wb.deinit(allocator); 215 defer wb.deinit(allocator);
216 216
217 var buf: [4096]u8 = undefined; 217 var buf: [4096]u8 = undefined;
@@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes");
392const GraphemeIterator = @import("Graphemes").Iterator; 392const GraphemeIterator = @import("Graphemes").Iterator;
393const Normalize = @import("Normalize"); 393const Normalize = @import("Normalize");
394 394
395const WordBreak = @import("WordBreak"); 395const Words = @import("Words");
396const Word = WordBreak.Word; 396const Word = Words.Word;