Words module

In keeping with the new nomenclature, we're calling the module "Words", not "WordBreak". The latter is Unicode jargon; the module provides word iterators. Words are the figure; word breaks are the ground.
author: Sam Atman 2025-05-16 12:06:36 -0400
committer: Sam Atman 2025-05-16 12:06:36 -0400
commit: aa20bebade8eeb3ca75199dc252feb3edb203fb1 (patch)
tree: 2e832616bbf554ca3a20588d050c0dc764f4cf65 /src
parent: Move WordBreak to Words (diff)
download: zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.gz
zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.xz
zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.zip
2 files changed, 24 insertions, 24 deletions
diff --git a/src/Words.zig b/src/Words.zig
index 6a532f5..565a2fb 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) {
 s1: []u16 = undefined,
 s2: []u5 = undefined,
-const WordBreak = @This();
+const Words = @This();
-pub fn init(allocator: Allocator) Allocator.Error!WordBreak {
+pub fn init(allocator: Allocator) Allocator.Error!Words {
-    var wb: WordBreak = undefined;
+    var wb: Words = undefined;
    try wb.setup(allocator);
    return wb;
 }
-pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
+pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
    wb.setupImpl(allocator) catch |err| {
        switch (err) {
            error.OutOfMemory => |e| return e,
@@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
    };
 }
-pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
+pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void {
    allocator.free(wordbreak.s1);
    allocator.free(wordbreak.s2);
 }
@@ -60,19 +60,19 @@ pub const Word = struct {
 };
 /// Returns the word break property type for `cp`.
-pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
+pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty {
    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
 /// Convenience function for working with CodePoints
-fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {
+fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty {
    return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
 }
 /// Returns the Word at the given index.  Asserts that the index is less than
 /// `string.len`, and that the string is not empty. Always returns a word.
 /// The index does not have to be the start of a codepoint in the word.
-pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {
+pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word {
    assert(index < string.len and string.len > 0);
    var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
    const first_back = iter_back.prev();
@@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize
 }
 /// Returns an iterator over words in `slice`.
-pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
+pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator {
    return Iterator.init(wordbreak, slice);
 }
 /// Returns a reverse iterator over the words in `slice`.
-pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {
+pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator {
    return ReverseIterator.init(wordbreak, slice);
 }
@@ -132,10 +132,10 @@ pub const Iterator = struct {
    this: ?CodePoint = null,
    that: ?CodePoint = null,
    cp_iter: CodepointIterator,
-    wb: *const WordBreak,
+    wb: *const Words,
    /// Assumes `str` is valid UTF-8.
-    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
+    pub fn init(wb: *const Words, str: []const u8) Iterator {
        var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
        wb_iter.advance();
        return wb_iter;
@@ -314,11 +314,11 @@ pub const ReverseIterator = struct {
    after: ?CodePoint = null,
    before: ?CodePoint = null,
    cp_iter: ReverseCodepointIterator,
-    wb: *const WordBreak,
+    wb: *const Words,
    flags: usize = 0,
    /// Assumes `str` is valid UTF-8.
-    pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {
+    pub fn init(wb: *const Words, str: []const u8) ReverseIterator {
        var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
        wb_iter.advance();
        return wb_iter;
@@ -511,7 +511,7 @@ pub const ReverseIterator = struct {
 //| Implementation Details
 /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
-fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {
+fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator {
    var idx: u32 = @intCast(index);
    // Find the next lead byte:
    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
@@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator {
 const SneakIterator = struct {
    cp_iter: ReverseCodepointIterator,
-    wb: *const WordBreak,
+    wb: *const Words,
    fn peek(iter: *SneakIterator) ?CodePoint {
        const save_cp = iter.cp_iter;
@@ -570,7 +570,7 @@ const SneakIterator = struct {
    }
 };
-inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
+inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("wbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool {
 }
 test "Word Break Properties" {
-    const wb = try WordBreak.init(testing.allocator);
+    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    try testing.expectEqual(.CR, wb.breakProperty('\r'));
    try testing.expectEqual(.LF, wb.breakProperty('\n'));
@@ -641,7 +641,7 @@ test "ext_pict" {
 }
 test wordAtIndex {
-    const wb = try WordBreak.init(testing.allocator);
+    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    const t_string = "first second third";
    const second = wb.wordAtIndex(t_string, 8);
@@ -663,7 +663,7 @@ test wordAtIndex {
 const testr = "don't a:ka fin!";
 test "reversal" {
-    const wb = try WordBreak.init(testing.allocator);
+    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    {
        var fwd = wb.iterator(testr);
@@ -696,7 +696,7 @@ test "reversal" {
 }
 fn testAllocations(allocator: Allocator) !void {
-    const wb = try WordBreak.init(allocator);
+    const wb = try Words.init(allocator);
    wb.deinit(allocator);
 }
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 7139d4c..18f1814 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -211,7 +211,7 @@ test "Segmentation Word Iterator" {
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();
-    const wb = try WordBreak.init(allocator);
+    const wb = try Words.init(allocator);
    defer wb.deinit(allocator);
    var buf: [4096]u8 = undefined;
@@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes");
 const GraphemeIterator = @import("Graphemes").Iterator;
 const Normalize = @import("Normalize");
-const WordBreak = @import("WordBreak");
+const Words = @import("Words");
-const Word = WordBreak.Word;
+const Word = Words.Word;
author	Sam Atman	2025-05-16 12:06:36 -0400
committer	Sam Atman	2025-05-16 12:06:36 -0400
commit	aa20bebade8eeb3ca75199dc252feb3edb203fb1 (patch)
tree	2e832616bbf554ca3a20588d050c0dc764f4cf65 /src
parent	Move WordBreak to Words (diff)
download	zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.gz zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.xz zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.zip

diff --git a/src/Words.zig b/src/Words.zig index 6a532f5..565a2fb 100644 --- a/src/Words.zig +++ b/src/Words.zig
@@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) {
25	s1: []u16 = undefined,	25	s1: []u16 = undefined,
26	s2: []u5 = undefined,	26	s2: []u5 = undefined,
27		27
28	const WordBreak = @This();	28	const Words = @This();
29		29
30	pub fn init(allocator: Allocator) Allocator.Error!WordBreak {	30	pub fn init(allocator: Allocator) Allocator.Error!Words {
31	var wb: WordBreak = undefined;	31	var wb: Words = undefined;
32	try wb.setup(allocator);	32	try wb.setup(allocator);
33	return wb;	33	return wb;
34	}	34	}
35		35
36	pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {	36	pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void {
37	wb.setupImpl(allocator) catch \|err\| {	37	wb.setupImpl(allocator) catch \|err\| {
38	switch (err) {	38	switch (err) {
39	error.OutOfMemory => \|e\| return e,	39	error.OutOfMemory => \|e\| return e,
@@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
42	};	42	};
43	}	43	}
44		44
45	pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {	45	pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void {
46	allocator.free(wordbreak.s1);	46	allocator.free(wordbreak.s1);
47	allocator.free(wordbreak.s2);	47	allocator.free(wordbreak.s2);
48	}	48	}
@@ -60,19 +60,19 @@ pub const Word = struct {
60	};	60	};
61		61
62	/// Returns the word break property type for `cp`.	62	/// Returns the word break property type for `cp`.
63	pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {	63	pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty {
64	return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);	64	return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
65	}	65	}
66		66
67	/// Convenience function for working with CodePoints	67	/// Convenience function for working with CodePoints
68	fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty {	68	fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty {
69	return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);	69	return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]);
70	}	70	}
71		71
72	/// Returns the Word at the given index. Asserts that the index is less than	72	/// Returns the Word at the given index. Asserts that the index is less than
73	/// `string.len`, and that the string is not empty. Always returns a word.	73	/// `string.len`, and that the string is not empty. Always returns a word.
74	/// The index does not have to be the start of a codepoint in the word.	74	/// The index does not have to be the start of a codepoint in the word.
75	pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word {	75	pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word {
76	assert(index < string.len and string.len > 0);	76	assert(index < string.len and string.len > 0);
77	var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);	77	var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index);
78	const first_back = iter_back.prev();	78	const first_back = iter_back.prev();
@@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize
118	}	118	}
119		119
120	/// Returns an iterator over words in `slice`.	120	/// Returns an iterator over words in `slice`.
121	pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {	121	pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator {
122	return Iterator.init(wordbreak, slice);	122	return Iterator.init(wordbreak, slice);
123	}	123	}
124		124
125	/// Returns a reverse iterator over the words in `slice`.	125	/// Returns a reverse iterator over the words in `slice`.
126	pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator {	126	pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator {
127	return ReverseIterator.init(wordbreak, slice);	127	return ReverseIterator.init(wordbreak, slice);
128	}	128	}
129		129
@@ -132,10 +132,10 @@ pub const Iterator = struct {
132	this: ?CodePoint = null,	132	this: ?CodePoint = null,
133	that: ?CodePoint = null,	133	that: ?CodePoint = null,
134	cp_iter: CodepointIterator,	134	cp_iter: CodepointIterator,
135	wb: *const WordBreak,	135	wb: *const Words,
136		136
137	/// Assumes `str` is valid UTF-8.	137	/// Assumes `str` is valid UTF-8.
138	pub fn init(wb: *const WordBreak, str: []const u8) Iterator {	138	pub fn init(wb: *const Words, str: []const u8) Iterator {
139	var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };	139	var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb };
140	wb_iter.advance();	140	wb_iter.advance();
141	return wb_iter;	141	return wb_iter;
@@ -314,11 +314,11 @@ pub const ReverseIterator = struct {
314	after: ?CodePoint = null,	314	after: ?CodePoint = null,
315	before: ?CodePoint = null,	315	before: ?CodePoint = null,
316	cp_iter: ReverseCodepointIterator,	316	cp_iter: ReverseCodepointIterator,
317	wb: *const WordBreak,	317	wb: *const Words,
318	flags: usize = 0,	318	flags: usize = 0,
319		319
320	/// Assumes `str` is valid UTF-8.	320	/// Assumes `str` is valid UTF-8.
321	pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator {	321	pub fn init(wb: *const Words, str: []const u8) ReverseIterator {
322	var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };	322	var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb };
323	wb_iter.advance();	323	wb_iter.advance();
324	return wb_iter;	324	return wb_iter;
@@ -511,7 +511,7 @@ pub const ReverseIterator = struct {
511	//\| Implementation Details	511	//\| Implementation Details
512		512
513	/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.	513	/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
514	fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator {	514	fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator {
515	var idx: u32 = @intCast(index);	515	var idx: u32 = @intCast(index);
516	// Find the next lead byte:	516	// Find the next lead byte:
517	while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}	517	while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
@@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator {
536		536
537	const SneakIterator = struct {	537	const SneakIterator = struct {
538	cp_iter: ReverseCodepointIterator,	538	cp_iter: ReverseCodepointIterator,
539	wb: *const WordBreak,	539	wb: *const Words,
540		540
541	fn peek(iter: *SneakIterator) ?CodePoint {	541	fn peek(iter: *SneakIterator) ?CodePoint {
542	const save_cp = iter.cp_iter;	542	const save_cp = iter.cp_iter;
@@ -570,7 +570,7 @@ const SneakIterator = struct {
570	}	570	}
571	};	571	};
572		572
573	inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {	573	inline fn setupImpl(wb: *Words, allocator: Allocator) !void {
574	const decompressor = compress.flate.inflate.decompressor;	574	const decompressor = compress.flate.inflate.decompressor;
575	const in_bytes = @embedFile("wbp");	575	const in_bytes = @embedFile("wbp");
576	var in_fbs = std.io.fixedBufferStream(in_bytes);	576	var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool {
627	}	627	}
628		628
629	test "Word Break Properties" {	629	test "Word Break Properties" {
630	const wb = try WordBreak.init(testing.allocator);	630	const wb = try Words.init(testing.allocator);
631	defer wb.deinit(testing.allocator);	631	defer wb.deinit(testing.allocator);
632	try testing.expectEqual(.CR, wb.breakProperty('\r'));	632	try testing.expectEqual(.CR, wb.breakProperty('\r'));
633	try testing.expectEqual(.LF, wb.breakProperty('\n'));	633	try testing.expectEqual(.LF, wb.breakProperty('\n'));
@@ -641,7 +641,7 @@ test "ext_pict" {
641	}	641	}
642		642
643	test wordAtIndex {	643	test wordAtIndex {
644	const wb = try WordBreak.init(testing.allocator);	644	const wb = try Words.init(testing.allocator);
645	defer wb.deinit(testing.allocator);	645	defer wb.deinit(testing.allocator);
646	const t_string = "first second third";	646	const t_string = "first second third";
647	const second = wb.wordAtIndex(t_string, 8);	647	const second = wb.wordAtIndex(t_string, 8);
@@ -663,7 +663,7 @@ test wordAtIndex {
663	const testr = "don't a:ka fin!";	663	const testr = "don't a:ka fin!";
664		664
665	test "reversal" {	665	test "reversal" {
666	const wb = try WordBreak.init(testing.allocator);	666	const wb = try Words.init(testing.allocator);
667	defer wb.deinit(testing.allocator);	667	defer wb.deinit(testing.allocator);
668	{	668	{
669	var fwd = wb.iterator(testr);	669	var fwd = wb.iterator(testr);
@@ -696,7 +696,7 @@ test "reversal" {
696	}	696	}
697		697
698	fn testAllocations(allocator: Allocator) !void {	698	fn testAllocations(allocator: Allocator) !void {
699	const wb = try WordBreak.init(allocator);	699	const wb = try Words.init(allocator);
700	wb.deinit(allocator);	700	wb.deinit(allocator);
701	}	701	}
702		702


diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7139d4c..18f1814 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig
@@ -211,7 +211,7 @@ test "Segmentation Word Iterator" {
211	var buf_reader = std.io.bufferedReader(file.reader());	211	var buf_reader = std.io.bufferedReader(file.reader());
212	var input_stream = buf_reader.reader();	212	var input_stream = buf_reader.reader();
213		213
214	const wb = try WordBreak.init(allocator);	214	const wb = try Words.init(allocator);
215	defer wb.deinit(allocator);	215	defer wb.deinit(allocator);
216		216
217	var buf: [4096]u8 = undefined;	217	var buf: [4096]u8 = undefined;
@@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes");
392	const GraphemeIterator = @import("Graphemes").Iterator;	392	const GraphemeIterator = @import("Graphemes").Iterator;
393	const Normalize = @import("Normalize");	393	const Normalize = @import("Normalize");
394		394
395	const WordBreak = @import("WordBreak");	395	const Words = @import("Words");
396	const Word = WordBreak.Word;	396	const Word = Words.Word;