Implement Word iterator

A by-the-book implmentation of the word break rules from tr29. This is superficially inefficient, but compilers are more than able to handle the common subexpression folding ignored by this approach. Now to port the WordBreakPropertyTests, and clean up the inevitable bugs in the implementation.
author: Sam Atman 2025-05-12 12:57:49 -0400
committer: Sam Atman 2025-05-15 15:31:15 -0400
commit: a832b90bc00503a201bf457df1f49dc338314e7b (patch)
tree: 041c975b63e17db81edfc6923cfafa63691e3b33
parent: Vastly simplify peek() (diff)
download: zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.gz
zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.xz
zg-a832b90bc00503a201bf457df1f49dc338314e7b.zip
1 files changed, 228 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 9044740..84fd1f7 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
    allocator.free(wordbreak.s2);
 }
+/// Represents a Unicode word span, as an offset into the source string
+/// and the length of the word.
+pub const Word = struct {
+    offset: u32,
+    len: u32,
+    /// Returns a slice of the word given the source string.
+    pub fn bytes(self: Word, src: []const u8) []const u8 {
+        return src[self.offset..][0..self.len];
+    }
+};
 /// Returns the word break property type for `cp`.
 pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
+const IterState = packed struct {
+    mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
+    mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
+    quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
+    regional: bool, // [^RI] (RI RI)* RI × RI
+    pub const initial: IterState = .{
+        .mid_punct = false,
+        .mid_num = false,
+        .quote_heb = false,
+        .regional = false,
+    };
+};
+pub const Iterator = struct {
+    this: ?CodePoint = null,
+    that: ?CodePoint = null,
+    cache: ?WordBreakProperty = null,
+    cp_iter: CodepointIterator,
+    wb: *const WordBreak,
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
+        var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
+        wb_iter.advance();
+        return wb;
+    }
+    pub fn next(iter: *Iterator) ?Word {
+        iter.advance();
+        // Done?
+        if (iter.this == null) return null;
+        // Last?
+        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
+        const word_start = iter.this.?.offset;
+        var word_len: u32 = 0;
+        var state: IterState = .initial;
+        scan: while (true) : (iter.advance()) {
+            const this = iter.this.?;
+            word_len += this.len;
+            if (iter.that) |that| {
+                const that_p = iter.wb.breakProperty(that.code);
+                const this_p = this_p: {
+                    if (!isIgnorable(that_p) and iter.cache != null) {
+                        defer iter.cache = null;
+                        break :this_p iter.cache.?;
+                    } else {
+                        break :this_p iter.wb.breakProperty(this.code);
+                    }
+                };
+                // WB3  CR × LF
+                if (this_p == .CR and that_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(this_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(that_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                // The right way to do this one is a RuneSet, TODO: circle back
+                // WB3d  WSegSpace × WSegSpace
+                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(that_p)) {
+                    if (iter.cache == null) {
+                        iter.cache = this_p;
+                    }
+                    continue :scan;
+                }
+                if (isAHLetter(this_p)) {
+                    // WB5  AHLetter × AHLetter
+                    if (isAHLetter(that_p)) continue :scan;
+                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                    if (isMidVal(that_p)) {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (isAHLetter(next_p)) {
+                                state.mid_punct = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (state.mid_punct) {
+                    // Should always be true:
+                    assert(isMidVal(this_p));
+                    assert(isAHLetter(that_p));
+                    state.mid_punct = false;
+                    continue :scan;
+                }
+                if (this_p == .Hebrew_Letter) {
+                    // WB7a  Hebrew_Letter × Single_Quote
+                    if (that_p == .Single_Quote) continue :scan;
+                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                    if (that_p == .Double_Quote) {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (next_p == .Hebrew_Letter) {
+                                state.quote_heb = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (state.quote_heb) {
+                    // Should always be true:
+                    assert(this_p == .Double_Quote);
+                    assert(that_p == .Hebrew_Letter);
+                    state.quote_heb = false;
+                    continue :scan;
+                }
+                // WB8  Numeric × Numeric
+                if (this_p == .Numeric and that_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (this_p == .Numeric and isMidVal(that_p)) {
+                    const next_val = iter.cp_iter.peek();
+                    if (next_val) |next_cp| {
+                        const next_p = iter.wb.breakProperty(next_cp.code);
+                        if (next_p == .Numeric) {
+                            state.mid_num = true;
+                            continue :scan;
+                        }
+                    } else break :scan;
+                }
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (state.mid_num) {
+                    assert(isMidVal(this_p));
+                    assert(that_p == .Numeric);
+                    state.mid_num = false;
+                    continue :scan;
+                }
+                // WB13  Katakana × Katakana
+                if (this_p == .Katakana and that_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
+                // WB15, WB16  ([^RI] ! sot) (RI RI)* RI × RI
+                if (that_p == .Regional_Indicator) {
+                    if (this_p == .Regional_Indicator) {
+                        if (state.regional) {
+                            state.regional = false;
+                            continue :scan;
+                        } else {
+                            break :scan;
+                        }
+                    } else {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (next_p == .Regional_Indicator) {
+                                state.regional = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            } else { // iter.that == null
+                break :scan;
+            }
+        }
+        return Word{ .len = word_len, .offset = word_start };
+    }
+    fn advance(wb_iter: *Iterator) void {
+        wb_iter.this = wb_iter.that;
+        wb_iter.that = wb_iter.cp_iter.next();
+    }
+};
+//| Predicates
+inline fn isNewline(wbp: WordBreakProperty) bool {
+    return wbp == .CR or wbp == .LF or wbp == .Newline;
+}
+inline fn isIgnorable(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .Format, .Extend, .ZWJ => true,
+        else => false,
+    };
+}
+inline fn isAHLetter(wbp: WordBreakProperty) bool {
+    return wbp == .ALetter or wbp == .Hebrew_Letter;
+}
+inline fn isMidVal(wbp: WordBreakProperty) bool {
+    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+inline fn isExtensible(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
+        else => false,
+    };
+}
 test "Word Break Properties" {
    const wb = try WordBreak.init(testing.allocator);
    defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
 const compress = std.compress;
 const mem = std.mem;
 const Allocator = mem.Allocator;
+const assert = std.debug.assert;
 const testing = std.testing;
+const code_point = @import("code_point");
+const CodepointIterator = code_point.Iterator;
+const CodePoint = code_point.CodePoint;
author	Sam Atman	2025-05-12 12:57:49 -0400
committer	Sam Atman	2025-05-15 15:31:15 -0400
commit	a832b90bc00503a201bf457df1f49dc338314e7b (patch)
tree	041c975b63e17db81edfc6923cfafa63691e3b33
parent	Vastly simplify peek() (diff)
download	zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.gz zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.xz zg-a832b90bc00503a201bf457df1f49dc338314e7b.zip

diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 9044740..84fd1f7 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
71	allocator.free(wordbreak.s2);	71	allocator.free(wordbreak.s2);
72	}	72	}
73		73
		74	/// Represents a Unicode word span, as an offset into the source string
		75	/// and the length of the word.
		76	pub const Word = struct {
		77	offset: u32,
		78	len: u32,
		79
		80	/// Returns a slice of the word given the source string.
		81	pub fn bytes(self: Word, src: []const u8) []const u8 {
		82	return src[self.offset..][0..self.len];
		83	}
		84	};
		85
74	/// Returns the word break property type for `cp`.	86	/// Returns the word break property type for `cp`.
75	pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {	87	pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
76	return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);	88	return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
77	}	89	}
78		90
		91	const IterState = packed struct {
		92	mid_punct: bool, // AHLetter (MidLetter \| MidNumLetQ) × AHLetter
		93	mid_num: bool, // Numeric (MidNum \| MidNumLetQ) × Numeric
		94	quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
		95	regional: bool, // [^RI] (RI RI)* RI × RI
		96
		97	pub const initial: IterState = .{
		98	.mid_punct = false,
		99	.mid_num = false,
		100	.quote_heb = false,
		101	.regional = false,
		102	};
		103	};
		104
		105	pub const Iterator = struct {
		106	this: ?CodePoint = null,
		107	that: ?CodePoint = null,
		108	cache: ?WordBreakProperty = null,
		109	cp_iter: CodepointIterator,
		110	wb: *const WordBreak,
		111
		112	/// Assumes `str` is valid UTF-8.
		113	pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
		114	var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
		115	wb_iter.advance();
		116	return wb;
		117	}
		118
		119	pub fn next(iter: *Iterator) ?Word {
		120	iter.advance();
		121
		122	// Done?
		123	if (iter.this == null) return null;
		124	// Last?
		125	if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
		126
		127	const word_start = iter.this.?.offset;
		128	var word_len: u32 = 0;
		129
		130	var state: IterState = .initial;
		131
		132	scan: while (true) : (iter.advance()) {
		133	const this = iter.this.?;
		134	word_len += this.len;
		135	if (iter.that) \|that\| {
		136	const that_p = iter.wb.breakProperty(that.code);
		137	const this_p = this_p: {
		138	if (!isIgnorable(that_p) and iter.cache != null) {
		139	defer iter.cache = null;
		140	break :this_p iter.cache.?;
		141	} else {
		142	break :this_p iter.wb.breakProperty(this.code);
		143	}
		144	};
		145	// WB3 CR × LF
		146	if (this_p == .CR and that_p == .LF) continue :scan;
		147	// WB3a (Newline \| CR \| LF) ÷
		148	if (isNewline(this_p)) break :scan;
		149	// WB3b ÷ (Newline \| CR \| LF)
		150	if (isNewline(that_p)) break :scan;
		151	// WB3c ZWJ × \p{Extended_Pictographic}
		152	// The right way to do this one is a RuneSet, TODO: circle back
		153	// WB3d WSegSpace × WSegSpace
		154	if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
		155	// WB4 X (Extend \| Format \| ZWJ)* → X
		156	if (isIgnorable(that_p)) {
		157	if (iter.cache == null) {
		158	iter.cache = this_p;
		159	}
		160	continue :scan;
		161	}
		162	if (isAHLetter(this_p)) {
		163	// WB5 AHLetter × AHLetter
		164	if (isAHLetter(that_p)) continue :scan;
		165	// WB6 AHLetter × (MidLetter \| MidNumLetQ) AHLetter
		166	if (isMidVal(that_p)) {
		167	const next_val = iter.cp_iter.peek();
		168	if (next_val) \|next_cp\| {
		169	const next_p = iter.wb.breakProperty(next_cp.code);
		170	if (isAHLetter(next_p)) {
		171	state.mid_punct = true;
		172	continue :scan;
		173	}
		174	} else break :scan;
		175	}
		176	}
		177	// AHLetter (MidLetter \| MidNumLetQ) × AHLetter
		178	if (state.mid_punct) {
		179	// Should always be true:
		180	assert(isMidVal(this_p));
		181	assert(isAHLetter(that_p));
		182	state.mid_punct = false;
		183	continue :scan;
		184	}
		185	if (this_p == .Hebrew_Letter) {
		186	// WB7a Hebrew_Letter × Single_Quote
		187	if (that_p == .Single_Quote) continue :scan;
		188	// WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
		189	if (that_p == .Double_Quote) {
		190	const next_val = iter.cp_iter.peek();
		191	if (next_val) \|next_cp\| {
		192	const next_p = iter.wb.breakProperty(next_cp.code);
		193	if (next_p == .Hebrew_Letter) {
		194	state.quote_heb = true;
		195	continue :scan;
		196	}
		197	} else break :scan;
		198	}
		199	}
		200	// WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
		201	if (state.quote_heb) {
		202	// Should always be true:
		203	assert(this_p == .Double_Quote);
		204	assert(that_p == .Hebrew_Letter);
		205	state.quote_heb = false;
		206	continue :scan;
		207	}
		208	// WB8 Numeric × Numeric
		209	if (this_p == .Numeric and that_p == .Numeric) continue :scan;
		210	// WB9 AHLetter × Numeric
		211	if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
		212	// WB10 Numeric × AHLetter
		213	if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
		214	// WB12 Numeric × (MidNum \| MidNumLetQ) Numeric
		215	if (this_p == .Numeric and isMidVal(that_p)) {
		216	const next_val = iter.cp_iter.peek();
		217	if (next_val) \|next_cp\| {
		218	const next_p = iter.wb.breakProperty(next_cp.code);
		219	if (next_p == .Numeric) {
		220	state.mid_num = true;
		221	continue :scan;
		222	}
		223	} else break :scan;
		224	}
		225	// WB11 Numeric (MidNum \| MidNumLetQ) × Numeric
		226	if (state.mid_num) {
		227	assert(isMidVal(this_p));
		228	assert(that_p == .Numeric);
		229	state.mid_num = false;
		230	continue :scan;
		231	}
		232	// WB13 Katakana × Katakana
		233	if (this_p == .Katakana and that_p == .Katakana) continue :scan;
		234	// WB13a (AHLetter \| Numeric \| Katakana \| ExtendNumLet) × ExtendNumLet
		235	if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
		236	// WB13b ExtendNumLet × (AHLetter \| Numeric \| Katakana)
		237	if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
		238	// WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI
		239	if (that_p == .Regional_Indicator) {
		240	if (this_p == .Regional_Indicator) {
		241	if (state.regional) {
		242	state.regional = false;
		243	continue :scan;
		244	} else {
		245	break :scan;
		246	}
		247	} else {
		248	const next_val = iter.cp_iter.peek();
		249	if (next_val) \|next_cp\| {
		250	const next_p = iter.wb.breakProperty(next_cp.code);
		251	if (next_p == .Regional_Indicator) {
		252	state.regional = true;
		253	continue :scan;
		254	}
		255	} else break :scan;
		256	}
		257	}
		258	// WB999 Any ÷ Any
		259	break :scan;
		260	} else { // iter.that == null
		261	break :scan;
		262	}
		263	}
		264
		265	return Word{ .len = word_len, .offset = word_start };
		266	}
		267
		268	fn advance(wb_iter: *Iterator) void {
		269	wb_iter.this = wb_iter.that;
		270	wb_iter.that = wb_iter.cp_iter.next();
		271	}
		272	};
		273
		274	//\| Predicates
		275
		276	inline fn isNewline(wbp: WordBreakProperty) bool {
		277	return wbp == .CR or wbp == .LF or wbp == .Newline;
		278	}
		279
		280	inline fn isIgnorable(wbp: WordBreakProperty) bool {
		281	return switch (wbp) {
		282	.Format, .Extend, .ZWJ => true,
		283	else => false,
		284	};
		285	}
		286
		287	inline fn isAHLetter(wbp: WordBreakProperty) bool {
		288	return wbp == .ALetter or wbp == .Hebrew_Letter;
		289	}
		290
		291	inline fn isMidVal(wbp: WordBreakProperty) bool {
		292	return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
		293	}
		294
		295	inline fn isExtensible(wbp: WordBreakProperty) bool {
		296	return switch (wbp) {
		297	.ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
		298	else => false,
		299	};
		300	}
		301
79	test "Word Break Properties" {	302	test "Word Break Properties" {
80	const wb = try WordBreak.init(testing.allocator);	303	const wb = try WordBreak.init(testing.allocator);
81	defer wb.deinit(testing.allocator);	304	defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
99	const compress = std.compress;	322	const compress = std.compress;
100	const mem = std.mem;	323	const mem = std.mem;
101	const Allocator = mem.Allocator;	324	const Allocator = mem.Allocator;
		325	const assert = std.debug.assert;
102	const testing = std.testing;	326	const testing = std.testing;
		327
		328	const code_point = @import("code_point");
		329	const CodepointIterator = code_point.Iterator;
		330	const CodePoint = code_point.CodePoint;