1 files changed, 228 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 9044740..84fd1f7 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
    allocator.free(wordbreak.s2);
 }
+/// Represents a Unicode word span, as an offset into the source string
+/// and the length of the word.
+pub const Word = struct {
+    offset: u32,
+    len: u32,
+    /// Returns a slice of the word given the source string.
+    pub fn bytes(self: Word, src: []const u8) []const u8 {
+        return src[self.offset..][0..self.len];
+    }
+};
 /// Returns the word break property type for `cp`.
 pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
    return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
 }
+const IterState = packed struct {
+    mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
+    mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
+    quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
+    regional: bool, // [^RI] (RI RI)* RI × RI
+    pub const initial: IterState = .{
+        .mid_punct = false,
+        .mid_num = false,
+        .quote_heb = false,
+        .regional = false,
+    };
+};
+pub const Iterator = struct {
+    this: ?CodePoint = null,
+    that: ?CodePoint = null,
+    cache: ?WordBreakProperty = null,
+    cp_iter: CodepointIterator,
+    wb: *const WordBreak,
+    /// Assumes `str` is valid UTF-8.
+    pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
+        var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
+        wb_iter.advance();
+        return wb;
+    }
+    pub fn next(iter: *Iterator) ?Word {
+        iter.advance();
+        // Done?
+        if (iter.this == null) return null;
+        // Last?
+        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
+        const word_start = iter.this.?.offset;
+        var word_len: u32 = 0;
+        var state: IterState = .initial;
+        scan: while (true) : (iter.advance()) {
+            const this = iter.this.?;
+            word_len += this.len;
+            if (iter.that) |that| {
+                const that_p = iter.wb.breakProperty(that.code);
+                const this_p = this_p: {
+                    if (!isIgnorable(that_p) and iter.cache != null) {
+                        defer iter.cache = null;
+                        break :this_p iter.cache.?;
+                    } else {
+                        break :this_p iter.wb.breakProperty(this.code);
+                    }
+                };
+                // WB3  CR × LF
+                if (this_p == .CR and that_p == .LF) continue :scan;
+                // WB3a  (Newline | CR | LF) ÷
+                if (isNewline(this_p)) break :scan;
+                // WB3b  ÷ (Newline | CR | LF)
+                if (isNewline(that_p)) break :scan;
+                // WB3c  ZWJ × \p{Extended_Pictographic}
+                // The right way to do this one is a RuneSet, TODO: circle back
+                // WB3d  WSegSpace × WSegSpace
+                if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
+                // WB4  X (Extend | Format | ZWJ)* → X
+                if (isIgnorable(that_p)) {
+                    if (iter.cache == null) {
+                        iter.cache = this_p;
+                    }
+                    continue :scan;
+                }
+                if (isAHLetter(this_p)) {
+                    // WB5  AHLetter × AHLetter
+                    if (isAHLetter(that_p)) continue :scan;
+                    // WB6  AHLetter × (MidLetter | MidNumLetQ) AHLetter
+                    if (isMidVal(that_p)) {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (isAHLetter(next_p)) {
+                                state.mid_punct = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // AHLetter (MidLetter | MidNumLetQ) × AHLetter
+                if (state.mid_punct) {
+                    // Should always be true:
+                    assert(isMidVal(this_p));
+                    assert(isAHLetter(that_p));
+                    state.mid_punct = false;
+                    continue :scan;
+                }
+                if (this_p == .Hebrew_Letter) {
+                    // WB7a  Hebrew_Letter × Single_Quote
+                    if (that_p == .Single_Quote) continue :scan;
+                    // WB7b  Hebrew_Letter × Double_Quote Hebrew_Letter
+                    if (that_p == .Double_Quote) {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (next_p == .Hebrew_Letter) {
+                                state.quote_heb = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // WB7c  Hebrew_Letter Double_Quote × Hebrew_Letter
+                if (state.quote_heb) {
+                    // Should always be true:
+                    assert(this_p == .Double_Quote);
+                    assert(that_p == .Hebrew_Letter);
+                    state.quote_heb = false;
+                    continue :scan;
+                }
+                // WB8  Numeric × Numeric
+                if (this_p == .Numeric and that_p == .Numeric) continue :scan;
+                // WB9  AHLetter × Numeric
+                if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
+                // WB10  Numeric ×  AHLetter
+                if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
+                // WB12  Numeric × (MidNum | MidNumLetQ) Numeric
+                if (this_p == .Numeric and isMidVal(that_p)) {
+                    const next_val = iter.cp_iter.peek();
+                    if (next_val) |next_cp| {
+                        const next_p = iter.wb.breakProperty(next_cp.code);
+                        if (next_p == .Numeric) {
+                            state.mid_num = true;
+                            continue :scan;
+                        }
+                    } else break :scan;
+                }
+                // WB11  Numeric (MidNum | MidNumLetQ) × Numeric
+                if (state.mid_num) {
+                    assert(isMidVal(this_p));
+                    assert(that_p == .Numeric);
+                    state.mid_num = false;
+                    continue :scan;
+                }
+                // WB13  Katakana × Katakana
+                if (this_p == .Katakana and that_p == .Katakana) continue :scan;
+                // WB13a  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+                if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
+                // WB13b  ExtendNumLet × (AHLetter | Numeric | Katakana)
+                if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
+                // WB15, WB16  ([^RI] ! sot) (RI RI)* RI × RI
+                if (that_p == .Regional_Indicator) {
+                    if (this_p == .Regional_Indicator) {
+                        if (state.regional) {
+                            state.regional = false;
+                            continue :scan;
+                        } else {
+                            break :scan;
+                        }
+                    } else {
+                        const next_val = iter.cp_iter.peek();
+                        if (next_val) |next_cp| {
+                            const next_p = iter.wb.breakProperty(next_cp.code);
+                            if (next_p == .Regional_Indicator) {
+                                state.regional = true;
+                                continue :scan;
+                            }
+                        } else break :scan;
+                    }
+                }
+                // WB999  Any ÷ Any
+                break :scan;
+            } else { // iter.that == null
+                break :scan;
+            }
+        }
+        return Word{ .len = word_len, .offset = word_start };
+    }
+    fn advance(wb_iter: *Iterator) void {
+        wb_iter.this = wb_iter.that;
+        wb_iter.that = wb_iter.cp_iter.next();
+    }
+};
+//| Predicates
+inline fn isNewline(wbp: WordBreakProperty) bool {
+    return wbp == .CR or wbp == .LF or wbp == .Newline;
+}
+inline fn isIgnorable(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .Format, .Extend, .ZWJ => true,
+        else => false,
+    };
+}
+inline fn isAHLetter(wbp: WordBreakProperty) bool {
+    return wbp == .ALetter or wbp == .Hebrew_Letter;
+}
+inline fn isMidVal(wbp: WordBreakProperty) bool {
+    return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
+}
+inline fn isExtensible(wbp: WordBreakProperty) bool {
+    return switch (wbp) {
+        .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
+        else => false,
+    };
+}
 test "Word Break Properties" {
    const wb = try WordBreak.init(testing.allocator);
    defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
 const compress = std.compress;
 const mem = std.mem;
 const Allocator = mem.Allocator;
+const assert = std.debug.assert;
 const testing = std.testing;
+const code_point = @import("code_point");
+const CodepointIterator = code_point.Iterator;
+const CodePoint = code_point.CodePoint;

diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 9044740..84fd1f7 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
71	allocator.free(wordbreak.s2);	71	allocator.free(wordbreak.s2);
72	}	72	}
73		73
		74	/// Represents a Unicode word span, as an offset into the source string
		75	/// and the length of the word.
		76	pub const Word = struct {
		77	offset: u32,
		78	len: u32,
		79
		80	/// Returns a slice of the word given the source string.
		81	pub fn bytes(self: Word, src: []const u8) []const u8 {
		82	return src[self.offset..][0..self.len];
		83	}
		84	};
		85
74	/// Returns the word break property type for `cp`.	86	/// Returns the word break property type for `cp`.
75	pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {	87	pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
76	return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);	88	return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
77	}	89	}
78		90
		91	const IterState = packed struct {
		92	mid_punct: bool, // AHLetter (MidLetter \| MidNumLetQ) × AHLetter
		93	mid_num: bool, // Numeric (MidNum \| MidNumLetQ) × Numeric
		94	quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
		95	regional: bool, // [^RI] (RI RI)* RI × RI
		96
		97	pub const initial: IterState = .{
		98	.mid_punct = false,
		99	.mid_num = false,
		100	.quote_heb = false,
		101	.regional = false,
		102	};
		103	};
		104
		105	pub const Iterator = struct {
		106	this: ?CodePoint = null,
		107	that: ?CodePoint = null,
		108	cache: ?WordBreakProperty = null,
		109	cp_iter: CodepointIterator,
		110	wb: *const WordBreak,
		111
		112	/// Assumes `str` is valid UTF-8.
		113	pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
		114	var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
		115	wb_iter.advance();
		116	return wb;
		117	}
		118
		119	pub fn next(iter: *Iterator) ?Word {
		120	iter.advance();
		121
		122	// Done?
		123	if (iter.this == null) return null;
		124	// Last?
		125	if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
		126
		127	const word_start = iter.this.?.offset;
		128	var word_len: u32 = 0;
		129
		130	var state: IterState = .initial;
		131
		132	scan: while (true) : (iter.advance()) {
		133	const this = iter.this.?;
		134	word_len += this.len;
		135	if (iter.that) \|that\| {
		136	const that_p = iter.wb.breakProperty(that.code);
		137	const this_p = this_p: {
		138	if (!isIgnorable(that_p) and iter.cache != null) {
		139	defer iter.cache = null;
		140	break :this_p iter.cache.?;
		141	} else {
		142	break :this_p iter.wb.breakProperty(this.code);
		143	}
		144	};
		145	// WB3 CR × LF
		146	if (this_p == .CR and that_p == .LF) continue :scan;
		147	// WB3a (Newline \| CR \| LF) ÷
		148	if (isNewline(this_p)) break :scan;
		149	// WB3b ÷ (Newline \| CR \| LF)
		150	if (isNewline(that_p)) break :scan;
		151	// WB3c ZWJ × \p{Extended_Pictographic}
		152	// The right way to do this one is a RuneSet, TODO: circle back
		153	// WB3d WSegSpace × WSegSpace
		154	if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
		155	// WB4 X (Extend \| Format \| ZWJ)* → X
		156	if (isIgnorable(that_p)) {
		157	if (iter.cache == null) {
		158	iter.cache = this_p;
		159	}
		160	continue :scan;
		161	}
		162	if (isAHLetter(this_p)) {
		163	// WB5 AHLetter × AHLetter
		164	if (isAHLetter(that_p)) continue :scan;
		165	// WB6 AHLetter × (MidLetter \| MidNumLetQ) AHLetter
		166	if (isMidVal(that_p)) {
		167	const next_val = iter.cp_iter.peek();
		168	if (next_val) \|next_cp\| {
		169	const next_p = iter.wb.breakProperty(next_cp.code);
		170	if (isAHLetter(next_p)) {
		171	state.mid_punct = true;
		172	continue :scan;
		173	}
		174	} else break :scan;
		175	}
		176	}
		177	// AHLetter (MidLetter \| MidNumLetQ) × AHLetter
		178	if (state.mid_punct) {
		179	// Should always be true:
		180	assert(isMidVal(this_p));
		181	assert(isAHLetter(that_p));
		182	state.mid_punct = false;
		183	continue :scan;
		184	}
		185	if (this_p == .Hebrew_Letter) {
		186	// WB7a Hebrew_Letter × Single_Quote
		187	if (that_p == .Single_Quote) continue :scan;
		188	// WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
		189	if (that_p == .Double_Quote) {
		190	const next_val = iter.cp_iter.peek();
		191	if (next_val) \|next_cp\| {
		192	const next_p = iter.wb.breakProperty(next_cp.code);
		193	if (next_p == .Hebrew_Letter) {
		194	state.quote_heb = true;
		195	continue :scan;
		196	}
		197	} else break :scan;
		198	}
		199	}
		200	// WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
		201	if (state.quote_heb) {
		202	// Should always be true:
		203	assert(this_p == .Double_Quote);
		204	assert(that_p == .Hebrew_Letter);
		205	state.quote_heb = false;
		206	continue :scan;
		207	}
		208	// WB8 Numeric × Numeric
		209	if (this_p == .Numeric and that_p == .Numeric) continue :scan;
		210	// WB9 AHLetter × Numeric
		211	if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
		212	// WB10 Numeric × AHLetter
		213	if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
		214	// WB12 Numeric × (MidNum \| MidNumLetQ) Numeric
		215	if (this_p == .Numeric and isMidVal(that_p)) {
		216	const next_val = iter.cp_iter.peek();
		217	if (next_val) \|next_cp\| {
		218	const next_p = iter.wb.breakProperty(next_cp.code);
		219	if (next_p == .Numeric) {
		220	state.mid_num = true;
		221	continue :scan;
		222	}
		223	} else break :scan;
		224	}
		225	// WB11 Numeric (MidNum \| MidNumLetQ) × Numeric
		226	if (state.mid_num) {
		227	assert(isMidVal(this_p));
		228	assert(that_p == .Numeric);
		229	state.mid_num = false;
		230	continue :scan;
		231	}
		232	// WB13 Katakana × Katakana
		233	if (this_p == .Katakana and that_p == .Katakana) continue :scan;
		234	// WB13a (AHLetter \| Numeric \| Katakana \| ExtendNumLet) × ExtendNumLet
		235	if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
		236	// WB13b ExtendNumLet × (AHLetter \| Numeric \| Katakana)
		237	if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
		238	// WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI
		239	if (that_p == .Regional_Indicator) {
		240	if (this_p == .Regional_Indicator) {
		241	if (state.regional) {
		242	state.regional = false;
		243	continue :scan;
		244	} else {
		245	break :scan;
		246	}
		247	} else {
		248	const next_val = iter.cp_iter.peek();
		249	if (next_val) \|next_cp\| {
		250	const next_p = iter.wb.breakProperty(next_cp.code);
		251	if (next_p == .Regional_Indicator) {
		252	state.regional = true;
		253	continue :scan;
		254	}
		255	} else break :scan;
		256	}
		257	}
		258	// WB999 Any ÷ Any
		259	break :scan;
		260	} else { // iter.that == null
		261	break :scan;
		262	}
		263	}
		264
		265	return Word{ .len = word_len, .offset = word_start };
		266	}
		267
		268	fn advance(wb_iter: *Iterator) void {
		269	wb_iter.this = wb_iter.that;
		270	wb_iter.that = wb_iter.cp_iter.next();
		271	}
		272	};
		273
		274	//\| Predicates
		275
		276	inline fn isNewline(wbp: WordBreakProperty) bool {
		277	return wbp == .CR or wbp == .LF or wbp == .Newline;
		278	}
		279
		280	inline fn isIgnorable(wbp: WordBreakProperty) bool {
		281	return switch (wbp) {
		282	.Format, .Extend, .ZWJ => true,
		283	else => false,
		284	};
		285	}
		286
		287	inline fn isAHLetter(wbp: WordBreakProperty) bool {
		288	return wbp == .ALetter or wbp == .Hebrew_Letter;
		289	}
		290
		291	inline fn isMidVal(wbp: WordBreakProperty) bool {
		292	return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
		293	}
		294
		295	inline fn isExtensible(wbp: WordBreakProperty) bool {
		296	return switch (wbp) {
		297	.ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
		298	else => false,
		299	};
		300	}
		301
79	test "Word Break Properties" {	302	test "Word Break Properties" {
80	const wb = try WordBreak.init(testing.allocator);	303	const wb = try WordBreak.init(testing.allocator);
81	defer wb.deinit(testing.allocator);	304	defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
99	const compress = std.compress;	322	const compress = std.compress;
100	const mem = std.mem;	323	const mem = std.mem;
101	const Allocator = mem.Allocator;	324	const Allocator = mem.Allocator;
		325	const assert = std.debug.assert;
102	const testing = std.testing;	326	const testing = std.testing;
		327
		328	const code_point = @import("code_point");
		329	const CodepointIterator = code_point.Iterator;
		330	const CodePoint = code_point.CodePoint;