summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-12 12:57:49 -0400
committerGravatar Sam Atman2025-05-15 15:31:15 -0400
commita832b90bc00503a201bf457df1f49dc338314e7b (patch)
tree041c975b63e17db81edfc6923cfafa63691e3b33
parentVastly simplify peek() (diff)
downloadzg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.gz
zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.xz
zg-a832b90bc00503a201bf457df1f49dc338314e7b.zip
Implement Word iterator
A by-the-book implmentation of the word break rules from tr29. This is superficially inefficient, but compilers are more than able to handle the common subexpression folding ignored by this approach. Now to port the WordBreakPropertyTests, and clean up the inevitable bugs in the implementation.
-rw-r--r--src/WordBreak.zig228
1 files changed, 228 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 9044740..84fd1f7 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
71 allocator.free(wordbreak.s2); 71 allocator.free(wordbreak.s2);
72} 72}
73 73
74/// Represents a Unicode word span, as an offset into the source string
75/// and the length of the word.
76pub const Word = struct {
77 offset: u32,
78 len: u32,
79
80 /// Returns a slice of the word given the source string.
81 pub fn bytes(self: Word, src: []const u8) []const u8 {
82 return src[self.offset..][0..self.len];
83 }
84};
85
74/// Returns the word break property type for `cp`. 86/// Returns the word break property type for `cp`.
75pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { 87pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
76 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
77} 89}
78 90
91const IterState = packed struct {
92 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
93 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
94 quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter
95 regional: bool, // [^RI] (RI RI)* RI × RI
96
97 pub const initial: IterState = .{
98 .mid_punct = false,
99 .mid_num = false,
100 .quote_heb = false,
101 .regional = false,
102 };
103};
104
105pub const Iterator = struct {
106 this: ?CodePoint = null,
107 that: ?CodePoint = null,
108 cache: ?WordBreakProperty = null,
109 cp_iter: CodepointIterator,
110 wb: *const WordBreak,
111
112 /// Assumes `str` is valid UTF-8.
113 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
114 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
115 wb_iter.advance();
116 return wb;
117 }
118
119 pub fn next(iter: *Iterator) ?Word {
120 iter.advance();
121
122 // Done?
123 if (iter.this == null) return null;
124 // Last?
125 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
126
127 const word_start = iter.this.?.offset;
128 var word_len: u32 = 0;
129
130 var state: IterState = .initial;
131
132 scan: while (true) : (iter.advance()) {
133 const this = iter.this.?;
134 word_len += this.len;
135 if (iter.that) |that| {
136 const that_p = iter.wb.breakProperty(that.code);
137 const this_p = this_p: {
138 if (!isIgnorable(that_p) and iter.cache != null) {
139 defer iter.cache = null;
140 break :this_p iter.cache.?;
141 } else {
142 break :this_p iter.wb.breakProperty(this.code);
143 }
144 };
145 // WB3 CR × LF
146 if (this_p == .CR and that_p == .LF) continue :scan;
147 // WB3a (Newline | CR | LF) ÷
148 if (isNewline(this_p)) break :scan;
149 // WB3b ÷ (Newline | CR | LF)
150 if (isNewline(that_p)) break :scan;
151 // WB3c ZWJ × \p{Extended_Pictographic}
152 // The right way to do this one is a RuneSet, TODO: circle back
153 // WB3d WSegSpace × WSegSpace
154 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
155 // WB4 X (Extend | Format | ZWJ)* → X
156 if (isIgnorable(that_p)) {
157 if (iter.cache == null) {
158 iter.cache = this_p;
159 }
160 continue :scan;
161 }
162 if (isAHLetter(this_p)) {
163 // WB5 AHLetter × AHLetter
164 if (isAHLetter(that_p)) continue :scan;
165 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
166 if (isMidVal(that_p)) {
167 const next_val = iter.cp_iter.peek();
168 if (next_val) |next_cp| {
169 const next_p = iter.wb.breakProperty(next_cp.code);
170 if (isAHLetter(next_p)) {
171 state.mid_punct = true;
172 continue :scan;
173 }
174 } else break :scan;
175 }
176 }
177 // AHLetter (MidLetter | MidNumLetQ) × AHLetter
178 if (state.mid_punct) {
179 // Should always be true:
180 assert(isMidVal(this_p));
181 assert(isAHLetter(that_p));
182 state.mid_punct = false;
183 continue :scan;
184 }
185 if (this_p == .Hebrew_Letter) {
186 // WB7a Hebrew_Letter × Single_Quote
187 if (that_p == .Single_Quote) continue :scan;
188 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
189 if (that_p == .Double_Quote) {
190 const next_val = iter.cp_iter.peek();
191 if (next_val) |next_cp| {
192 const next_p = iter.wb.breakProperty(next_cp.code);
193 if (next_p == .Hebrew_Letter) {
194 state.quote_heb = true;
195 continue :scan;
196 }
197 } else break :scan;
198 }
199 }
200 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
201 if (state.quote_heb) {
202 // Should always be true:
203 assert(this_p == .Double_Quote);
204 assert(that_p == .Hebrew_Letter);
205 state.quote_heb = false;
206 continue :scan;
207 }
208 // WB8 Numeric × Numeric
209 if (this_p == .Numeric and that_p == .Numeric) continue :scan;
210 // WB9 AHLetter × Numeric
211 if (isAHLetter(this_p) and that_p == .Numeric) continue :scan;
212 // WB10 Numeric × AHLetter
213 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
214 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
215 if (this_p == .Numeric and isMidVal(that_p)) {
216 const next_val = iter.cp_iter.peek();
217 if (next_val) |next_cp| {
218 const next_p = iter.wb.breakProperty(next_cp.code);
219 if (next_p == .Numeric) {
220 state.mid_num = true;
221 continue :scan;
222 }
223 } else break :scan;
224 }
225 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
226 if (state.mid_num) {
227 assert(isMidVal(this_p));
228 assert(that_p == .Numeric);
229 state.mid_num = false;
230 continue :scan;
231 }
232 // WB13 Katakana × Katakana
233 if (this_p == .Katakana and that_p == .Katakana) continue :scan;
234 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
235 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
236 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
237 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
238 // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI
239 if (that_p == .Regional_Indicator) {
240 if (this_p == .Regional_Indicator) {
241 if (state.regional) {
242 state.regional = false;
243 continue :scan;
244 } else {
245 break :scan;
246 }
247 } else {
248 const next_val = iter.cp_iter.peek();
249 if (next_val) |next_cp| {
250 const next_p = iter.wb.breakProperty(next_cp.code);
251 if (next_p == .Regional_Indicator) {
252 state.regional = true;
253 continue :scan;
254 }
255 } else break :scan;
256 }
257 }
258 // WB999 Any ÷ Any
259 break :scan;
260 } else { // iter.that == null
261 break :scan;
262 }
263 }
264
265 return Word{ .len = word_len, .offset = word_start };
266 }
267
268 fn advance(wb_iter: *Iterator) void {
269 wb_iter.this = wb_iter.that;
270 wb_iter.that = wb_iter.cp_iter.next();
271 }
272};
273
274//| Predicates
275
276inline fn isNewline(wbp: WordBreakProperty) bool {
277 return wbp == .CR or wbp == .LF or wbp == .Newline;
278}
279
280inline fn isIgnorable(wbp: WordBreakProperty) bool {
281 return switch (wbp) {
282 .Format, .Extend, .ZWJ => true,
283 else => false,
284 };
285}
286
287inline fn isAHLetter(wbp: WordBreakProperty) bool {
288 return wbp == .ALetter or wbp == .Hebrew_Letter;
289}
290
291inline fn isMidVal(wbp: WordBreakProperty) bool {
292 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
293}
294
295inline fn isExtensible(wbp: WordBreakProperty) bool {
296 return switch (wbp) {
297 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
298 else => false,
299 };
300}
301
79test "Word Break Properties" { 302test "Word Break Properties" {
80 const wb = try WordBreak.init(testing.allocator); 303 const wb = try WordBreak.init(testing.allocator);
81 defer wb.deinit(testing.allocator); 304 defer wb.deinit(testing.allocator);
@@ -99,4 +322,9 @@ const builtin = @import("builtin");
99const compress = std.compress; 322const compress = std.compress;
100const mem = std.mem; 323const mem = std.mem;
101const Allocator = mem.Allocator; 324const Allocator = mem.Allocator;
325const assert = std.debug.assert;
102const testing = std.testing; 326const testing = std.testing;
327
328const code_point = @import("code_point");
329const CodepointIterator = code_point.Iterator;
330const CodePoint = code_point.CodePoint;