summaryrefslogtreecommitdiff
path: root/src/WordBreak.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/WordBreak.zig')
-rw-r--r--src/WordBreak.zig83
1 files changed, 57 insertions, 26 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 84fd1f7..53db76b 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); 88 return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]);
89} 89}
90 90
91/// Returns an iterator over words in `slice`
/// Thin convenience wrapper: forwards `wordbreak` and `slice` unchanged to
/// `Iterator.init` and returns its result.
92pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator {
93 return Iterator.init(wordbreak, slice);
94}
95
91const IterState = packed struct { 96const IterState = packed struct {
92 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter 97 mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter
93 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric 98 mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric
@@ -113,7 +118,7 @@ pub const Iterator = struct {
/// Builds an `Iterator` whose code point iterator reads from `str` and whose
/// break-property lookups go through `wb`. `advance()` is called once before
/// returning, so the first code point of `str` (if any) is already in `that`.
113 pub fn init(wb: *const WordBreak, str: []const u8) Iterator { 118 pub fn init(wb: *const WordBreak, str: []const u8) Iterator {
114 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; 119 var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb };
// Prime the lookahead: `advance` sets `this = that; that = cp_iter.next()`.
115 wb_iter.advance(); 120 wb_iter.advance();
// The old side returned `wb` (the `*const WordBreak` argument) although the
// declared return type is `Iterator`; the new side returns the iterator built above.
116 return wb; 121 return wb_iter;
117 } 122 }
118 123
119 pub fn next(iter: *Iterator) ?Word { 124 pub fn next(iter: *Iterator) ?Word {
@@ -132,12 +137,18 @@ pub const Iterator = struct {
132 scan: while (true) : (iter.advance()) { 137 scan: while (true) : (iter.advance()) {
133 const this = iter.this.?; 138 const this = iter.this.?;
134 word_len += this.len; 139 word_len += this.len;
140 var ignored = false;
135 if (iter.that) |that| { 141 if (iter.that) |that| {
136 const that_p = iter.wb.breakProperty(that.code); 142 const that_p = iter.wb.breakProperty(that.code);
137 const this_p = this_p: { 143 const this_p = this_p: {
138 if (!isIgnorable(that_p) and iter.cache != null) { 144 if (!isIgnorable(that_p) and iter.cache != null) {
145 // TODO: might not need these what with peekPast
146 ignored = true;
139 defer iter.cache = null; 147 defer iter.cache = null;
140 break :this_p iter.cache.?; 148 // Fixup some state, apply pre-4 rules
149 const restore = iter.cache.?;
150 if (restore == .WSegSpace) break :this_p .none;
151 break :this_p restore;
141 } else { 152 } else {
142 break :this_p iter.wb.breakProperty(this.code); 153 break :this_p iter.wb.breakProperty(this.code);
143 } 154 }
@@ -149,11 +160,22 @@ pub const Iterator = struct {
149 // WB3b ÷ (Newline | CR | LF) 160 // WB3b ÷ (Newline | CR | LF)
150 if (isNewline(that_p)) break :scan; 161 if (isNewline(that_p)) break :scan;
151 // WB3c ZWJ × \p{Extended_Pictographic} 162 // WB3c ZWJ × \p{Extended_Pictographic}
152 // The right way to do this one is a RuneSet, TODO: circle back 163 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
164 // Invalid after ignoring
165 if (ignored) break :scan else continue :scan;
166 }
153 // WB3d WSegSpace × WSegSpace 167 // WB3d WSegSpace × WSegSpace
154 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; 168 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
155 // WB4 X (Extend | Format | ZWJ)* → X 169 // WB4 X (Extend | Format | ZWJ)* → X
156 if (isIgnorable(that_p)) { 170 if (isIgnorable(that_p)) {
171 if (that_p == .ZWJ) {
172 const next_val = iter.peekPast();
173 if (next_val) |next_cp| {
174 if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
175 continue :scan;
176 }
177 }
178 }
157 if (iter.cache == null) { 179 if (iter.cache == null) {
158 iter.cache = this_p; 180 iter.cache = this_p;
159 } 181 }
@@ -164,14 +186,14 @@ pub const Iterator = struct {
164 if (isAHLetter(that_p)) continue :scan; 186 if (isAHLetter(that_p)) continue :scan;
165 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter 187 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
166 if (isMidVal(that_p)) { 188 if (isMidVal(that_p)) {
167 const next_val = iter.cp_iter.peek(); 189 const next_val = iter.peekPast();
168 if (next_val) |next_cp| { 190 if (next_val) |next_cp| {
169 const next_p = iter.wb.breakProperty(next_cp.code); 191 const next_p = iter.wb.breakProperty(next_cp.code);
170 if (isAHLetter(next_p)) { 192 if (isAHLetter(next_p)) {
171 state.mid_punct = true; 193 state.mid_punct = true;
172 continue :scan; 194 continue :scan;
173 } 195 }
174 } else break :scan; 196 }
175 } 197 }
176 } 198 }
177 // AHLetter (MidLetter | MidNumLetQ) × AHLetter 199 // AHLetter (MidLetter | MidNumLetQ) × AHLetter
@@ -187,7 +209,7 @@ pub const Iterator = struct {
187 if (that_p == .Single_Quote) continue :scan; 209 if (that_p == .Single_Quote) continue :scan;
188 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter 210 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
189 if (that_p == .Double_Quote) { 211 if (that_p == .Double_Quote) {
190 const next_val = iter.cp_iter.peek(); 212 const next_val = iter.peekPast();
191 if (next_val) |next_cp| { 213 if (next_val) |next_cp| {
192 const next_p = iter.wb.breakProperty(next_cp.code); 214 const next_p = iter.wb.breakProperty(next_cp.code);
193 if (next_p == .Hebrew_Letter) { 215 if (next_p == .Hebrew_Letter) {
@@ -212,8 +234,8 @@ pub const Iterator = struct {
212 // WB10 Numeric × AHLetter 234 // WB10 Numeric × AHLetter
213 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; 235 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan;
214 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric 236 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
215 if (this_p == .Numeric and isMidVal(that_p)) { 237 if (this_p == .Numeric and isMidNum(that_p)) {
216 const next_val = iter.cp_iter.peek(); 238 const next_val = iter.peekPast();
217 if (next_val) |next_cp| { 239 if (next_val) |next_cp| {
218 const next_p = iter.wb.breakProperty(next_cp.code); 240 const next_p = iter.wb.breakProperty(next_cp.code);
219 if (next_p == .Numeric) { 241 if (next_p == .Numeric) {
@@ -224,7 +246,7 @@ pub const Iterator = struct {
224 } 246 }
225 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric 247 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
226 if (state.mid_num) { 248 if (state.mid_num) {
227 assert(isMidVal(this_p)); 249 assert(isMidNum(this_p));
228 assert(that_p == .Numeric); 250 assert(that_p == .Numeric);
229 state.mid_num = false; 251 state.mid_num = false;
230 continue :scan; 252 continue :scan;
@@ -235,25 +257,18 @@ pub const Iterator = struct {
235 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; 257 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan;
236 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) 258 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
237 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; 259 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
238 // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI 260 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
239 if (that_p == .Regional_Indicator) { 261 if (this_p == .Regional_Indicator) {
240 if (this_p == .Regional_Indicator) { 262 if (that_p == .Regional_Indicator) {
241 if (state.regional) { 263 if (state.regional == true or this.offset == 0) {
242 state.regional = false; 264 state.regional = false;
243 continue :scan; 265 continue :scan;
244 } else {
245 break :scan;
246 } 266 }
247 } else { 267 } else {
248 const next_val = iter.cp_iter.peek(); 268 state.regional = true;
249 if (next_val) |next_cp| {
250 const next_p = iter.wb.breakProperty(next_cp.code);
251 if (next_p == .Regional_Indicator) {
252 state.regional = true;
253 continue :scan;
254 }
255 } else break :scan;
256 } 269 }
270 } else if (that_p == .Regional_Indicator) {
271 state.regional = true;
257 } 272 }
258 // WB999 Any ÷ Any 273 // WB999 Any ÷ Any
259 break :scan; 274 break :scan;
@@ -265,9 +280,19 @@ pub const Iterator = struct {
265 return Word{ .len = word_len, .offset = word_start }; 280 return Word{ .len = word_len, .offset = word_start };
266 } 281 }
267 282
/// Slides the two-code-point window forward by one: `this` takes the old
/// `that`, and `that` takes the next code point from `cp_iter` (whatever
/// `cp_iter.next()` yields, including its end-of-input value).
/// The only difference between the two sides is the receiver name
/// (`wb_iter` → `iter`).
268 fn advance(wb_iter: *Iterator) void { 283 fn advance(iter: *Iterator) void {
269 wb_iter.this = wb_iter.that; 284 iter.this = iter.that;
270 wb_iter.that = wb_iter.cp_iter.next(); 285 iter.that = iter.cp_iter.next();
286 }
287
/// Looks ahead in `cp_iter` for the first pending code point whose word-break
/// property is not ignorable and returns it, or returns null if the input
/// runs out first. `cp_iter` is put back to its entry state before returning,
/// so the call does not consume anything from the caller's point of view.
/// NOTE(review): `isIgnorable` is defined outside this hunk; presumably it
/// matches Extend | Format | ZWJ as in the WB4 comment — confirm.
288 fn peekPast(iter: *Iterator) ?CodePoint {
// Snapshot the code point iterator and restore it on every exit path.
289 const save_cp = iter.cp_iter;
290 defer iter.cp_iter = save_cp;
291 while (iter.cp_iter.peek()) |peeked| {
292 if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked;
// Ignorable code point: step over it (undone by the defer) and keep looking.
293 _ = iter.cp_iter.next();
294 }
// Only ignorable code points (or nothing at all) remained.
295 return null;
271 } 296 }
272}; 297};
273 298
@@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool {
292 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; 317 return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote;
293} 318}
294 319
/// Returns true when `wbp` is `MidNum`, `MidNumLet` or `Single_Quote`.
/// This is the `(MidNum | MidNumLetQ)` class named in the WB11 / WB12 rule
/// comments, where this predicate is used on the new side of the diff.
320inline fn isMidNum(wbp: WordBreakProperty) bool {
321 return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote;
322}
323
295inline fn isExtensible(wbp: WordBreakProperty) bool { 324inline fn isExtensible(wbp: WordBreakProperty) bool {
296 return switch (wbp) { 325 return switch (wbp) {
297 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, 326 .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true,
@@ -328,3 +357,5 @@ const testing = std.testing;
328const code_point = @import("code_point"); 357const code_point = @import("code_point");
329const CodepointIterator = code_point.Iterator; 358const CodepointIterator = code_point.Iterator;
330const CodePoint = code_point.CodePoint; 359const CodePoint = code_point.CodePoint;
360
361const ext_pict = @import("micro_runeset.zig").Extended_Pictographic;