summary | refs | log | tree | commit | diff
path: root/src/WordBreak.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/WordBreak.zig')
-rw-r--r-- src/WordBreak.zig 111
1 file changed, 37 insertions, 74 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 53db76b..a2be011 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -132,28 +132,21 @@ pub const Iterator = struct {
132 const word_start = iter.this.?.offset; 132 const word_start = iter.this.?.offset;
133 var word_len: u32 = 0; 133 var word_len: u32 = 0;
134 134
135 var state: IterState = .initial; 135 // state variables
136 var last_p: WordBreakProperty = .none;
137 var last_last_p: WordBreakProperty = .none;
138 var ri_count: usize = 0;
136 139
137 scan: while (true) : (iter.advance()) { 140 scan: while (true) : (iter.advance()) {
138 const this = iter.this.?; 141 const this = iter.this.?;
139 word_len += this.len; 142 word_len += this.len;
140 var ignored = false;
141 if (iter.that) |that| { 143 if (iter.that) |that| {
144 const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF
142 const that_p = iter.wb.breakProperty(that.code); 145 const that_p = iter.wb.breakProperty(that.code);
143 const this_p = this_p: { 146 if (!isIgnorable(this_p)) {
144 if (!isIgnorable(that_p) and iter.cache != null) { 147 last_last_p = last_p;
145 // TODO: might not need these what with peekPast 148 last_p = this_p;
146 ignored = true; 149 }
147 defer iter.cache = null;
148 // Fixup some state, apply pre-4 rules
149 const restore = iter.cache.?;
150 if (restore == .WSegSpace) break :this_p .none;
151 break :this_p restore;
152 } else {
153 break :this_p iter.wb.breakProperty(this.code);
154 }
155 };
156 // WB3 CR × LF
157 if (this_p == .CR and that_p == .LF) continue :scan; 150 if (this_p == .CR and that_p == .LF) continue :scan;
158 // WB3a (Newline | CR | LF) ÷ 151 // WB3a (Newline | CR | LF) ÷
159 if (isNewline(this_p)) break :scan; 152 if (isNewline(this_p)) break :scan;
@@ -161,27 +154,15 @@ pub const Iterator = struct {
161 if (isNewline(that_p)) break :scan; 154 if (isNewline(that_p)) break :scan;
162 // WB3c ZWJ × \p{Extended_Pictographic} 155 // WB3c ZWJ × \p{Extended_Pictographic}
163 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { 156 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
164 // Invalid after ignoring 157 continue :scan;
165 if (ignored) break :scan else continue :scan;
166 } 158 }
167 // WB3d WSegSpace × WSegSpace 159 // WB3d WSegSpace × WSegSpace
168 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; 160 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
169 // WB4 X (Extend | Format | ZWJ)* → X 161 // WB4 X (Extend | Format | ZWJ)* → X
170 if (isIgnorable(that_p)) { 162 if (isIgnorable(that_p)) {
171 if (that_p == .ZWJ) {
172 const next_val = iter.peekPast();
173 if (next_val) |next_cp| {
174 if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
175 continue :scan;
176 }
177 }
178 }
179 if (iter.cache == null) {
180 iter.cache = this_p;
181 }
182 continue :scan; 163 continue :scan;
183 } 164 } // Now we use last_p instead of this_p for ignorable's sake
184 if (isAHLetter(this_p)) { 165 if (isAHLetter(last_p)) {
185 // WB5 AHLetter × AHLetter 166 // WB5 AHLetter × AHLetter
186 if (isAHLetter(that_p)) continue :scan; 167 if (isAHLetter(that_p)) continue :scan;
187 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter 168 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
@@ -190,21 +171,16 @@ pub const Iterator = struct {
190 if (next_val) |next_cp| { 171 if (next_val) |next_cp| {
191 const next_p = iter.wb.breakProperty(next_cp.code); 172 const next_p = iter.wb.breakProperty(next_cp.code);
192 if (isAHLetter(next_p)) { 173 if (isAHLetter(next_p)) {
193 state.mid_punct = true;
194 continue :scan; 174 continue :scan;
195 } 175 }
196 } 176 }
197 } 177 }
198 } 178 }
199 // AHLetter (MidLetter | MidNumLetQ) × AHLetter 179 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
200 if (state.mid_punct) { 180 if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
201 // Should always be true:
202 assert(isMidVal(this_p));
203 assert(isAHLetter(that_p));
204 state.mid_punct = false;
205 continue :scan; 181 continue :scan;
206 } 182 }
207 if (this_p == .Hebrew_Letter) { 183 if (last_p == .Hebrew_Letter) {
208 // WB7a Hebrew_Letter × Single_Quote 184 // WB7a Hebrew_Letter × Single_Quote
209 if (that_p == .Single_Quote) continue :scan; 185 if (that_p == .Single_Quote) continue :scan;
210 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter 186 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
@@ -213,62 +189,44 @@ pub const Iterator = struct {
213 if (next_val) |next_cp| { 189 if (next_val) |next_cp| {
214 const next_p = iter.wb.breakProperty(next_cp.code); 190 const next_p = iter.wb.breakProperty(next_cp.code);
215 if (next_p == .Hebrew_Letter) { 191 if (next_p == .Hebrew_Letter) {
216 state.quote_heb = true;
217 continue :scan; 192 continue :scan;
218 } 193 }
219 } else break :scan; 194 }
220 } 195 }
221 } 196 }
222 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter 197 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
223 if (state.quote_heb) { 198 if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
224 // Should always be true:
225 assert(this_p == .Double_Quote);
226 assert(that_p == .Hebrew_Letter);
227 state.quote_heb = false;
228 continue :scan; 199 continue :scan;
229 }
230 // WB8 Numeric × Numeric 200 // WB8 Numeric × Numeric
231 if (this_p == .Numeric and that_p == .Numeric) continue :scan; 201 if (last_p == .Numeric and that_p == .Numeric) continue :scan;
232 // WB9 AHLetter × Numeric 202 // WB9 AHLetter × Numeric
233 if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; 203 if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
234 // WB10 Numeric × AHLetter 204 // WB10 Numeric × AHLetter
235 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; 205 if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
206 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
207 if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
208 continue :scan;
236 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric 209 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
237 if (this_p == .Numeric and isMidNum(that_p)) { 210 if (last_p == .Numeric and isMidNum(that_p)) {
238 const next_val = iter.peekPast(); 211 const next_val = iter.peekPast();
239 if (next_val) |next_cp| { 212 if (next_val) |next_cp| {
240 const next_p = iter.wb.breakProperty(next_cp.code); 213 const next_p = iter.wb.breakProperty(next_cp.code);
241 if (next_p == .Numeric) { 214 if (next_p == .Numeric) {
242 state.mid_num = true;
243 continue :scan; 215 continue :scan;
244 } 216 }
245 } else break :scan; 217 }
246 }
247 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
248 if (state.mid_num) {
249 assert(isMidNum(this_p));
250 assert(that_p == .Numeric);
251 state.mid_num = false;
252 continue :scan;
253 } 218 }
254 // WB13 Katakana × Katakana 219 // WB13 Katakana × Katakana
255 if (this_p == .Katakana and that_p == .Katakana) continue :scan; 220 if (last_p == .Katakana and that_p == .Katakana) continue :scan;
256 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet 221 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
257 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; 222 if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
258 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) 223 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
259 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; 224 if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
260 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI 225 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
261 if (this_p == .Regional_Indicator) { 226 const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
262 if (that_p == .Regional_Indicator) { 227 if (maybe_flag) {
263 if (state.regional == true or this.offset == 0) { 228 ri_count += 1;
264 state.regional = false; 229 if (ri_count % 2 == 1) continue :scan;
265 continue :scan;
266 }
267 } else {
268 state.regional = true;
269 }
270 } else if (that_p == .Regional_Indicator) {
271 state.regional = true;
272 } 230 }
273 // WB999 Any ÷ Any 231 // WB999 Any ÷ Any
274 break :scan; 232 break :scan;
@@ -337,6 +295,11 @@ test "Word Break Properties" {
337 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); 295 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
338} 296}
339 297
298test "ext_pic" {
299 try testing.expect(ext_pict.isMatch("👇"));
300 try testing.expect(ext_pict.isMatch("\u{2704}"));
301}
302
340fn testAllocations(allocator: Allocator) !void { 303fn testAllocations(allocator: Allocator) !void {
341 const wb = try WordBreak.init(allocator); 304 const wb = try WordBreak.init(allocator);
342 wb.deinit(allocator); 305 wb.deinit(allocator);