summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/WordBreak.zig 111
-rw-r--r-- src/micro_runeset.zig 4
-rw-r--r-- src/unicode_tests.zig 3
3 files changed, 40 insertions, 78 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
index 53db76b..a2be011 100644
--- a/src/WordBreak.zig
+++ b/src/WordBreak.zig
@@ -132,28 +132,21 @@ pub const Iterator = struct {
132 const word_start = iter.this.?.offset; 132 const word_start = iter.this.?.offset;
133 var word_len: u32 = 0; 133 var word_len: u32 = 0;
134 134
135 var state: IterState = .initial; 135 // state variables
136 var last_p: WordBreakProperty = .none;
137 var last_last_p: WordBreakProperty = .none;
138 var ri_count: usize = 0;
136 139
137 scan: while (true) : (iter.advance()) { 140 scan: while (true) : (iter.advance()) {
138 const this = iter.this.?; 141 const this = iter.this.?;
139 word_len += this.len; 142 word_len += this.len;
140 var ignored = false;
141 if (iter.that) |that| { 143 if (iter.that) |that| {
144 const this_p = iter.wb.breakProperty(this.code); // WB3 CR × LF
142 const that_p = iter.wb.breakProperty(that.code); 145 const that_p = iter.wb.breakProperty(that.code);
143 const this_p = this_p: { 146 if (!isIgnorable(this_p)) {
144 if (!isIgnorable(that_p) and iter.cache != null) { 147 last_last_p = last_p;
145 // TODO: might not need these what with peekPast 148 last_p = this_p;
146 ignored = true; 149 }
147 defer iter.cache = null;
148 // Fixup some state, apply pre-4 rules
149 const restore = iter.cache.?;
150 if (restore == .WSegSpace) break :this_p .none;
151 break :this_p restore;
152 } else {
153 break :this_p iter.wb.breakProperty(this.code);
154 }
155 };
156 // WB3 CR × LF
157 if (this_p == .CR and that_p == .LF) continue :scan; 150 if (this_p == .CR and that_p == .LF) continue :scan;
158 // WB3a (Newline | CR | LF) ÷ 151 // WB3a (Newline | CR | LF) ÷
159 if (isNewline(this_p)) break :scan; 152 if (isNewline(this_p)) break :scan;
@@ -161,27 +154,15 @@ pub const Iterator = struct {
161 if (isNewline(that_p)) break :scan; 154 if (isNewline(that_p)) break :scan;
162 // WB3c ZWJ × \p{Extended_Pictographic} 155 // WB3c ZWJ × \p{Extended_Pictographic}
163 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { 156 if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) {
164 // Invalid after ignoring 157 continue :scan;
165 if (ignored) break :scan else continue :scan;
166 } 158 }
167 // WB3d WSegSpace × WSegSpace 159 // WB3d WSegSpace × WSegSpace
168 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; 160 if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan;
169 // WB4 X (Extend | Format | ZWJ)* → X 161 // WB4 X (Extend | Format | ZWJ)* → X
170 if (isIgnorable(that_p)) { 162 if (isIgnorable(that_p)) {
171 if (that_p == .ZWJ) {
172 const next_val = iter.peekPast();
173 if (next_val) |next_cp| {
174 if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) {
175 continue :scan;
176 }
177 }
178 }
179 if (iter.cache == null) {
180 iter.cache = this_p;
181 }
182 continue :scan; 163 continue :scan;
183 } 164 } // Now we use last_p instead of this_p for ignorable's sake
184 if (isAHLetter(this_p)) { 165 if (isAHLetter(last_p)) {
185 // WB5 AHLetter × AHLetter 166 // WB5 AHLetter × AHLetter
186 if (isAHLetter(that_p)) continue :scan; 167 if (isAHLetter(that_p)) continue :scan;
187 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter 168 // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter
@@ -190,21 +171,16 @@ pub const Iterator = struct {
190 if (next_val) |next_cp| { 171 if (next_val) |next_cp| {
191 const next_p = iter.wb.breakProperty(next_cp.code); 172 const next_p = iter.wb.breakProperty(next_cp.code);
192 if (isAHLetter(next_p)) { 173 if (isAHLetter(next_p)) {
193 state.mid_punct = true;
194 continue :scan; 174 continue :scan;
195 } 175 }
196 } 176 }
197 } 177 }
198 } 178 }
199 // AHLetter (MidLetter | MidNumLetQ) × AHLetter 179 // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter
200 if (state.mid_punct) { 180 if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) {
201 // Should always be true:
202 assert(isMidVal(this_p));
203 assert(isAHLetter(that_p));
204 state.mid_punct = false;
205 continue :scan; 181 continue :scan;
206 } 182 }
207 if (this_p == .Hebrew_Letter) { 183 if (last_p == .Hebrew_Letter) {
208 // WB7a Hebrew_Letter × Single_Quote 184 // WB7a Hebrew_Letter × Single_Quote
209 if (that_p == .Single_Quote) continue :scan; 185 if (that_p == .Single_Quote) continue :scan;
210 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter 186 // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
@@ -213,62 +189,44 @@ pub const Iterator = struct {
213 if (next_val) |next_cp| { 189 if (next_val) |next_cp| {
214 const next_p = iter.wb.breakProperty(next_cp.code); 190 const next_p = iter.wb.breakProperty(next_cp.code);
215 if (next_p == .Hebrew_Letter) { 191 if (next_p == .Hebrew_Letter) {
216 state.quote_heb = true;
217 continue :scan; 192 continue :scan;
218 } 193 }
219 } else break :scan; 194 }
220 } 195 }
221 } 196 }
222 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter 197 // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
223 if (state.quote_heb) { 198 if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter)
224 // Should always be true:
225 assert(this_p == .Double_Quote);
226 assert(that_p == .Hebrew_Letter);
227 state.quote_heb = false;
228 continue :scan; 199 continue :scan;
229 }
230 // WB8 Numeric × Numeric 200 // WB8 Numeric × Numeric
231 if (this_p == .Numeric and that_p == .Numeric) continue :scan; 201 if (last_p == .Numeric and that_p == .Numeric) continue :scan;
232 // WB9 AHLetter × Numeric 202 // WB9 AHLetter × Numeric
233 if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; 203 if (isAHLetter(last_p) and that_p == .Numeric) continue :scan;
234 // WB10 Numeric × AHLetter 204 // WB10 Numeric × AHLetter
235 if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; 205 if (last_p == .Numeric and isAHLetter(that_p)) continue :scan;
206 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
207 if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric)
208 continue :scan;
236 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric 209 // WB12 Numeric × (MidNum | MidNumLetQ) Numeric
237 if (this_p == .Numeric and isMidNum(that_p)) { 210 if (last_p == .Numeric and isMidNum(that_p)) {
238 const next_val = iter.peekPast(); 211 const next_val = iter.peekPast();
239 if (next_val) |next_cp| { 212 if (next_val) |next_cp| {
240 const next_p = iter.wb.breakProperty(next_cp.code); 213 const next_p = iter.wb.breakProperty(next_cp.code);
241 if (next_p == .Numeric) { 214 if (next_p == .Numeric) {
242 state.mid_num = true;
243 continue :scan; 215 continue :scan;
244 } 216 }
245 } else break :scan; 217 }
246 }
247 // WB11 Numeric (MidNum | MidNumLetQ) × Numeric
248 if (state.mid_num) {
249 assert(isMidNum(this_p));
250 assert(that_p == .Numeric);
251 state.mid_num = false;
252 continue :scan;
253 } 218 }
254 // WB13 Katakana × Katakana 219 // WB13 Katakana × Katakana
255 if (this_p == .Katakana and that_p == .Katakana) continue :scan; 220 if (last_p == .Katakana and that_p == .Katakana) continue :scan;
256 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet 221 // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
257 if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; 222 if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan;
258 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) 223 // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana)
259 if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; 224 if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan;
260 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI 225 // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI
261 if (this_p == .Regional_Indicator) { 226 const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator;
262 if (that_p == .Regional_Indicator) { 227 if (maybe_flag) {
263 if (state.regional == true or this.offset == 0) { 228 ri_count += 1;
264 state.regional = false; 229 if (ri_count % 2 == 1) continue :scan;
265 continue :scan;
266 }
267 } else {
268 state.regional = true;
269 }
270 } else if (that_p == .Regional_Indicator) {
271 state.regional = true;
272 } 230 }
273 // WB999 Any ÷ Any 231 // WB999 Any ÷ Any
274 break :scan; 232 break :scan;
@@ -337,6 +295,11 @@ test "Word Break Properties" {
337 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); 295 try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}'));
338} 296}
339 297
298test "ext_pic" {
299 try testing.expect(ext_pict.isMatch("👇"));
300 try testing.expect(ext_pict.isMatch("\u{2704}"));
301}
302
340fn testAllocations(allocator: Allocator) !void { 303fn testAllocations(allocator: Allocator) !void {
341 const wb = try WordBreak.init(allocator); 304 const wb = try WordBreak.init(allocator);
342 wb.deinit(allocator); 305 wb.deinit(allocator);
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig
index 34fbcd3..80ce4bf 100644
--- a/src/micro_runeset.zig
+++ b/src/micro_runeset.zig
@@ -9,7 +9,7 @@
9//! The RuneSet is borrowed from Runicode, which encodes Unicode things 9//! The RuneSet is borrowed from Runicode, which encodes Unicode things
10//! in RuneSet form. This will need updating for each version of Unicode. 10//! in RuneSet form. This will need updating for each version of Unicode.
11 11
12pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x8000060, 0x10000000000000, 0x8001000200600000, 0x7800985090, 0x801022055ef2d, 0xedf57effffffdf57, 0xaffd75bd6f7d001f, 0xdbffffbbbff7ff7f, 0x7d7fddd76f56dfb5, 0x3800000000000001, 0x40040000000000, 0x4, 0x30bae0000008000, 0x100, 0x10004000000, 0x20001f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xfffffffffffffff7, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffbfff, 0x800000006000, 0x4001700000000000, 0xffffe00003fe4000, 0x1fffffffff, 0x73fc800004007ffa, 0xfffffffffffd7e00, 0xffffffffffffffff, 0x7fffffffffffffff, 0xffd56ff6bedfafff, 0x77ffffffffff7bff, 0xffffffff5757ffff, 0x3fafff77ff7bfef, 0xbffffdfffffab77f, 0xffffd7efffffffff, 0xff5fefffffffffff, 0xef6fd7ffffffffff, 0x1fffd7ffffefff7b, 0xfdfabf7ff7ffbac0, 0xf7faff77ffaf5dbf, 0x7dfbbf7eb7f6ffed, 0xfff7775fbfefdebf, 0x7fee, 0xbedddfddfbf7f7db, 0x6ebb6edf776b7bdf, 0x7ff0000000000000, 0x7fff77ff7fe00000, 0x7000, 0x7c007f00, 0xffffc00000007f00, 0x7fffffffffffffff, 0xb3fb7f7fbeff7000, 0x7ebef7ffbfff779f, 0x7dff5bebff7dffef, 0x7fffffbfffff7bfb, 0xffffffffffffffff, 0x6b777fffffffffff, 0xdbbf6effffdfbebb, 0x7ebf7f7fb5bf5fdb, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x1fffffffffffffff } }; 12pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 
0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } };
13 13
14// Meaningful names for the T1 slots 14// Meaningful names for the T1 slots
15const LOW = 0; 15const LOW = 0;
@@ -27,7 +27,7 @@ pub const RuneSet = struct {
27 const set = runeset.body; 27 const set = runeset.body;
28 const a = codeunit(str[0]); 28 const a = codeunit(str[0]);
29 switch (a.kind) { 29 switch (a.kind) {
30 .follow => return false, 30 .follow => unreachable,
31 .low => { 31 .low => {
32 const mask = toMask(set[LOW]); 32 const mask = toMask(set[LOW]);
33 if (mask.isIn(a)) 33 if (mask.isIn(a))
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 7ce2b4e..59f0c6f 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -228,8 +228,7 @@ test "Segmentation Word Iterator" {
228 // Check. 228 // Check.
229 for (want.items, 1..) |want_word, i| { 229 for (want.items, 1..) |want_word, i| {
230 const got_word = (iter.next()).?; 230 const got_word = (iter.next()).?;
231 std.testing.expectEqualSlices( 231 std.testing.expectEqualStrings(
232 u8,
233 want_word.bytes(all_bytes.items), 232 want_word.bytes(all_bytes.items),
234 got_word.bytes(all_bytes.items), 233 got_word.bytes(all_bytes.items),
235 ) catch |err| { 234 ) catch |err| {