diff options
Diffstat (limited to 'src/WordBreak.zig')
| -rw-r--r-- | src/WordBreak.zig | 83 |
1 files changed, 57 insertions, 26 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 84fd1f7..53db76b 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -88,6 +88,11 @@ pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | |||
| 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | /// Returns an iterator over words in `slice` | ||
| 92 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | ||
| 93 | return Iterator.init(wordbreak, slice); | ||
| 94 | } | ||
| 95 | |||
| 91 | const IterState = packed struct { | 96 | const IterState = packed struct { |
| 92 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter | 97 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| 93 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric | 98 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric |
| @@ -113,7 +118,7 @@ pub const Iterator = struct { | |||
| 113 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 118 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { |
| 114 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; | 119 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; |
| 115 | wb_iter.advance(); | 120 | wb_iter.advance(); |
| 116 | return wb; | 121 | return wb_iter; |
| 117 | } | 122 | } |
| 118 | 123 | ||
| 119 | pub fn next(iter: *Iterator) ?Word { | 124 | pub fn next(iter: *Iterator) ?Word { |
| @@ -132,12 +137,18 @@ pub const Iterator = struct { | |||
| 132 | scan: while (true) : (iter.advance()) { | 137 | scan: while (true) : (iter.advance()) { |
| 133 | const this = iter.this.?; | 138 | const this = iter.this.?; |
| 134 | word_len += this.len; | 139 | word_len += this.len; |
| 140 | var ignored = false; | ||
| 135 | if (iter.that) |that| { | 141 | if (iter.that) |that| { |
| 136 | const that_p = iter.wb.breakProperty(that.code); | 142 | const that_p = iter.wb.breakProperty(that.code); |
| 137 | const this_p = this_p: { | 143 | const this_p = this_p: { |
| 138 | if (!isIgnorable(that_p) and iter.cache != null) { | 144 | if (!isIgnorable(that_p) and iter.cache != null) { |
| 145 | // TODO: might not need these what with peekPast | ||
| 146 | ignored = true; | ||
| 139 | defer iter.cache = null; | 147 | defer iter.cache = null; |
| 140 | break :this_p iter.cache.?; | 148 | // Fixup some state, apply pre-4 rules |
| 149 | const restore = iter.cache.?; | ||
| 150 | if (restore == .WSegSpace) break :this_p .none; | ||
| 151 | break :this_p restore; | ||
| 141 | } else { | 152 | } else { |
| 142 | break :this_p iter.wb.breakProperty(this.code); | 153 | break :this_p iter.wb.breakProperty(this.code); |
| 143 | } | 154 | } |
| @@ -149,11 +160,22 @@ pub const Iterator = struct { | |||
| 149 | // WB3b ÷ (Newline | CR | LF) | 160 | // WB3b ÷ (Newline | CR | LF) |
| 150 | if (isNewline(that_p)) break :scan; | 161 | if (isNewline(that_p)) break :scan; |
| 151 | // WB3c ZWJ × \p{Extended_Pictographic} | 162 | // WB3c ZWJ × \p{Extended_Pictographic} |
| 152 | // The right way to do this one is a RuneSet, TODO: circle back | 163 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { |
| 164 | // Invalid after ignoring | ||
| 165 | if (ignored) break :scan else continue :scan; | ||
| 166 | } | ||
| 153 | // WB3d WSegSpace × WSegSpace | 167 | // WB3d WSegSpace × WSegSpace |
| 154 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | 168 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; |
| 155 | // WB4 X (Extend | Format | ZWJ)* → X | 169 | // WB4 X (Extend | Format | ZWJ)* → X |
| 156 | if (isIgnorable(that_p)) { | 170 | if (isIgnorable(that_p)) { |
| 171 | if (that_p == .ZWJ) { | ||
| 172 | const next_val = iter.peekPast(); | ||
| 173 | if (next_val) |next_cp| { | ||
| 174 | if (ext_pict.isMatch(next_cp.bytes(iter.cp_iter.bytes))) { | ||
| 175 | continue :scan; | ||
| 176 | } | ||
| 177 | } | ||
| 178 | } | ||
| 157 | if (iter.cache == null) { | 179 | if (iter.cache == null) { |
| 158 | iter.cache = this_p; | 180 | iter.cache = this_p; |
| 159 | } | 181 | } |
| @@ -164,14 +186,14 @@ pub const Iterator = struct { | |||
| 164 | if (isAHLetter(that_p)) continue :scan; | 186 | if (isAHLetter(that_p)) continue :scan; |
| 165 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | 187 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter |
| 166 | if (isMidVal(that_p)) { | 188 | if (isMidVal(that_p)) { |
| 167 | const next_val = iter.cp_iter.peek(); | 189 | const next_val = iter.peekPast(); |
| 168 | if (next_val) |next_cp| { | 190 | if (next_val) |next_cp| { |
| 169 | const next_p = iter.wb.breakProperty(next_cp.code); | 191 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 170 | if (isAHLetter(next_p)) { | 192 | if (isAHLetter(next_p)) { |
| 171 | state.mid_punct = true; | 193 | state.mid_punct = true; |
| 172 | continue :scan; | 194 | continue :scan; |
| 173 | } | 195 | } |
| 174 | } else break :scan; | 196 | } |
| 175 | } | 197 | } |
| 176 | } | 198 | } |
| 177 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter | 199 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter |
| @@ -187,7 +209,7 @@ pub const Iterator = struct { | |||
| 187 | if (that_p == .Single_Quote) continue :scan; | 209 | if (that_p == .Single_Quote) continue :scan; |
| 188 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | 210 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter |
| 189 | if (that_p == .Double_Quote) { | 211 | if (that_p == .Double_Quote) { |
| 190 | const next_val = iter.cp_iter.peek(); | 212 | const next_val = iter.peekPast(); |
| 191 | if (next_val) |next_cp| { | 213 | if (next_val) |next_cp| { |
| 192 | const next_p = iter.wb.breakProperty(next_cp.code); | 214 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 193 | if (next_p == .Hebrew_Letter) { | 215 | if (next_p == .Hebrew_Letter) { |
| @@ -212,8 +234,8 @@ pub const Iterator = struct { | |||
| 212 | // WB10 Numeric × AHLetter | 234 | // WB10 Numeric × AHLetter |
| 213 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; | 235 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; |
| 214 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | 236 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric |
| 215 | if (this_p == .Numeric and isMidVal(that_p)) { | 237 | if (this_p == .Numeric and isMidNum(that_p)) { |
| 216 | const next_val = iter.cp_iter.peek(); | 238 | const next_val = iter.peekPast(); |
| 217 | if (next_val) |next_cp| { | 239 | if (next_val) |next_cp| { |
| 218 | const next_p = iter.wb.breakProperty(next_cp.code); | 240 | const next_p = iter.wb.breakProperty(next_cp.code); |
| 219 | if (next_p == .Numeric) { | 241 | if (next_p == .Numeric) { |
| @@ -224,7 +246,7 @@ pub const Iterator = struct { | |||
| 224 | } | 246 | } |
| 225 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | 247 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric |
| 226 | if (state.mid_num) { | 248 | if (state.mid_num) { |
| 227 | assert(isMidVal(this_p)); | 249 | assert(isMidNum(this_p)); |
| 228 | assert(that_p == .Numeric); | 250 | assert(that_p == .Numeric); |
| 229 | state.mid_num = false; | 251 | state.mid_num = false; |
| 230 | continue :scan; | 252 | continue :scan; |
| @@ -235,25 +257,18 @@ pub const Iterator = struct { | |||
| 235 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; | 257 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; |
| 236 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | 258 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) |
| 237 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | 259 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; |
| 238 | // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI | 260 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI |
| 239 | if (that_p == .Regional_Indicator) { | 261 | if (this_p == .Regional_Indicator) { |
| 240 | if (this_p == .Regional_Indicator) { | 262 | if (that_p == .Regional_Indicator) { |
| 241 | if (state.regional) { | 263 | if (state.regional == true or this.offset == 0) { |
| 242 | state.regional = false; | 264 | state.regional = false; |
| 243 | continue :scan; | 265 | continue :scan; |
| 244 | } else { | ||
| 245 | break :scan; | ||
| 246 | } | 266 | } |
| 247 | } else { | 267 | } else { |
| 248 | const next_val = iter.cp_iter.peek(); | 268 | state.regional = true; |
| 249 | if (next_val) |next_cp| { | ||
| 250 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 251 | if (next_p == .Regional_Indicator) { | ||
| 252 | state.regional = true; | ||
| 253 | continue :scan; | ||
| 254 | } | ||
| 255 | } else break :scan; | ||
| 256 | } | 269 | } |
| 270 | } else if (that_p == .Regional_Indicator) { | ||
| 271 | state.regional = true; | ||
| 257 | } | 272 | } |
| 258 | // WB999 Any ÷ Any | 273 | // WB999 Any ÷ Any |
| 259 | break :scan; | 274 | break :scan; |
| @@ -265,9 +280,19 @@ pub const Iterator = struct { | |||
| 265 | return Word{ .len = word_len, .offset = word_start }; | 280 | return Word{ .len = word_len, .offset = word_start }; |
| 266 | } | 281 | } |
| 267 | 282 | ||
| 268 | fn advance(wb_iter: *Iterator) void { | 283 | fn advance(iter: *Iterator) void { |
| 269 | wb_iter.this = wb_iter.that; | 284 | iter.this = iter.that; |
| 270 | wb_iter.that = wb_iter.cp_iter.next(); | 285 | iter.that = iter.cp_iter.next(); |
| 286 | } | ||
| 287 | |||
| 288 | fn peekPast(iter: *Iterator) ?CodePoint { | ||
| 289 | const save_cp = iter.cp_iter; | ||
| 290 | defer iter.cp_iter = save_cp; | ||
| 291 | while (iter.cp_iter.peek()) |peeked| { | ||
| 292 | if (!isIgnorable(iter.wb.breakProperty(peeked.code))) return peeked; | ||
| 293 | _ = iter.cp_iter.next(); | ||
| 294 | } | ||
| 295 | return null; | ||
| 271 | } | 296 | } |
| 272 | }; | 297 | }; |
| 273 | 298 | ||
| @@ -292,6 +317,10 @@ inline fn isMidVal(wbp: WordBreakProperty) bool { | |||
| 292 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | 317 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; |
| 293 | } | 318 | } |
| 294 | 319 | ||
| 320 | inline fn isMidNum(wbp: WordBreakProperty) bool { | ||
| 321 | return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 322 | } | ||
| 323 | |||
| 295 | inline fn isExtensible(wbp: WordBreakProperty) bool { | 324 | inline fn isExtensible(wbp: WordBreakProperty) bool { |
| 296 | return switch (wbp) { | 325 | return switch (wbp) { |
| 297 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | 326 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, |
| @@ -328,3 +357,5 @@ const testing = std.testing; | |||
| 328 | const code_point = @import("code_point"); | 357 | const code_point = @import("code_point"); |
| 329 | const CodepointIterator = code_point.Iterator; | 358 | const CodepointIterator = code_point.Iterator; |
| 330 | const CodePoint = code_point.CodePoint; | 359 | const CodePoint = code_point.CodePoint; |
| 360 | |||
| 361 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; | ||