diff options
| -rw-r--r-- | src/WordBreak.zig | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 9044740..84fd1f7 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | |||
| 71 | allocator.free(wordbreak.s2); | 71 | allocator.free(wordbreak.s2); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | /// Represents a Unicode word span, as an offset into the source string | ||
| 75 | /// and the length of the word. | ||
| 76 | pub const Word = struct { | ||
| 77 | offset: u32, | ||
| 78 | len: u32, | ||
| 79 | |||
| 80 | /// Returns a slice of the word given the source string. | ||
| 81 | pub fn bytes(self: Word, src: []const u8) []const u8 { | ||
| 82 | return src[self.offset..][0..self.len]; | ||
| 83 | } | ||
| 84 | }; | ||
| 85 | |||
| 74 | /// Returns the word break property type for `cp`. | 86 | /// Returns the word break property type for `cp`. |
| 75 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | 87 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { |
| 76 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 77 | } | 89 | } |
| 78 | 90 | ||
| 91 | const IterState = packed struct { | ||
| 92 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 93 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 94 | quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 95 | regional: bool, // [^RI] (RI RI)* RI × RI | ||
| 96 | |||
| 97 | pub const initial: IterState = .{ | ||
| 98 | .mid_punct = false, | ||
| 99 | .mid_num = false, | ||
| 100 | .quote_heb = false, | ||
| 101 | .regional = false, | ||
| 102 | }; | ||
| 103 | }; | ||
| 104 | |||
| 105 | pub const Iterator = struct { | ||
| 106 | this: ?CodePoint = null, | ||
| 107 | that: ?CodePoint = null, | ||
| 108 | cache: ?WordBreakProperty = null, | ||
| 109 | cp_iter: CodepointIterator, | ||
| 110 | wb: *const WordBreak, | ||
| 111 | |||
| 112 | /// Assumes `str` is valid UTF-8. | ||
| 113 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | ||
| 114 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; | ||
| 115 | wb_iter.advance(); | ||
| 116 | return wb; | ||
| 117 | } | ||
| 118 | |||
| 119 | pub fn next(iter: *Iterator) ?Word { | ||
| 120 | iter.advance(); | ||
| 121 | |||
| 122 | // Done? | ||
| 123 | if (iter.this == null) return null; | ||
| 124 | // Last? | ||
| 125 | if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; | ||
| 126 | |||
| 127 | const word_start = iter.this.?.offset; | ||
| 128 | var word_len: u32 = 0; | ||
| 129 | |||
| 130 | var state: IterState = .initial; | ||
| 131 | |||
| 132 | scan: while (true) : (iter.advance()) { | ||
| 133 | const this = iter.this.?; | ||
| 134 | word_len += this.len; | ||
| 135 | if (iter.that) |that| { | ||
| 136 | const that_p = iter.wb.breakProperty(that.code); | ||
| 137 | const this_p = this_p: { | ||
| 138 | if (!isIgnorable(that_p) and iter.cache != null) { | ||
| 139 | defer iter.cache = null; | ||
| 140 | break :this_p iter.cache.?; | ||
| 141 | } else { | ||
| 142 | break :this_p iter.wb.breakProperty(this.code); | ||
| 143 | } | ||
| 144 | }; | ||
| 145 | // WB3 CR × LF | ||
| 146 | if (this_p == .CR and that_p == .LF) continue :scan; | ||
| 147 | // WB3a (Newline | CR | LF) ÷ | ||
| 148 | if (isNewline(this_p)) break :scan; | ||
| 149 | // WB3b ÷ (Newline | CR | LF) | ||
| 150 | if (isNewline(that_p)) break :scan; | ||
| 151 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 152 | // The right way to do this one is a RuneSet, TODO: circle back | ||
| 153 | // WB3d WSegSpace × WSegSpace | ||
| 154 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | ||
| 155 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 156 | if (isIgnorable(that_p)) { | ||
| 157 | if (iter.cache == null) { | ||
| 158 | iter.cache = this_p; | ||
| 159 | } | ||
| 160 | continue :scan; | ||
| 161 | } | ||
| 162 | if (isAHLetter(this_p)) { | ||
| 163 | // WB5 AHLetter × AHLetter | ||
| 164 | if (isAHLetter(that_p)) continue :scan; | ||
| 165 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 166 | if (isMidVal(that_p)) { | ||
| 167 | const next_val = iter.cp_iter.peek(); | ||
| 168 | if (next_val) |next_cp| { | ||
| 169 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 170 | if (isAHLetter(next_p)) { | ||
| 171 | state.mid_punct = true; | ||
| 172 | continue :scan; | ||
| 173 | } | ||
| 174 | } else break :scan; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 178 | if (state.mid_punct) { | ||
| 179 | // Should always be true: | ||
| 180 | assert(isMidVal(this_p)); | ||
| 181 | assert(isAHLetter(that_p)); | ||
| 182 | state.mid_punct = false; | ||
| 183 | continue :scan; | ||
| 184 | } | ||
| 185 | if (this_p == .Hebrew_Letter) { | ||
| 186 | // WB7a Hebrew_Letter × Single_Quote | ||
| 187 | if (that_p == .Single_Quote) continue :scan; | ||
| 188 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 189 | if (that_p == .Double_Quote) { | ||
| 190 | const next_val = iter.cp_iter.peek(); | ||
| 191 | if (next_val) |next_cp| { | ||
| 192 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 193 | if (next_p == .Hebrew_Letter) { | ||
| 194 | state.quote_heb = true; | ||
| 195 | continue :scan; | ||
| 196 | } | ||
| 197 | } else break :scan; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 201 | if (state.quote_heb) { | ||
| 202 | // Should always be true: | ||
| 203 | assert(this_p == .Double_Quote); | ||
| 204 | assert(that_p == .Hebrew_Letter); | ||
| 205 | state.quote_heb = false; | ||
| 206 | continue :scan; | ||
| 207 | } | ||
| 208 | // WB8 Numeric × Numeric | ||
| 209 | if (this_p == .Numeric and that_p == .Numeric) continue :scan; | ||
| 210 | // WB9 AHLetter × Numeric | ||
| 211 | if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; | ||
| 212 | // WB10 Numeric × AHLetter | ||
| 213 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; | ||
| 214 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 215 | if (this_p == .Numeric and isMidVal(that_p)) { | ||
| 216 | const next_val = iter.cp_iter.peek(); | ||
| 217 | if (next_val) |next_cp| { | ||
| 218 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 219 | if (next_p == .Numeric) { | ||
| 220 | state.mid_num = true; | ||
| 221 | continue :scan; | ||
| 222 | } | ||
| 223 | } else break :scan; | ||
| 224 | } | ||
| 225 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 226 | if (state.mid_num) { | ||
| 227 | assert(isMidVal(this_p)); | ||
| 228 | assert(that_p == .Numeric); | ||
| 229 | state.mid_num = false; | ||
| 230 | continue :scan; | ||
| 231 | } | ||
| 232 | // WB13 Katakana × Katakana | ||
| 233 | if (this_p == .Katakana and that_p == .Katakana) continue :scan; | ||
| 234 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 235 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; | ||
| 236 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 237 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | ||
| 238 | // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI | ||
| 239 | if (that_p == .Regional_Indicator) { | ||
| 240 | if (this_p == .Regional_Indicator) { | ||
| 241 | if (state.regional) { | ||
| 242 | state.regional = false; | ||
| 243 | continue :scan; | ||
| 244 | } else { | ||
| 245 | break :scan; | ||
| 246 | } | ||
| 247 | } else { | ||
| 248 | const next_val = iter.cp_iter.peek(); | ||
| 249 | if (next_val) |next_cp| { | ||
| 250 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 251 | if (next_p == .Regional_Indicator) { | ||
| 252 | state.regional = true; | ||
| 253 | continue :scan; | ||
| 254 | } | ||
| 255 | } else break :scan; | ||
| 256 | } | ||
| 257 | } | ||
| 258 | // WB999 Any ÷ Any | ||
| 259 | break :scan; | ||
| 260 | } else { // iter.that == null | ||
| 261 | break :scan; | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | return Word{ .len = word_len, .offset = word_start }; | ||
| 266 | } | ||
| 267 | |||
| 268 | fn advance(wb_iter: *Iterator) void { | ||
| 269 | wb_iter.this = wb_iter.that; | ||
| 270 | wb_iter.that = wb_iter.cp_iter.next(); | ||
| 271 | } | ||
| 272 | }; | ||
| 273 | |||
| 274 | //| Predicates | ||
| 275 | |||
| 276 | inline fn isNewline(wbp: WordBreakProperty) bool { | ||
| 277 | return wbp == .CR or wbp == .LF or wbp == .Newline; | ||
| 278 | } | ||
| 279 | |||
| 280 | inline fn isIgnorable(wbp: WordBreakProperty) bool { | ||
| 281 | return switch (wbp) { | ||
| 282 | .Format, .Extend, .ZWJ => true, | ||
| 283 | else => false, | ||
| 284 | }; | ||
| 285 | } | ||
| 286 | |||
| 287 | inline fn isAHLetter(wbp: WordBreakProperty) bool { | ||
| 288 | return wbp == .ALetter or wbp == .Hebrew_Letter; | ||
| 289 | } | ||
| 290 | |||
| 291 | inline fn isMidVal(wbp: WordBreakProperty) bool { | ||
| 292 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 293 | } | ||
| 294 | |||
| 295 | inline fn isExtensible(wbp: WordBreakProperty) bool { | ||
| 296 | return switch (wbp) { | ||
| 297 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | ||
| 298 | else => false, | ||
| 299 | }; | ||
| 300 | } | ||
| 301 | |||
| 79 | test "Word Break Properties" { | 302 | test "Word Break Properties" { |
| 80 | const wb = try WordBreak.init(testing.allocator); | 303 | const wb = try WordBreak.init(testing.allocator); |
| 81 | defer wb.deinit(testing.allocator); | 304 | defer wb.deinit(testing.allocator); |
| @@ -99,4 +322,9 @@ const builtin = @import("builtin"); | |||
| 99 | const compress = std.compress; | 322 | const compress = std.compress; |
| 100 | const mem = std.mem; | 323 | const mem = std.mem; |
| 101 | const Allocator = mem.Allocator; | 324 | const Allocator = mem.Allocator; |
| 325 | const assert = std.debug.assert; | ||
| 102 | const testing = std.testing; | 326 | const testing = std.testing; |
| 327 | |||
| 328 | const code_point = @import("code_point"); | ||
| 329 | const CodepointIterator = code_point.Iterator; | ||
| 330 | const CodePoint = code_point.CodePoint; | ||