diff options
| author | 2025-05-12 12:57:49 -0400 | |
|---|---|---|
| committer | 2025-05-15 15:31:15 -0400 | |
| commit | a832b90bc00503a201bf457df1f49dc338314e7b (patch) | |
| tree | 041c975b63e17db81edfc6923cfafa63691e3b33 | |
| parent | Vastly simplify peek() (diff) | |
| download | zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.gz zg-a832b90bc00503a201bf457df1f49dc338314e7b.tar.xz zg-a832b90bc00503a201bf457df1f49dc338314e7b.zip | |
Implement Word iterator
A by-the-book implmentation of the word break rules from tr29. This is
superficially inefficient, but compilers are more than able to handle
the common subexpression folding ignored by this approach.
Now to port the WordBreakPropertyTests, and clean up the inevitable bugs
in the implementation.
| -rw-r--r-- | src/WordBreak.zig | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig index 9044740..84fd1f7 100644 --- a/src/WordBreak.zig +++ b/src/WordBreak.zig | |||
| @@ -71,11 +71,234 @@ pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | |||
| 71 | allocator.free(wordbreak.s2); | 71 | allocator.free(wordbreak.s2); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | /// Represents a Unicode word span, as an offset into the source string | ||
| 75 | /// and the length of the word. | ||
| 76 | pub const Word = struct { | ||
| 77 | offset: u32, | ||
| 78 | len: u32, | ||
| 79 | |||
| 80 | /// Returns a slice of the word given the source string. | ||
| 81 | pub fn bytes(self: Word, src: []const u8) []const u8 { | ||
| 82 | return src[self.offset..][0..self.len]; | ||
| 83 | } | ||
| 84 | }; | ||
| 85 | |||
| 74 | /// Returns the word break property type for `cp`. | 86 | /// Returns the word break property type for `cp`. |
| 75 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | 87 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { |
| 76 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 88 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 77 | } | 89 | } |
| 78 | 90 | ||
| 91 | const IterState = packed struct { | ||
| 92 | mid_punct: bool, // AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 93 | mid_num: bool, // Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 94 | quote_heb: bool, // Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 95 | regional: bool, // [^RI] (RI RI)* RI × RI | ||
| 96 | |||
| 97 | pub const initial: IterState = .{ | ||
| 98 | .mid_punct = false, | ||
| 99 | .mid_num = false, | ||
| 100 | .quote_heb = false, | ||
| 101 | .regional = false, | ||
| 102 | }; | ||
| 103 | }; | ||
| 104 | |||
| 105 | pub const Iterator = struct { | ||
| 106 | this: ?CodePoint = null, | ||
| 107 | that: ?CodePoint = null, | ||
| 108 | cache: ?WordBreakProperty = null, | ||
| 109 | cp_iter: CodepointIterator, | ||
| 110 | wb: *const WordBreak, | ||
| 111 | |||
| 112 | /// Assumes `str` is valid UTF-8. | ||
| 113 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | ||
| 114 | var wb_iter: Iterator = .{ .cp_iter = .{ .bytes = str }, .wb = wb }; | ||
| 115 | wb_iter.advance(); | ||
| 116 | return wb; | ||
| 117 | } | ||
| 118 | |||
| 119 | pub fn next(iter: *Iterator) ?Word { | ||
| 120 | iter.advance(); | ||
| 121 | |||
| 122 | // Done? | ||
| 123 | if (iter.this == null) return null; | ||
| 124 | // Last? | ||
| 125 | if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; | ||
| 126 | |||
| 127 | const word_start = iter.this.?.offset; | ||
| 128 | var word_len: u32 = 0; | ||
| 129 | |||
| 130 | var state: IterState = .initial; | ||
| 131 | |||
| 132 | scan: while (true) : (iter.advance()) { | ||
| 133 | const this = iter.this.?; | ||
| 134 | word_len += this.len; | ||
| 135 | if (iter.that) |that| { | ||
| 136 | const that_p = iter.wb.breakProperty(that.code); | ||
| 137 | const this_p = this_p: { | ||
| 138 | if (!isIgnorable(that_p) and iter.cache != null) { | ||
| 139 | defer iter.cache = null; | ||
| 140 | break :this_p iter.cache.?; | ||
| 141 | } else { | ||
| 142 | break :this_p iter.wb.breakProperty(this.code); | ||
| 143 | } | ||
| 144 | }; | ||
| 145 | // WB3 CR × LF | ||
| 146 | if (this_p == .CR and that_p == .LF) continue :scan; | ||
| 147 | // WB3a (Newline | CR | LF) ÷ | ||
| 148 | if (isNewline(this_p)) break :scan; | ||
| 149 | // WB3b ÷ (Newline | CR | LF) | ||
| 150 | if (isNewline(that_p)) break :scan; | ||
| 151 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 152 | // The right way to do this one is a RuneSet, TODO: circle back | ||
| 153 | // WB3d WSegSpace × WSegSpace | ||
| 154 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | ||
| 155 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 156 | if (isIgnorable(that_p)) { | ||
| 157 | if (iter.cache == null) { | ||
| 158 | iter.cache = this_p; | ||
| 159 | } | ||
| 160 | continue :scan; | ||
| 161 | } | ||
| 162 | if (isAHLetter(this_p)) { | ||
| 163 | // WB5 AHLetter × AHLetter | ||
| 164 | if (isAHLetter(that_p)) continue :scan; | ||
| 165 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 166 | if (isMidVal(that_p)) { | ||
| 167 | const next_val = iter.cp_iter.peek(); | ||
| 168 | if (next_val) |next_cp| { | ||
| 169 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 170 | if (isAHLetter(next_p)) { | ||
| 171 | state.mid_punct = true; | ||
| 172 | continue :scan; | ||
| 173 | } | ||
| 174 | } else break :scan; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | // AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 178 | if (state.mid_punct) { | ||
| 179 | // Should always be true: | ||
| 180 | assert(isMidVal(this_p)); | ||
| 181 | assert(isAHLetter(that_p)); | ||
| 182 | state.mid_punct = false; | ||
| 183 | continue :scan; | ||
| 184 | } | ||
| 185 | if (this_p == .Hebrew_Letter) { | ||
| 186 | // WB7a Hebrew_Letter × Single_Quote | ||
| 187 | if (that_p == .Single_Quote) continue :scan; | ||
| 188 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 189 | if (that_p == .Double_Quote) { | ||
| 190 | const next_val = iter.cp_iter.peek(); | ||
| 191 | if (next_val) |next_cp| { | ||
| 192 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 193 | if (next_p == .Hebrew_Letter) { | ||
| 194 | state.quote_heb = true; | ||
| 195 | continue :scan; | ||
| 196 | } | ||
| 197 | } else break :scan; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 201 | if (state.quote_heb) { | ||
| 202 | // Should always be true: | ||
| 203 | assert(this_p == .Double_Quote); | ||
| 204 | assert(that_p == .Hebrew_Letter); | ||
| 205 | state.quote_heb = false; | ||
| 206 | continue :scan; | ||
| 207 | } | ||
| 208 | // WB8 Numeric × Numeric | ||
| 209 | if (this_p == .Numeric and that_p == .Numeric) continue :scan; | ||
| 210 | // WB9 AHLetter × Numeric | ||
| 211 | if (isAHLetter(this_p) and that_p == .Numeric) continue :scan; | ||
| 212 | // WB10 Numeric × AHLetter | ||
| 213 | if (this_p == .Numeric and isAHLetter(that_p)) continue :scan; | ||
| 214 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 215 | if (this_p == .Numeric and isMidVal(that_p)) { | ||
| 216 | const next_val = iter.cp_iter.peek(); | ||
| 217 | if (next_val) |next_cp| { | ||
| 218 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 219 | if (next_p == .Numeric) { | ||
| 220 | state.mid_num = true; | ||
| 221 | continue :scan; | ||
| 222 | } | ||
| 223 | } else break :scan; | ||
| 224 | } | ||
| 225 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 226 | if (state.mid_num) { | ||
| 227 | assert(isMidVal(this_p)); | ||
| 228 | assert(that_p == .Numeric); | ||
| 229 | state.mid_num = false; | ||
| 230 | continue :scan; | ||
| 231 | } | ||
| 232 | // WB13 Katakana × Katakana | ||
| 233 | if (this_p == .Katakana and that_p == .Katakana) continue :scan; | ||
| 234 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 235 | if (isExtensible(this_p) and that_p == .ExtendNumLet) continue :scan; | ||
| 236 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 237 | if (this_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | ||
| 238 | // WB15, WB16 ([^RI] ! sot) (RI RI)* RI × RI | ||
| 239 | if (that_p == .Regional_Indicator) { | ||
| 240 | if (this_p == .Regional_Indicator) { | ||
| 241 | if (state.regional) { | ||
| 242 | state.regional = false; | ||
| 243 | continue :scan; | ||
| 244 | } else { | ||
| 245 | break :scan; | ||
| 246 | } | ||
| 247 | } else { | ||
| 248 | const next_val = iter.cp_iter.peek(); | ||
| 249 | if (next_val) |next_cp| { | ||
| 250 | const next_p = iter.wb.breakProperty(next_cp.code); | ||
| 251 | if (next_p == .Regional_Indicator) { | ||
| 252 | state.regional = true; | ||
| 253 | continue :scan; | ||
| 254 | } | ||
| 255 | } else break :scan; | ||
| 256 | } | ||
| 257 | } | ||
| 258 | // WB999 Any ÷ Any | ||
| 259 | break :scan; | ||
| 260 | } else { // iter.that == null | ||
| 261 | break :scan; | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | return Word{ .len = word_len, .offset = word_start }; | ||
| 266 | } | ||
| 267 | |||
| 268 | fn advance(wb_iter: *Iterator) void { | ||
| 269 | wb_iter.this = wb_iter.that; | ||
| 270 | wb_iter.that = wb_iter.cp_iter.next(); | ||
| 271 | } | ||
| 272 | }; | ||
| 273 | |||
| 274 | //| Predicates | ||
| 275 | |||
| 276 | inline fn isNewline(wbp: WordBreakProperty) bool { | ||
| 277 | return wbp == .CR or wbp == .LF or wbp == .Newline; | ||
| 278 | } | ||
| 279 | |||
| 280 | inline fn isIgnorable(wbp: WordBreakProperty) bool { | ||
| 281 | return switch (wbp) { | ||
| 282 | .Format, .Extend, .ZWJ => true, | ||
| 283 | else => false, | ||
| 284 | }; | ||
| 285 | } | ||
| 286 | |||
| 287 | inline fn isAHLetter(wbp: WordBreakProperty) bool { | ||
| 288 | return wbp == .ALetter or wbp == .Hebrew_Letter; | ||
| 289 | } | ||
| 290 | |||
| 291 | inline fn isMidVal(wbp: WordBreakProperty) bool { | ||
| 292 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 293 | } | ||
| 294 | |||
| 295 | inline fn isExtensible(wbp: WordBreakProperty) bool { | ||
| 296 | return switch (wbp) { | ||
| 297 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | ||
| 298 | else => false, | ||
| 299 | }; | ||
| 300 | } | ||
| 301 | |||
| 79 | test "Word Break Properties" { | 302 | test "Word Break Properties" { |
| 80 | const wb = try WordBreak.init(testing.allocator); | 303 | const wb = try WordBreak.init(testing.allocator); |
| 81 | defer wb.deinit(testing.allocator); | 304 | defer wb.deinit(testing.allocator); |
| @@ -99,4 +322,9 @@ const builtin = @import("builtin"); | |||
| 99 | const compress = std.compress; | 322 | const compress = std.compress; |
| 100 | const mem = std.mem; | 323 | const mem = std.mem; |
| 101 | const Allocator = mem.Allocator; | 324 | const Allocator = mem.Allocator; |
| 325 | const assert = std.debug.assert; | ||
| 102 | const testing = std.testing; | 326 | const testing = std.testing; |
| 327 | |||
| 328 | const code_point = @import("code_point"); | ||
| 329 | const CodepointIterator = code_point.Iterator; | ||
| 330 | const CodePoint = code_point.CodePoint; | ||