diff options
| author | 2025-07-08 12:15:32 -0400 | |
|---|---|---|
| committer | 2025-07-08 12:15:32 -0400 | |
| commit | 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch) | |
| tree | 2607c185fd8053b84d60041fadc35c05a0225d34 /src | |
| parent | Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff) | |
| parent | Add Words.zig example to README (diff) | |
| download | zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.gz zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.tar.xz zg-9427a9e53aaa29ee071f4dcb35b809a699d75aa9.zip | |
Diffstat (limited to 'src')
| -rw-r--r-- | src/CaseFolding.zig | 8 | ||||
| -rw-r--r-- | src/Graphemes.zig | 479 | ||||
| -rw-r--r-- | src/Words.zig | 773 | ||||
| -rw-r--r-- | src/code_point.zig | 200 | ||||
| -rw-r--r-- | src/micro_runeset.zig | 207 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 398 |
6 files changed, 1873 insertions, 192 deletions
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index f63b860..ff41b3e 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig | |||
| @@ -300,13 +300,13 @@ fn testAllocations(allocator: Allocator) !void { | |||
| 300 | { | 300 | { |
| 301 | const normalize = try Normalize.init(allocator); | 301 | const normalize = try Normalize.init(allocator); |
| 302 | defer normalize.deinit(allocator); | 302 | defer normalize.deinit(allocator); |
| 303 | const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); | 303 | const caser = try CaseFolding.initWithNormalize(allocator, normalize); |
| 304 | defer caser1.deinit(allocator); | 304 | defer caser.deinit(allocator); |
| 305 | } | 305 | } |
| 306 | // With normalize owned | 306 | // With normalize owned |
| 307 | { | 307 | { |
| 308 | const caser2 = try CaseFolding.init(allocator); | 308 | const caser = try CaseFolding.init(allocator); |
| 309 | defer caser2.deinit(allocator); | 309 | defer caser.deinit(allocator); |
| 310 | } | 310 | } |
| 311 | } | 311 | } |
| 312 | 312 | ||
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 7bf328a..f1c56ed 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -1,12 +1,7 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Graphemes Module |
| 2 | const builtin = @import("builtin"); | 2 | //! |
| 3 | const mem = std.mem; | 3 | //! Code for handling graphemes: fragments of string which should be |
| 4 | const Allocator = mem.Allocator; | 4 | //! treated as one unit. Like Farmer Bob here: 👨🏻🌾 |
| 5 | const compress = std.compress; | ||
| 6 | const unicode = std.unicode; | ||
| 7 | |||
| 8 | const CodePoint = @import("code_point").CodePoint; | ||
| 9 | const CodePointIterator = @import("code_point").Iterator; | ||
| 10 | 5 | ||
| 11 | s1: []u16 = undefined, | 6 | s1: []u16 = undefined, |
| 12 | s2: []u16 = undefined, | 7 | s2: []u16 = undefined, |
| @@ -66,10 +61,16 @@ pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | |||
| 66 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | 61 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; |
| 67 | } | 62 | } |
| 68 | 63 | ||
| 64 | /// Returns an iterator over the graphemes in `string`. | ||
| 69 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | 65 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { |
| 70 | return Iterator.init(string, graphemes); | 66 | return Iterator.init(string, graphemes); |
| 71 | } | 67 | } |
| 72 | 68 | ||
| 69 | /// Returns a reverse iterator over the graphemes in `string`. | ||
| 70 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | ||
| 71 | return ReverseIterator.init(string, graphemes); | ||
| 72 | } | ||
| 73 | |||
| 73 | /// Indic syllable type. | 74 | /// Indic syllable type. |
| 74 | pub const Indic = enum { | 75 | pub const Indic = enum { |
| 75 | none, | 76 | none, |
| @@ -99,8 +100,8 @@ pub const Gbp = enum { | |||
| 99 | 100 | ||
| 100 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 101 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 101 | pub const Grapheme = struct { | 102 | pub const Grapheme = struct { |
| 102 | len: u8, | 103 | len: uoffset, |
| 103 | offset: u32, | 104 | offset: uoffset, |
| 104 | 105 | ||
| 105 | /// `bytes` returns the slice of bytes that correspond to | 106 | /// `bytes` returns the slice of bytes that correspond to |
| 106 | /// this grapheme cluster in `src`. | 107 | /// this grapheme cluster in `src`. |
| @@ -109,6 +110,96 @@ pub const Grapheme = struct { | |||
| 109 | } | 110 | } |
| 110 | }; | 111 | }; |
| 111 | 112 | ||
| 113 | // NOTE: graphemeAtIndex is, probably, not in an optimal form. It has the advantage | ||
| 114 | // of being composed of other parts, but the constant factor can _probably_ be improved | ||
| 115 | // by a bespoke implmentation using graphemes.graphemeBreak directly. There's a limit | ||
| 116 | // to how much cycle-bumming I'm willing to do at any given moment; that limit has been | ||
| 117 | // reached. Perhaps you, Dear Reader, might pick up the torch? | ||
| 118 | |||
| 119 | /// Returns the `Grapheme` at `string[index]`, which does not have to be a | ||
| 120 | /// valid start of a codepoint. Asserts the string is not empty. Index must be | ||
| 121 | /// less than `string.len`. Always returns a `Grapheme`. | ||
| 122 | pub fn graphemeAtIndex(graphemes: *const Graphemes, string: []const u8, index: usize) Grapheme { | ||
| 123 | assert(string.len != 0); | ||
| 124 | if (index == 0 or (index > 0 and | ||
| 125 | string[index] < 0x80 and | ||
| 126 | string[index - 1] < 0x80) and | ||
| 127 | (string[index - 1] != '\r' and string[index] != '\n')) | ||
| 128 | { | ||
| 129 | // There's always a grapheme break between two ASCII code points (except CRLF) | ||
| 130 | var iter = graphemes.iterator(string[index..]); | ||
| 131 | const next = iter.next().?; | ||
| 132 | return Grapheme{ | ||
| 133 | .len = next.len, | ||
| 134 | .offset = @as(u32, @intCast(index)) + next.offset, | ||
| 135 | }; | ||
| 136 | } // Otherwise it gets hairy. | ||
| 137 | const idx: uoffset = code_point.codepointAtIndex(string, @intCast(index)).?.offset; | ||
| 138 | if (idx == string.len) { | ||
| 139 | var iter = graphemes.reverseIterator(string); | ||
| 140 | return iter.prev().?; | ||
| 141 | } | ||
| 142 | // We're on a valid codepoint boundary, we go back from here | ||
| 143 | var r_iter = graphemes.reverseIterAtIndex(string, idx); | ||
| 144 | if (r_iter.prev()) |g| { | ||
| 145 | if (g.offset == 0) { | ||
| 146 | var iter = graphemes.iterator(string); | ||
| 147 | while (iter.next()) |g2| { | ||
| 148 | if (g2.offset <= idx and idx < g2.offset + g2.len) return g2; | ||
| 149 | } | ||
| 150 | } | ||
| 151 | } | ||
| 152 | // We need to toss one, because otherwise we might not be pending when | ||
| 153 | // we in fact need to be. | ||
| 154 | _ = r_iter.prev(); | ||
| 155 | while (r_iter.pending != .none) : (_ = r_iter.prev()) {} | ||
| 156 | var iter = graphemes.iterAtIndex(string, r_iter.cp_iter.i orelse 0); | ||
| 157 | while (iter.next()) |g| { | ||
| 158 | if (g.offset <= idx and idx < g.offset + g.len) return g; | ||
| 159 | } | ||
| 160 | unreachable; | ||
| 161 | } | ||
| 162 | |||
| 163 | /// Return a (forward) iterator of `string` after `grapheme`. | ||
| 164 | pub fn iterateAfterGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) Iterator { | ||
| 165 | return graphemes.iterAtIndex(string, grapheme.offset + grapheme.len); | ||
| 166 | } | ||
| 167 | |||
| 168 | /// Return a reverse iterator of `string` before `grapheme`. | ||
| 169 | pub fn iterateBeforeGrapheme(graphemes: *const Graphemes, string: []const u8, grapheme: Grapheme) ReverseIterator { | ||
| 170 | // This bit of weirdness is because reverse iterators are "advance last", | ||
| 171 | // while forward iterators are "advance first". This leaves some room for | ||
| 172 | // further optimization, if anyone dares. | ||
| 173 | var r_iter = graphemes.reverseIterAtIndex(string, grapheme.offset + grapheme.len - 1); | ||
| 174 | _ = r_iter.prev(); | ||
| 175 | return r_iter; | ||
| 176 | } | ||
| 177 | |||
| 178 | fn reverseIterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) ReverseIterator { | ||
| 179 | var r_iter: ReverseIterator = undefined; | ||
| 180 | r_iter.data = graphemes; | ||
| 181 | var rcp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | ||
| 182 | r_iter.buf[1] = rcp_iter.prev(); | ||
| 183 | r_iter.buf[0] = rcp_iter.prev(); | ||
| 184 | r_iter.pending = .none; | ||
| 185 | r_iter.cp_iter = rcp_iter; | ||
| 186 | return r_iter; | ||
| 187 | } | ||
| 188 | |||
| 189 | fn iterAtIndex(graphemes: *const Graphemes, string: []const u8, idx: uoffset) Iterator { | ||
| 190 | var iter: Iterator = undefined; | ||
| 191 | iter.data = graphemes; | ||
| 192 | iter.buf[0] = first: { | ||
| 193 | if (idx == string.len) break :first null; | ||
| 194 | var r_cp_iter: CodePointReverseIterator = .{ .bytes = string, .i = idx }; | ||
| 195 | break :first r_cp_iter.prev(); | ||
| 196 | }; | ||
| 197 | var cp_iter: CodePointIterator = .{ .bytes = string, .i = idx }; | ||
| 198 | iter.buf[1] = cp_iter.next(); | ||
| 199 | iter.cp_iter = cp_iter; | ||
| 200 | return iter; | ||
| 201 | } | ||
| 202 | |||
| 112 | /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. | 203 | /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. |
| 113 | pub const Iterator = struct { | 204 | pub const Iterator = struct { |
| 114 | buf: [2]?CodePoint = .{ null, null }, | 205 | buf: [2]?CodePoint = .{ null, null }, |
| @@ -143,7 +234,7 @@ pub const Iterator = struct { | |||
| 143 | 234 | ||
| 144 | const gc_start = self.buf[0].?.offset; | 235 | const gc_start = self.buf[0].?.offset; |
| 145 | var gc_len: u8 = self.buf[0].?.len; | 236 | var gc_len: u8 = self.buf[0].?.len; |
| 146 | var state = State{}; | 237 | var state = IterState{}; |
| 147 | 238 | ||
| 148 | if (graphemeBreak( | 239 | if (graphemeBreak( |
| 149 | self.buf[0].?.code, | 240 | self.buf[0].?.code, |
| @@ -173,72 +264,244 @@ pub const Iterator = struct { | |||
| 173 | const saved_cp_iter = self.cp_iter; | 264 | const saved_cp_iter = self.cp_iter; |
| 174 | const s0 = self.buf[0]; | 265 | const s0 = self.buf[0]; |
| 175 | const s1 = self.buf[1]; | 266 | const s1 = self.buf[1]; |
| 176 | 267 | defer { | |
| 177 | self.advance(); | ||
| 178 | |||
| 179 | // If no more | ||
| 180 | if (self.buf[0] == null) { | ||
| 181 | self.cp_iter = saved_cp_iter; | ||
| 182 | self.buf[0] = s0; | ||
| 183 | self.buf[1] = s1; | ||
| 184 | return null; | ||
| 185 | } | ||
| 186 | // If last one | ||
| 187 | if (self.buf[1] == null) { | ||
| 188 | const len = self.buf[0].?.len; | ||
| 189 | const offset = self.buf[0].?.offset; | ||
| 190 | self.cp_iter = saved_cp_iter; | ||
| 191 | self.buf[0] = s0; | ||
| 192 | self.buf[1] = s1; | ||
| 193 | return Grapheme{ .len = len, .offset = offset }; | ||
| 194 | } | ||
| 195 | // If ASCII | ||
| 196 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 197 | const len = self.buf[0].?.len; | ||
| 198 | const offset = self.buf[0].?.offset; | ||
| 199 | self.cp_iter = saved_cp_iter; | 268 | self.cp_iter = saved_cp_iter; |
| 200 | self.buf[0] = s0; | 269 | self.buf[0] = s0; |
| 201 | self.buf[1] = s1; | 270 | self.buf[1] = s1; |
| 202 | return Grapheme{ .len = len, .offset = offset }; | ||
| 203 | } | 271 | } |
| 272 | return self.next(); | ||
| 273 | } | ||
| 274 | }; | ||
| 204 | 275 | ||
| 205 | const gc_start = self.buf[0].?.offset; | 276 | /// Iterate a string backward by Grapheme. |
| 206 | var gc_len: u8 = self.buf[0].?.len; | 277 | pub const ReverseIterator = struct { |
| 207 | var state = State{}; | 278 | buf: [2]?CodePoint = .{ null, null }, |
| 279 | cp_iter: CodePointReverseIterator, | ||
| 280 | data: *const Graphemes, | ||
| 281 | /// Codepoint read from `cp_iter` but not returned by `previous` | ||
| 282 | pending: Pending = .none, | ||
| 208 | 283 | ||
| 209 | if (graphemeBreak( | 284 | const Pending = union(enum) { |
| 210 | self.buf[0].?.code, | 285 | none: void, |
| 211 | self.buf[1].?.code, | 286 | /// Count of pending RI codepoints, it is an even number |
| 212 | self.data, | 287 | ri_count: usize, |
| 213 | &state, | 288 | /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji |
| 214 | )) { | 289 | extend_end: uoffset, |
| 215 | self.cp_iter = saved_cp_iter; | 290 | }; |
| 216 | self.buf[0] = s0; | ||
| 217 | self.buf[1] = s1; | ||
| 218 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 219 | } | ||
| 220 | 291 | ||
| 221 | while (true) { | 292 | const Self = @This(); |
| 222 | self.advance(); | ||
| 223 | if (self.buf[0] == null) break; | ||
| 224 | 293 | ||
| 225 | gc_len += self.buf[0].?.len; | 294 | pub fn init(str: []const u8, data: *const Graphemes) Self { |
| 295 | var self: Self = .{ .cp_iter = .init(str), .data = data }; | ||
| 296 | self.advance(); | ||
| 297 | self.advance(); | ||
| 298 | return self; | ||
| 299 | } | ||
| 300 | |||
| 301 | fn advance(self: *Self) void { | ||
| 302 | self.buf[1] = self.buf[0]; | ||
| 303 | self.buf[0] = self.cp_iter.prev(); | ||
| 304 | } | ||
| 305 | |||
| 306 | pub fn peek(self: *Self) ?Grapheme { | ||
| 307 | const cache = .{ self.buf, self.cp_iter, self.pending }; | ||
| 308 | defer self.buf, self.cp_iter, self.pending = cache; | ||
| 309 | return self.prev(); | ||
| 310 | } | ||
| 311 | |||
| 312 | pub fn prev(self: *Self) ?Grapheme { | ||
| 313 | if (self.buf[1] == null) return null; | ||
| 314 | |||
| 315 | const grapheme_end: uoffset = end: { | ||
| 316 | const codepoint = self.buf[1].?; | ||
| 317 | |||
| 318 | switch (self.pending) { | ||
| 319 | // BUF: [?Any, Any] | ||
| 320 | .none => break :end codepoint.offset + codepoint.len, | ||
| 321 | .ri_count => |ri_count| { | ||
| 322 | std.debug.assert(ri_count > 0); | ||
| 323 | std.debug.assert(ri_count % 2 == 0); | ||
| 324 | |||
| 325 | if (ri_count > 2) { | ||
| 326 | self.pending.ri_count -= 2; | ||
| 327 | |||
| 328 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 329 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 330 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 331 | return Grapheme{ | ||
| 332 | .len = 8, | ||
| 333 | .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), | ||
| 334 | }; | ||
| 335 | } else { | ||
| 336 | self.pending = .{ .none = {} }; | ||
| 337 | break :end codepoint.offset + codepoint.len + 4; | ||
| 338 | } | ||
| 339 | }, | ||
| 340 | // BUF: [?Any, Extend] Extend* ZWJ | ||
| 341 | .extend_end => |extend_end| { | ||
| 342 | self.pending = .{ .none = {} }; | ||
| 343 | break :end extend_end; | ||
| 344 | }, | ||
| 345 | } | ||
| 346 | }; | ||
| 347 | |||
| 348 | while (self.buf[0] != null) { | ||
| 349 | var state: IterState = .{}; | ||
| 350 | state.xpic = true; | ||
| 351 | state.regional = false; | ||
| 352 | state.indic = true; | ||
| 226 | 353 | ||
| 227 | if (graphemeBreak( | 354 | if (graphemeBreak( |
| 228 | self.buf[0].?.code, | 355 | self.buf[0].?.code, |
| 229 | if (self.buf[1]) |ncp| ncp.code else 0, | 356 | self.buf[1].?.code, |
| 230 | self.data, | 357 | self.data, |
| 231 | &state, | 358 | &state, |
| 232 | )) break; | 359 | )) break; |
| 360 | |||
| 361 | self.advance(); | ||
| 362 | |||
| 363 | if (!state.indic) { | ||
| 364 | |||
| 365 | // BUF: [?Any, Extend | Linker] Consonant | ||
| 366 | var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 367 | |||
| 368 | indic: while (true) { | ||
| 369 | if (self.buf[0] == null) { | ||
| 370 | self.pending = .{ .extend_end = indic_offset }; | ||
| 371 | return .{ | ||
| 372 | .len = @intCast(grapheme_end - indic_offset), | ||
| 373 | .offset = indic_offset, | ||
| 374 | }; | ||
| 375 | } | ||
| 376 | |||
| 377 | const codepoint = self.buf[0].?; | ||
| 378 | |||
| 379 | switch (self.data.indic(codepoint.code)) { | ||
| 380 | .Extend, .Linker => { | ||
| 381 | self.advance(); | ||
| 382 | continue :indic; | ||
| 383 | }, | ||
| 384 | .Consonant => { | ||
| 385 | // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant | ||
| 386 | indic_offset = codepoint.offset; | ||
| 387 | self.advance(); | ||
| 388 | |||
| 389 | if (self.buf[0]) |cp1| { | ||
| 390 | state.indic = true; | ||
| 391 | |||
| 392 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | ||
| 393 | |||
| 394 | if (!state.indic) { | ||
| 395 | continue :indic; | ||
| 396 | } else { | ||
| 397 | break :indic; | ||
| 398 | } | ||
| 399 | } else { | ||
| 400 | break :indic; | ||
| 401 | } | ||
| 402 | }, | ||
| 403 | .none => { | ||
| 404 | // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant | ||
| 405 | self.pending = .{ .extend_end = indic_offset }; | ||
| 406 | return .{ | ||
| 407 | .len = @intCast(grapheme_end - indic_offset), | ||
| 408 | .offset = indic_offset, | ||
| 409 | }; | ||
| 410 | }, | ||
| 411 | } | ||
| 412 | } | ||
| 413 | } | ||
| 414 | |||
| 415 | if (!state.xpic) { | ||
| 416 | // BUF: [?Any, ZWJ] Emoji | ||
| 417 | var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 418 | |||
| 419 | // Look for previous Emoji | ||
| 420 | emoji: while (true) { | ||
| 421 | if (self.buf[0] == null) { | ||
| 422 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 423 | return .{ | ||
| 424 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 425 | .offset = emoji_offset, | ||
| 426 | }; | ||
| 427 | } | ||
| 428 | |||
| 429 | const codepoint = self.buf[0].?; | ||
| 430 | |||
| 431 | if (self.data.gbp(codepoint.code) == .Extend) { | ||
| 432 | self.advance(); | ||
| 433 | continue :emoji; | ||
| 434 | } | ||
| 435 | |||
| 436 | if (self.data.isEmoji(codepoint.code)) { | ||
| 437 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* | ||
| 438 | emoji_offset = codepoint.offset; | ||
| 439 | self.advance(); | ||
| 440 | |||
| 441 | if (self.buf[0] != null and | ||
| 442 | // ZWJ = 0x200d | ||
| 443 | self.buf[0].?.code == 0x200d) | ||
| 444 | { | ||
| 445 | // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* | ||
| 446 | // Back at the beginning of the loop, "recursively" look for emoji | ||
| 447 | self.advance(); | ||
| 448 | continue :emoji; | ||
| 449 | } else { | ||
| 450 | // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* | ||
| 451 | break :emoji; | ||
| 452 | } | ||
| 453 | } else { | ||
| 454 | // BUF: [Any, Extend] (Extend* ZWJ Emoji)* | ||
| 455 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 456 | return .{ | ||
| 457 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 458 | .offset = emoji_offset, | ||
| 459 | }; | ||
| 460 | } | ||
| 461 | } | ||
| 462 | } | ||
| 463 | |||
| 464 | if (state.regional) { | ||
| 465 | var ri_count: usize = 0; | ||
| 466 | while (self.buf[0] != null and | ||
| 467 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | ||
| 468 | { | ||
| 469 | ri_count += 1; | ||
| 470 | self.advance(); | ||
| 471 | } | ||
| 472 | |||
| 473 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 474 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 475 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 476 | if (ri_count == 0) { | ||
| 477 | // There are no pending RI codepoints | ||
| 478 | } else if (ri_count % 2 == 0) { | ||
| 479 | self.pending = .{ .ri_count = ri_count }; | ||
| 480 | return .{ .len = 8, .offset = grapheme_end - 8 }; | ||
| 481 | } else { | ||
| 482 | // Add one to count for the unused RI | ||
| 483 | self.pending = .{ .ri_count = ri_count + 1 }; | ||
| 484 | return .{ .len = 4, .offset = grapheme_end - 4 }; | ||
| 485 | } | ||
| 486 | } | ||
| 233 | } | 487 | } |
| 234 | self.cp_iter = saved_cp_iter; | ||
| 235 | self.buf[0] = s0; | ||
| 236 | self.buf[1] = s1; | ||
| 237 | 488 | ||
| 238 | return Grapheme{ .len = gc_len, .offset = gc_start }; | 489 | const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; |
| 490 | self.advance(); | ||
| 491 | return .{ | ||
| 492 | .len = @intCast(grapheme_end - grapheme_start), | ||
| 493 | .offset = grapheme_start, | ||
| 494 | }; | ||
| 239 | } | 495 | } |
| 240 | }; | 496 | }; |
| 241 | 497 | ||
| 498 | /// Grapheme Iterator state. | ||
| 499 | pub const IterState = packed struct(u3) { | ||
| 500 | xpic: bool = false, | ||
| 501 | regional: bool = false, | ||
| 502 | indic: bool = false, | ||
| 503 | }; | ||
| 504 | |||
| 242 | // Predicates | 505 | // Predicates |
| 243 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 506 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 244 | // Extract relevant properties. | 507 | // Extract relevant properties. |
| @@ -246,44 +509,6 @@ fn isBreaker(cp: u21, data: *const Graphemes) bool { | |||
| 246 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 509 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 247 | } | 510 | } |
| 248 | 511 | ||
| 249 | // Grapheme break state. | ||
| 250 | pub const State = struct { | ||
| 251 | bits: u3 = 0, | ||
| 252 | |||
| 253 | // Extended Pictographic (emoji) | ||
| 254 | fn hasXpic(self: State) bool { | ||
| 255 | return self.bits & 1 == 1; | ||
| 256 | } | ||
| 257 | fn setXpic(self: *State) void { | ||
| 258 | self.bits |= 1; | ||
| 259 | } | ||
| 260 | fn unsetXpic(self: *State) void { | ||
| 261 | self.bits ^= 1; | ||
| 262 | } | ||
| 263 | |||
| 264 | // Regional Indicatior (flags) | ||
| 265 | fn hasRegional(self: State) bool { | ||
| 266 | return self.bits & 2 == 2; | ||
| 267 | } | ||
| 268 | fn setRegional(self: *State) void { | ||
| 269 | self.bits |= 2; | ||
| 270 | } | ||
| 271 | fn unsetRegional(self: *State) void { | ||
| 272 | self.bits ^= 2; | ||
| 273 | } | ||
| 274 | |||
| 275 | // Indic Conjunct | ||
| 276 | fn hasIndic(self: State) bool { | ||
| 277 | return self.bits & 4 == 4; | ||
| 278 | } | ||
| 279 | fn setIndic(self: *State) void { | ||
| 280 | self.bits |= 4; | ||
| 281 | } | ||
| 282 | fn unsetIndic(self: *State) void { | ||
| 283 | self.bits ^= 4; | ||
| 284 | } | ||
| 285 | }; | ||
| 286 | |||
| 287 | /// `graphemeBreak` returns true only if a grapheme break point is required | 512 | /// `graphemeBreak` returns true only if a grapheme break point is required |
| 288 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | 513 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling |
| 289 | /// iteratively over a sequence of code points, this function must be called | 514 | /// iteratively over a sequence of code points, this function must be called |
| @@ -294,7 +519,7 @@ pub fn graphemeBreak( | |||
| 294 | cp1: u21, | 519 | cp1: u21, |
| 295 | cp2: u21, | 520 | cp2: u21, |
| 296 | data: *const Graphemes, | 521 | data: *const Graphemes, |
| 297 | state: *State, | 522 | state: *IterState, |
| 298 | ) bool { | 523 | ) bool { |
| 299 | // Extract relevant properties. | 524 | // Extract relevant properties. |
| 300 | const cp1_gbp_prop = data.gbp(cp1); | 525 | const cp1_gbp_prop = data.gbp(cp1); |
| @@ -306,9 +531,9 @@ pub fn graphemeBreak( | |||
| 306 | const cp2_is_emoji = data.isEmoji(cp2); | 531 | const cp2_is_emoji = data.isEmoji(cp2); |
| 307 | 532 | ||
| 308 | // GB11: Emoji Extend* ZWJ x Emoji | 533 | // GB11: Emoji Extend* ZWJ x Emoji |
| 309 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | 534 | if (!state.xpic and cp1_is_emoji) state.xpic = true; |
| 310 | // GB9c: Indic Conjunct Break | 535 | // GB9c: Indic Conjunct Break |
| 311 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); | 536 | if (!state.indic and cp1_indic_prop == .Consonant) state.indic = true; |
| 312 | 537 | ||
| 313 | // GB3: CR x LF | 538 | // GB3: CR x LF |
| 314 | if (cp1 == '\r' and cp2 == '\n') return false; | 539 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -317,11 +542,11 @@ pub fn graphemeBreak( | |||
| 317 | if (isBreaker(cp1, data)) return true; | 542 | if (isBreaker(cp1, data)) return true; |
| 318 | 543 | ||
| 319 | // GB11: Emoji Extend* ZWJ x Emoji | 544 | // GB11: Emoji Extend* ZWJ x Emoji |
| 320 | if (state.hasXpic() and | 545 | if (state.xpic and |
| 321 | cp1_gbp_prop == .ZWJ and | 546 | cp1_gbp_prop == .ZWJ and |
| 322 | cp2_is_emoji) | 547 | cp2_is_emoji) |
| 323 | { | 548 | { |
| 324 | state.unsetXpic(); | 549 | state.xpic = false; |
| 325 | return false; | 550 | return false; |
| 326 | } | 551 | } |
| 327 | 552 | ||
| @@ -336,11 +561,11 @@ pub fn graphemeBreak( | |||
| 336 | 561 | ||
| 337 | // GB12, GB13: RI x RI | 562 | // GB12, GB13: RI x RI |
| 338 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 563 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| 339 | if (state.hasRegional()) { | 564 | if (state.regional) { |
| 340 | state.unsetRegional(); | 565 | state.regional = false; |
| 341 | return true; | 566 | return true; |
| 342 | } else { | 567 | } else { |
| 343 | state.setRegional(); | 568 | state.regional = true; |
| 344 | return false; | 569 | return false; |
| 345 | } | 570 | } |
| 346 | } | 571 | } |
| @@ -365,25 +590,25 @@ pub fn graphemeBreak( | |||
| 365 | } | 590 | } |
| 366 | 591 | ||
| 367 | // GB9c: Indic Conjunct Break | 592 | // GB9c: Indic Conjunct Break |
| 368 | if (state.hasIndic() and | 593 | if (state.indic and |
| 369 | cp1_indic_prop == .Consonant and | 594 | cp1_indic_prop == .Consonant and |
| 370 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | 595 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) |
| 371 | { | 596 | { |
| 372 | return false; | 597 | return false; |
| 373 | } | 598 | } |
| 374 | 599 | ||
| 375 | if (state.hasIndic() and | 600 | if (state.indic and |
| 376 | cp1_indic_prop == .Extend and | 601 | cp1_indic_prop == .Extend and |
| 377 | cp2_indic_prop == .Linker) | 602 | cp2_indic_prop == .Linker) |
| 378 | { | 603 | { |
| 379 | return false; | 604 | return false; |
| 380 | } | 605 | } |
| 381 | 606 | ||
| 382 | if (state.hasIndic() and | 607 | if (state.indic and |
| 383 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | 608 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and |
| 384 | cp2_indic_prop == .Consonant) | 609 | cp2_indic_prop == .Consonant) |
| 385 | { | 610 | { |
| 386 | state.unsetIndic(); | 611 | state.indic = false; |
| 387 | return false; | 612 | return false; |
| 388 | } | 613 | } |
| 389 | 614 | ||
| @@ -421,3 +646,39 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 421 | try std.testing.expectEqual(@as(usize, 2), i); | 646 | try std.testing.expectEqual(@as(usize, 2), i); |
| 422 | } | 647 | } |
| 423 | } | 648 | } |
| 649 | |||
| 650 | test "Iterator.peek" { | ||
| 651 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 652 | const data = try Graphemes.init(std.testing.allocator); | ||
| 653 | defer data.deinit(std.testing.allocator); | ||
| 654 | |||
| 655 | var iter = data.iterator(peek_seq); | ||
| 656 | const peek_a = iter.peek().?; | ||
| 657 | const next_a = iter.next().?; | ||
| 658 | try std.testing.expectEqual(peek_a, next_a); | ||
| 659 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 660 | const peek_d1 = iter.peek().?; | ||
| 661 | const peek_d2 = iter.peek().?; | ||
| 662 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 663 | const next_d = iter.next().?; | ||
| 664 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 665 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 666 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 667 | try std.testing.expectEqual(null, iter.peek()); | ||
| 668 | try std.testing.expectEqual(null, iter.peek()); | ||
| 669 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 670 | } | ||
| 671 | |||
| 672 | const std = @import("std"); | ||
| 673 | const builtin = @import("builtin"); | ||
| 674 | const assert = std.debug.assert; | ||
| 675 | const mem = std.mem; | ||
| 676 | const Allocator = mem.Allocator; | ||
| 677 | const compress = std.compress; | ||
| 678 | const unicode = std.unicode; | ||
| 679 | |||
| 680 | const code_point = @import("code_point"); | ||
| 681 | const CodePoint = code_point.CodePoint; | ||
| 682 | const CodePointIterator = code_point.Iterator; | ||
| 683 | const CodePointReverseIterator = code_point.ReverseIterator; | ||
| 684 | const uoffset = code_point.uoffset; | ||
diff --git a/src/Words.zig b/src/Words.zig new file mode 100644 index 0000000..617c34d --- /dev/null +++ b/src/Words.zig | |||
| @@ -0,0 +1,773 @@ | |||
| 1 | //! Word Breaking Algorithm. | ||
| 2 | //! | ||
| 3 | //! https://www.unicode.org/reports/tr29/#Word_Boundaries | ||
| 4 | //! | ||
| 5 | |||
| 6 | const WordBreakProperty = enum(u5) { | ||
| 7 | none, | ||
| 8 | Double_Quote, | ||
| 9 | Single_Quote, | ||
| 10 | Hebrew_Letter, | ||
| 11 | CR, | ||
| 12 | LF, | ||
| 13 | Newline, | ||
| 14 | Extend, | ||
| 15 | Regional_Indicator, | ||
| 16 | Format, | ||
| 17 | Katakana, | ||
| 18 | ALetter, | ||
| 19 | MidLetter, | ||
| 20 | MidNum, | ||
| 21 | MidNumLet, | ||
| 22 | Numeric, | ||
| 23 | ExtendNumLet, | ||
| 24 | ZWJ, | ||
| 25 | WSegSpace, | ||
| 26 | }; | ||
| 27 | |||
| 28 | s1: []u16 = undefined, | ||
| 29 | s2: []u5 = undefined, | ||
| 30 | |||
| 31 | const Words = @This(); | ||
| 32 | |||
| 33 | pub fn init(allocator: Allocator) Allocator.Error!Words { | ||
| 34 | var wb: Words = undefined; | ||
| 35 | try wb.setup(allocator); | ||
| 36 | return wb; | ||
| 37 | } | ||
| 38 | |||
| 39 | pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { | ||
| 40 | wb.setupImpl(allocator) catch |err| { | ||
| 41 | switch (err) { | ||
| 42 | error.OutOfMemory => |e| return e, | ||
| 43 | else => unreachable, | ||
| 44 | } | ||
| 45 | }; | ||
| 46 | } | ||
| 47 | |||
| 48 | pub fn deinit(words: *const Words, allocator: mem.Allocator) void { | ||
| 49 | allocator.free(words.s1); | ||
| 50 | allocator.free(words.s2); | ||
| 51 | } | ||
| 52 | |||
| 53 | /// Represents a Unicode word span, as an offset into the source string | ||
| 54 | /// and the length of the word. | ||
| 55 | pub const Word = struct { | ||
| 56 | offset: uoffset, | ||
| 57 | len: uoffset, | ||
| 58 | |||
| 59 | /// Returns a slice of the word given the source string. | ||
| 60 | pub fn bytes(word: Word, src: []const u8) []const u8 { | ||
| 61 | return src[word.offset..][0..word.len]; | ||
| 62 | } | ||
| 63 | }; | ||
| 64 | |||
| 65 | /// Returns the word break property type for `cp`. | ||
| 66 | pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { | ||
| 67 | return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); | ||
| 68 | } | ||
| 69 | |||
| 70 | /// Convenience function for working with CodePoints | ||
| 71 | fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { | ||
| 72 | return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); | ||
| 73 | } | ||
| 74 | |||
| 75 | /// Returns the Word at the given index. Asserts that the index is less than | ||
| 76 | /// `string.len`, and that the string is not empty. Always returns a word. | ||
| 77 | /// The index does not have to be the start of a codepoint in the word. | ||
| 78 | pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { | ||
| 79 | assert(index < string.len and string.len > 0); | ||
| 80 | var iter_back: ReverseIterator = reverseFromIndex(words, string, index); | ||
| 81 | const first_back = iter_back.prev(); | ||
| 82 | if (first_back) |back| { | ||
| 83 | if (back.offset == 0) { | ||
| 84 | var iter_fwd = words.iterator(string); | ||
| 85 | while (iter_fwd.next()) |word| { | ||
| 86 | if (word.offset <= index and index < word.offset + word.len) | ||
| 87 | return word; | ||
| 88 | } | ||
| 89 | } | ||
| 90 | } else { | ||
| 91 | var iter_fwd = words.iterator(string); | ||
| 92 | while (iter_fwd.next()) |word| { | ||
| 93 | if (word.offset <= index and index < word.offset + word.len) | ||
| 94 | return word; | ||
| 95 | } | ||
| 96 | } | ||
| 97 | _ = iter_back.prev(); | ||
| 98 | // There's sometimes flags: | ||
| 99 | if (iter_back.flags > 0) { | ||
| 100 | while (iter_back.flags > 0) { | ||
| 101 | if (iter_back.prev()) |_| { | ||
| 102 | continue; | ||
| 103 | } else { | ||
| 104 | break; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | } | ||
| 108 | var iter_fwd = iter_back.forwardIterator(); | ||
| 109 | while (iter_fwd.next()) |word| { | ||
| 110 | if (word.offset <= index and index < word.offset + word.len) | ||
| 111 | return word; | ||
| 112 | } | ||
| 113 | unreachable; | ||
| 114 | } | ||
| 115 | |||
| 116 | /// Returns an iterator over words in `slice`. | ||
| 117 | pub fn iterator(words: *const Words, slice: []const u8) Iterator { | ||
| 118 | return Iterator.init(words, slice); | ||
| 119 | } | ||
| 120 | |||
| 121 | /// Returns a reverse iterator over the words in `slice`. | ||
| 122 | pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { | ||
| 123 | return ReverseIterator.init(words, slice); | ||
| 124 | } | ||
| 125 | |||
| 126 | /// Returns an iterator after the `word` in `slice`. | ||
| 127 | pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { | ||
| 128 | return forwardFromIndex(words, slice, word.offset + word.len); | ||
| 129 | } | ||
| 130 | |||
| 131 | /// Returns a reverse iterator before the `word` in `slice`. | ||
| 132 | pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { | ||
| 133 | return reverseFromIndex(words, slice, word.offset); | ||
| 134 | } | ||
| 135 | |||
| 136 | /// An iterator, forward, over all words in a provided string. | ||
| 137 | pub const Iterator = struct { | ||
| 138 | this: ?CodePoint = null, | ||
| 139 | that: ?CodePoint = null, | ||
| 140 | cp_iter: CodepointIterator, | ||
| 141 | wb: *const Words, | ||
| 142 | |||
| 143 | /// Assumes `str` is valid UTF-8. | ||
| 144 | pub fn init(words: *const Words, str: []const u8) Iterator { | ||
| 145 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; | ||
| 146 | wb_iter.advance(); | ||
| 147 | return wb_iter; | ||
| 148 | } | ||
| 149 | |||
| 150 | /// Returns the next word segment, without advancing. | ||
| 151 | pub fn peek(iter: *Iterator) ?Word { | ||
| 152 | const cache = .{ iter.this, iter.that, iter.cp_iter }; | ||
| 153 | defer { | ||
| 154 | iter.this, iter.that, iter.cp_iter = cache; | ||
| 155 | } | ||
| 156 | return iter.next(); | ||
| 157 | } | ||
| 158 | |||
| 159 | /// Returns a reverse iterator from the point this iterator is paused | ||
| 160 | /// at. Usually, and always when using the API to create iterators, | ||
| 161 | /// calling `prev()` will return the word just seen. | ||
| 162 | pub fn reverseIterator(iter: *Iterator) ReverseIterator { | ||
| 163 | var cp_it = iter.cp_iter.reverseIterator(); | ||
| 164 | if (iter.that) |_| | ||
| 165 | _ = cp_it.prev(); | ||
| 166 | if (iter.cp_iter.peek()) |_| | ||
| 167 | _ = cp_it.prev(); | ||
| 168 | return .{ | ||
| 169 | .wb = iter.wb, | ||
| 170 | .before = cp_it.prev(), | ||
| 171 | .after = iter.that, | ||
| 172 | .cp_iter = cp_it, | ||
| 173 | }; | ||
| 174 | } | ||
| 175 | |||
| 176 | /// Returns the next word segment, if any. | ||
| 177 | pub fn next(iter: *Iterator) ?Word { | ||
| 178 | iter.advance(); | ||
| 179 | |||
| 180 | // Done? | ||
| 181 | if (iter.this == null) return null; | ||
| 182 | // Last? | ||
| 183 | if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; | ||
| 184 | |||
| 185 | const word_start = iter.this.?.offset; | ||
| 186 | var word_len: uoffset = 0; | ||
| 187 | |||
| 188 | // State variables. | ||
| 189 | var last_p: WordBreakProperty = .none; | ||
| 190 | var last_last_p: WordBreakProperty = .none; | ||
| 191 | var ri_count: usize = 0; | ||
| 192 | |||
| 193 | scan: while (true) : (iter.advance()) { | ||
| 194 | const this = iter.this.?; | ||
| 195 | word_len += this.len; | ||
| 196 | if (iter.that) |that| { | ||
| 197 | const this_p = iter.wb.breakProp(this); | ||
| 198 | const that_p = iter.wb.breakProp(that); | ||
| 199 | if (!isIgnorable(this_p)) { | ||
| 200 | last_last_p = last_p; | ||
| 201 | last_p = this_p; | ||
| 202 | } | ||
| 203 | // WB3 CR × LF | ||
| 204 | if (this_p == .CR and that_p == .LF) continue :scan; | ||
| 205 | // WB3a (Newline | CR | LF) ÷ | ||
| 206 | if (isNewline(this_p)) break :scan; | ||
| 207 | // WB3b ÷ (Newline | CR | LF) | ||
| 208 | if (isNewline(that_p)) break :scan; | ||
| 209 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 210 | if (this_p == .ZWJ and ext_pict.isMatch(that.bytes(iter.cp_iter.bytes))) { | ||
| 211 | continue :scan; | ||
| 212 | } | ||
| 213 | // WB3d WSegSpace × WSegSpace | ||
| 214 | if (this_p == .WSegSpace and that_p == .WSegSpace) continue :scan; | ||
| 215 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 216 | if (isIgnorable(that_p)) { | ||
| 217 | continue :scan; | ||
| 218 | } // Now we use last_p instead of this_p for ignorable's sake | ||
| 219 | if (isAHLetter(last_p)) { | ||
| 220 | // WB5 AHLetter × AHLetter | ||
| 221 | if (isAHLetter(that_p)) continue :scan; | ||
| 222 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 223 | if (isMidVal(that_p)) { | ||
| 224 | const next_val = iter.peekPast(); | ||
| 225 | if (next_val) |next_cp| { | ||
| 226 | const next_p = iter.wb.breakProp(next_cp); | ||
| 227 | if (isAHLetter(next_p)) { | ||
| 228 | continue :scan; | ||
| 229 | } | ||
| 230 | } | ||
| 231 | } | ||
| 232 | } | ||
| 233 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 234 | if (isAHLetter(last_last_p) and isMidVal(last_p) and isAHLetter(that_p)) { | ||
| 235 | continue :scan; | ||
| 236 | } | ||
| 237 | if (last_p == .Hebrew_Letter) { | ||
| 238 | // WB7a Hebrew_Letter × Single_Quote | ||
| 239 | if (that_p == .Single_Quote) continue :scan; | ||
| 240 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 241 | if (that_p == .Double_Quote) { | ||
| 242 | const next_val = iter.peekPast(); | ||
| 243 | if (next_val) |next_cp| { | ||
| 244 | const next_p = iter.wb.breakProp(next_cp); | ||
| 245 | if (next_p == .Hebrew_Letter) { | ||
| 246 | continue :scan; | ||
| 247 | } | ||
| 248 | } | ||
| 249 | } | ||
| 250 | } | ||
| 251 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 252 | if (last_last_p == .Hebrew_Letter and last_p == .Double_Quote and that_p == .Hebrew_Letter) | ||
| 253 | continue :scan; | ||
| 254 | // WB8 Numeric × Numeric | ||
| 255 | if (last_p == .Numeric and that_p == .Numeric) continue :scan; | ||
| 256 | // WB9 AHLetter × Numeric | ||
| 257 | if (isAHLetter(last_p) and that_p == .Numeric) continue :scan; | ||
| 258 | // WB10 Numeric × AHLetter | ||
| 259 | if (last_p == .Numeric and isAHLetter(that_p)) continue :scan; | ||
| 260 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 261 | if (last_last_p == .Numeric and isMidNum(last_p) and that_p == .Numeric) | ||
| 262 | continue :scan; | ||
| 263 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 264 | if (last_p == .Numeric and isMidNum(that_p)) { | ||
| 265 | const next_val = iter.peekPast(); | ||
| 266 | if (next_val) |next_cp| { | ||
| 267 | const next_p = iter.wb.breakProp(next_cp); | ||
| 268 | if (next_p == .Numeric) { | ||
| 269 | continue :scan; | ||
| 270 | } | ||
| 271 | } | ||
| 272 | } | ||
| 273 | // WB13 Katakana × Katakana | ||
| 274 | if (last_p == .Katakana and that_p == .Katakana) continue :scan; | ||
| 275 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 276 | if (isExtensible(last_p) and that_p == .ExtendNumLet) continue :scan; | ||
| 277 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 278 | if (last_p == .ExtendNumLet and isExtensible(that_p)) continue :scan; | ||
| 279 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | ||
| 280 | const maybe_flag = that_p == .Regional_Indicator and last_p == .Regional_Indicator; | ||
| 281 | if (maybe_flag) { | ||
| 282 | ri_count += 1; | ||
| 283 | if (ri_count % 2 == 1) continue :scan; | ||
| 284 | } | ||
| 285 | // WB999 Any ÷ Any | ||
| 286 | break :scan; | ||
| 287 | } else { // iter.that == null | ||
| 288 | break :scan; | ||
| 289 | } | ||
| 290 | } | ||
| 291 | |||
| 292 | return Word{ .len = word_len, .offset = word_start }; | ||
| 293 | } | ||
| 294 | |||
| 295 | pub fn format(iter: Iterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 296 | try writer.print( | ||
| 297 | "Iterator {{ .this = {any}, .that = {any} }}", | ||
| 298 | .{ iter.this, iter.that }, | ||
| 299 | ); | ||
| 300 | } | ||
| 301 | |||
| 302 | fn advance(iter: *Iterator) void { | ||
| 303 | iter.this = iter.that; | ||
| 304 | iter.that = iter.cp_iter.next(); | ||
| 305 | } | ||
| 306 | |||
| 307 | fn peekPast(iter: *Iterator) ?CodePoint { | ||
| 308 | const save_cp = iter.cp_iter; | ||
| 309 | defer iter.cp_iter = save_cp; | ||
| 310 | while (iter.cp_iter.peek()) |peeked| { | ||
| 311 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 312 | _ = iter.cp_iter.next(); | ||
| 313 | } | ||
| 314 | return null; | ||
| 315 | } | ||
| 316 | }; | ||
| 317 | |||
| 318 | /// An iterator, backward, over all words in a provided string. | ||
| 319 | pub const ReverseIterator = struct { | ||
| 320 | after: ?CodePoint = null, | ||
| 321 | before: ?CodePoint = null, | ||
| 322 | cp_iter: ReverseCodepointIterator, | ||
| 323 | wb: *const Words, | ||
| 324 | flags: usize = 0, | ||
| 325 | |||
| 326 | /// Assumes `str` is valid UTF-8. | ||
| 327 | pub fn init(words: *const Words, str: []const u8) ReverseIterator { | ||
| 328 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; | ||
| 329 | wb_iter.advance(); | ||
| 330 | return wb_iter; | ||
| 331 | } | ||
| 332 | |||
| 333 | /// Returns the previous word segment, if any, without advancing. | ||
| 334 | pub fn peek(iter: *ReverseIterator) ?Word { | ||
| 335 | const cache = .{ iter.before, iter.after, iter.cp_iter, iter.flags }; | ||
| 336 | defer { | ||
| 337 | iter.before, iter.after, iter.cp_iter, iter.flags = cache; | ||
| 338 | } | ||
| 339 | return iter.prev(); | ||
| 340 | } | ||
| 341 | |||
| 342 | /// Return a forward iterator from where this iterator paused. Usually, | ||
| 343 | /// and always when using the API to create iterators, calling `next()` | ||
| 344 | /// will return the word just seen. | ||
| 345 | pub fn forwardIterator(iter: *ReverseIterator) Iterator { | ||
| 346 | var cp_it = iter.cp_iter.forwardIterator(); | ||
| 347 | if (iter.before) |_| | ||
| 348 | _ = cp_it.next(); | ||
| 349 | return .{ | ||
| 350 | .wb = iter.wb, | ||
| 351 | .this = cp_it.next(), | ||
| 352 | .that = iter.after, | ||
| 353 | .cp_iter = cp_it, | ||
| 354 | }; | ||
| 355 | } | ||
| 356 | |||
| 357 | /// Return the previous word, if any. | ||
| 358 | pub fn prev(iter: *ReverseIterator) ?Word { | ||
| 359 | iter.advance(); | ||
| 360 | |||
| 361 | // Done? | ||
| 362 | if (iter.after == null) return null; | ||
| 363 | // Last? | ||
| 364 | if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; | ||
| 365 | |||
| 366 | const word_end = iter.after.?.offset + iter.after.?.len; | ||
| 367 | var word_len: uoffset = 0; | ||
| 368 | |||
| 369 | // State variables. | ||
| 370 | var last_p: WordBreakProperty = .none; | ||
| 371 | var last_last_p: WordBreakProperty = .none; | ||
| 372 | |||
| 373 | scan: while (true) : (iter.advance()) { | ||
| 374 | const after = iter.after.?; | ||
| 375 | word_len += after.len; | ||
| 376 | if (iter.before) |before| { | ||
| 377 | var sneak = sneaky(iter); // 'sneaks' past ignorables | ||
| 378 | const after_p = iter.wb.breakProp(after); | ||
| 379 | var before_p = iter.wb.breakProp(before); | ||
| 380 | if (!isIgnorable(after_p)) { | ||
| 381 | last_last_p = last_p; | ||
| 382 | last_p = after_p; | ||
| 383 | } | ||
| 384 | // WB3 CR × LF | ||
| 385 | if (before_p == .CR and after_p == .LF) continue :scan; | ||
| 386 | // WB3a (Newline | CR | LF) ÷ | ||
| 387 | if (isNewline(before_p)) break :scan; | ||
| 388 | // WB3b ÷ (Newline | CR | LF) | ||
| 389 | if (isNewline(after_p)) break :scan; | ||
| 390 | // WB3c ZWJ × \p{Extended_Pictographic} | ||
| 391 | if (before_p == .ZWJ and ext_pict.isMatch(after.bytes(iter.cp_iter.bytes))) { | ||
| 392 | continue :scan; | ||
| 393 | } | ||
| 394 | // WB3d WSegSpace × WSegSpace | ||
| 395 | if (before_p == .WSegSpace and after_p == .WSegSpace) continue :scan; | ||
| 396 | // WB4 X (Extend | Format | ZWJ)* → X | ||
| 397 | if (isIgnorable(before_p)) { | ||
| 398 | const maybe_before = sneak.prev(); | ||
| 399 | if (maybe_before) |valid_before| { | ||
| 400 | before_p = iter.wb.breakProp(valid_before); | ||
| 401 | } else if (!isIgnorable(after_p)) { | ||
| 402 | // We're done | ||
| 403 | break :scan; | ||
| 404 | } | ||
| 405 | } | ||
| 406 | if (isIgnorable(after_p)) continue :scan; | ||
| 407 | // WB5 AHLetter × AHLetter | ||
| 408 | if (isAHLetter(last_p) and isAHLetter(before_p)) { | ||
| 409 | continue :scan; | ||
| 410 | } | ||
| 411 | // WB6 AHLetter × (MidLetter | MidNumLetQ) AHLetter | ||
| 412 | if (isAHLetter(before_p) and isMidVal(last_p) and isAHLetter(last_last_p)) { | ||
| 413 | continue :scan; | ||
| 414 | } | ||
| 415 | // WB7 AHLetter (MidLetter | MidNumLetQ) × AHLetter | ||
| 416 | if (isMidVal(before_p) and isAHLetter(last_p)) { | ||
| 417 | const prev_val = sneak.peek(); | ||
| 418 | if (prev_val) |prev_cp| { | ||
| 419 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 420 | if (isAHLetter(prev_p)) { | ||
| 421 | continue :scan; | ||
| 422 | } | ||
| 423 | } | ||
| 424 | } | ||
| 425 | // WB7a Hebrew_Letter × Single_Quote | ||
| 426 | if (before_p == .Hebrew_Letter and last_p == .Single_Quote) continue :scan; | ||
| 427 | // WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | ||
| 428 | if (before_p == .Hebrew_Letter and last_p == .Double_Quote and last_last_p == .Hebrew_Letter) { | ||
| 429 | continue :scan; | ||
| 430 | } | ||
| 431 | // WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | ||
| 432 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { | ||
| 433 | const prev_val = sneak.peek(); | ||
| 434 | if (prev_val) |prev_cp| { | ||
| 435 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 436 | if (prev_p == .Hebrew_Letter) { | ||
| 437 | continue :scan; | ||
| 438 | } | ||
| 439 | } | ||
| 440 | } | ||
| 441 | // WB8 Numeric × Numeric | ||
| 442 | if (before_p == .Numeric and last_p == .Numeric) continue :scan; | ||
| 443 | // WB9 AHLetter × Numeric | ||
| 444 | if (isAHLetter(before_p) and last_p == .Numeric) continue :scan; | ||
| 445 | // WB10 Numeric × AHLetter | ||
| 446 | if (before_p == .Numeric and isAHLetter(last_p)) continue :scan; | ||
| 447 | // WB11 Numeric (MidNum | MidNumLetQ) × Numeric | ||
| 448 | if (isMidNum(before_p) and last_p == .Numeric) { | ||
| 449 | const prev_val = sneak.peek(); | ||
| 450 | if (prev_val) |prev_cp| { | ||
| 451 | const prev_p = iter.wb.breakProp(prev_cp); | ||
| 452 | if (prev_p == .Numeric) { | ||
| 453 | continue :scan; | ||
| 454 | } | ||
| 455 | } | ||
| 456 | } | ||
| 457 | // WB12 Numeric × (MidNum | MidNumLetQ) Numeric | ||
| 458 | if (before_p == .Numeric and isMidNum(last_p) and last_last_p == .Numeric) { | ||
| 459 | continue :scan; | ||
| 460 | } | ||
| 461 | // WB13 Katakana × Katakana | ||
| 462 | if (before_p == .Katakana and last_p == .Katakana) continue :scan; | ||
| 463 | // WB13a (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet | ||
| 464 | if (isExtensible(before_p) and last_p == .ExtendNumLet) continue :scan; | ||
| 465 | // WB13b ExtendNumLet × (AHLetter | Numeric | Katakana) | ||
| 466 | if (before_p == .ExtendNumLet and isExtensible(last_p)) continue :scan; | ||
| 467 | // WB15, WB16 ([^RI] | sot) (RI RI)* RI × RI | ||
| 468 | // NOTE: | ||
| 469 | // So here we simply have to know whether a run of flags is even or odd. | ||
| 470 | // The whole run. To avoid quadratic behavior (and long flag runs are | ||
| 471 | // actually a thing in the wild), we have to count them once, store that | ||
| 472 | // on the iterator, and decrement each time we see two, possibly breaking | ||
| 473 | // once extra at the beginning. They break up one per flag, once we hit | ||
| 474 | // zero, that's all the flags. If we see another flag we do it again. | ||
| 475 | if (before_p == .Regional_Indicator and last_p == .Regional_Indicator) { | ||
| 476 | defer { | ||
| 477 | if (iter.flags > 0) iter.flags -= 1; | ||
| 478 | } | ||
| 479 | if (iter.flags == 0) { | ||
| 480 | iter.flags = sneak.countFlags(); | ||
| 481 | } | ||
| 482 | if (iter.flags % 2 == 0) { | ||
| 483 | continue :scan; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | // WB999 Any ÷ Any | ||
| 487 | break :scan; | ||
| 488 | } | ||
| 489 | break :scan; | ||
| 490 | } | ||
| 491 | return Word{ .len = word_len, .offset = word_end - word_len }; | ||
| 492 | } | ||
| 493 | |||
| 494 | pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 495 | try writer.print( | ||
| 496 | "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", | ||
| 497 | .{ iter.before, iter.after, iter.flags }, | ||
| 498 | ); | ||
| 499 | } | ||
| 500 | |||
| 501 | fn peekPast(iter: *ReverseIterator) ?CodePoint { | ||
| 502 | const save_cp = iter.cp_iter; | ||
| 503 | defer iter.cp_iter = save_cp; | ||
| 504 | while (iter.cp_iter.peek()) |peeked| { | ||
| 505 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 506 | _ = iter.cp_iter.prev(); | ||
| 507 | } | ||
| 508 | return null; | ||
| 509 | } | ||
| 510 | |||
| 511 | fn advance(iter: *ReverseIterator) void { | ||
| 512 | iter.after = iter.before; | ||
| 513 | iter.before = iter.cp_iter.prev(); | ||
| 514 | } | ||
| 515 | }; | ||
| 516 | |||
| 517 | //| Implementation Details | ||
| 518 | |||
| 519 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | ||
| 520 | fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { | ||
| 521 | var idx: uoffset = @intCast(index); | ||
| 522 | // Find the next lead byte: | ||
| 523 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | ||
| 524 | if (idx == string.len) return words.reverseIterator(string); | ||
| 525 | var iter: ReverseIterator = undefined; | ||
| 526 | iter.wb = words; | ||
| 527 | iter.flags = 0; | ||
| 528 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 529 | // Consider "abc| def" with the cursor as |. | ||
| 530 | // We need `before` to be `c` and `after` to be ' ', | ||
| 531 | // and `cp_iter.prev()` to be `b`. | ||
| 532 | var cp_iter: ReverseCodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 533 | iter.after = cp_iter.prev(); | ||
| 534 | iter.before = cp_iter.prev(); | ||
| 535 | iter.cp_iter = cp_iter; | ||
| 536 | return iter; | ||
| 537 | } | ||
| 538 | |||
| 539 | fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { | ||
| 540 | var idx: uoffset = @intCast(index); | ||
| 541 | if (idx == string.len) { | ||
| 542 | return .{ | ||
| 543 | .cp_iter = .{ .bytes = string, .i = idx }, | ||
| 544 | .this = null, | ||
| 545 | .that = null, | ||
| 546 | .wb = words, | ||
| 547 | }; | ||
| 548 | } | ||
| 549 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | ||
| 550 | if (idx == 0) return words.iterator(string); | ||
| 551 | var iter: Iterator = undefined; | ||
| 552 | iter.wb = words; | ||
| 553 | // We need to populate the CodePoints, and the codepoint iterator. | ||
| 554 | // Consider "abc |def" with the cursor as |. | ||
| 555 | // We need `this` to be ` ` and `that` to be 'd', | ||
| 556 | // and `cp_iter.next()` to be `d`. | ||
| 557 | idx -= 1; | ||
| 558 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | ||
| 559 | // "abc| def" | ||
| 560 | var cp_iter: CodepointIterator = .{ .bytes = string, .i = idx }; | ||
| 561 | iter.this = cp_iter.next(); | ||
| 562 | iter.that = cp_iter.next(); | ||
| 563 | iter.cp_iter = cp_iter; | ||
| 564 | return iter; | ||
| 565 | } | ||
| 566 | |||
| 567 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | ||
| 568 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | ||
| 569 | } | ||
| 570 | |||
| 571 | const SneakIterator = struct { | ||
| 572 | cp_iter: ReverseCodepointIterator, | ||
| 573 | wb: *const Words, | ||
| 574 | |||
| 575 | fn peek(iter: *SneakIterator) ?CodePoint { | ||
| 576 | const save_cp = iter.cp_iter; | ||
| 577 | defer iter.cp_iter = save_cp; | ||
| 578 | while (iter.cp_iter.peek()) |peeked| { | ||
| 579 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 580 | _ = iter.cp_iter.prev(); | ||
| 581 | } | ||
| 582 | return null; | ||
| 583 | } | ||
| 584 | |||
| 585 | fn countFlags(iter: *SneakIterator) usize { | ||
| 586 | var flags: usize = 0; | ||
| 587 | const save_cp = iter.cp_iter; | ||
| 588 | defer iter.cp_iter = save_cp; | ||
| 589 | while (iter.cp_iter.prev()) |cp| { | ||
| 590 | const prop = iter.wb.breakProp(cp); | ||
| 591 | if (isIgnorable(prop)) continue; | ||
| 592 | if (prop == .Regional_Indicator) { | ||
| 593 | flags += 1; | ||
| 594 | } else break; | ||
| 595 | } | ||
| 596 | return flags; | ||
| 597 | } | ||
| 598 | |||
| 599 | fn prev(iter: *SneakIterator) ?CodePoint { | ||
| 600 | while (iter.cp_iter.prev()) |peeked| { | ||
| 601 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | ||
| 602 | } | ||
| 603 | return null; | ||
| 604 | } | ||
| 605 | }; | ||
| 606 | |||
| 607 | inline fn setupImpl(wb: *Words, allocator: Allocator) !void { | ||
| 608 | const decompressor = compress.flate.inflate.decompressor; | ||
| 609 | const in_bytes = @embedFile("wbp"); | ||
| 610 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 611 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 612 | var reader = in_decomp.reader(); | ||
| 613 | |||
| 614 | const endian = builtin.cpu.arch.endian(); | ||
| 615 | |||
| 616 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 617 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 618 | errdefer allocator.free(wb.s1); | ||
| 619 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 620 | |||
| 621 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 622 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 623 | errdefer allocator.free(wb.s2); | ||
| 624 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 625 | var count_0: usize = 0; | ||
| 626 | for (wb.s2) |nyb| { | ||
| 627 | if (nyb == 0) count_0 += 1; | ||
| 628 | } | ||
| 629 | } | ||
| 630 | |||
| 631 | //| Predicates | ||
| 632 | |||
| 633 | inline fn isNewline(wbp: WordBreakProperty) bool { | ||
| 634 | return wbp == .CR or wbp == .LF or wbp == .Newline; | ||
| 635 | } | ||
| 636 | |||
| 637 | inline fn isIgnorable(wbp: WordBreakProperty) bool { | ||
| 638 | return switch (wbp) { | ||
| 639 | .Format, .Extend, .ZWJ => true, | ||
| 640 | else => false, | ||
| 641 | }; | ||
| 642 | } | ||
| 643 | |||
| 644 | inline fn isAHLetter(wbp: WordBreakProperty) bool { | ||
| 645 | return wbp == .ALetter or wbp == .Hebrew_Letter; | ||
| 646 | } | ||
| 647 | |||
| 648 | inline fn isMidVal(wbp: WordBreakProperty) bool { | ||
| 649 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 650 | } | ||
| 651 | |||
| 652 | inline fn isMidNum(wbp: WordBreakProperty) bool { | ||
| 653 | return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; | ||
| 654 | } | ||
| 655 | |||
| 656 | inline fn isExtensible(wbp: WordBreakProperty) bool { | ||
| 657 | return switch (wbp) { | ||
| 658 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | ||
| 659 | else => false, | ||
| 660 | }; | ||
| 661 | } | ||
| 662 | |||
| 663 | test "Word Break Properties" { | ||
| 664 | const wb = try Words.init(testing.allocator); | ||
| 665 | defer wb.deinit(testing.allocator); | ||
| 666 | try testing.expectEqual(.CR, wb.breakProperty('\r')); | ||
| 667 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | ||
| 668 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); | ||
| 669 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | ||
| 670 | } | ||
| 671 | |||
| 672 | test "ext_pict" { | ||
| 673 | try testing.expect(ext_pict.isMatch("👇")); | ||
| 674 | try testing.expect(ext_pict.isMatch("\u{2701}")); | ||
| 675 | } | ||
| 676 | |||
| 677 | test "Words" { | ||
| 678 | const wb = try Words.init(testing.allocator); | ||
| 679 | defer wb.deinit(testing.allocator); | ||
| 680 | const word_str = "Metonym Μετωνύμιο メトニム"; | ||
| 681 | var w_iter = wb.iterator(word_str); | ||
| 682 | try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); | ||
| 683 | // Spaces are "words" too! | ||
| 684 | try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); | ||
| 685 | const in_greek = w_iter.next().?; | ||
| 686 | for (in_greek.offset..in_greek.offset + in_greek.len) |i| { | ||
| 687 | const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); | ||
| 688 | try testing.expectEqualStrings("Μετωνύμιο", at_index); | ||
| 689 | } | ||
| 690 | _ = w_iter.next(); | ||
| 691 | try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); | ||
| 692 | } | ||
| 693 | |||
| 694 | test wordAtIndex { | ||
| 695 | const wb = try Words.init(testing.allocator); | ||
| 696 | defer wb.deinit(testing.allocator); | ||
| 697 | const t_string = "first second third"; | ||
| 698 | const second = wb.wordAtIndex(t_string, 8); | ||
| 699 | try testing.expectEqualStrings("second", second.bytes(t_string)); | ||
| 700 | const third = wb.wordAtIndex(t_string, 14); | ||
| 701 | try testing.expectEqualStrings("third", third.bytes(t_string)); | ||
| 702 | { | ||
| 703 | const first = wb.wordAtIndex(t_string, 3); | ||
| 704 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 705 | } | ||
| 706 | { | ||
| 707 | const first = wb.wordAtIndex(t_string, 0); | ||
| 708 | try testing.expectEqualStrings("first", first.bytes(t_string)); | ||
| 709 | } | ||
| 710 | const last = wb.wordAtIndex(t_string, 14); | ||
| 711 | try testing.expectEqualStrings("third", last.bytes(t_string)); | ||
| 712 | } | ||
| 713 | |||
| 714 | const testr = "don't a:ka fin!"; | ||
| 715 | |||
| 716 | test "reversal" { | ||
| 717 | const wb = try Words.init(testing.allocator); | ||
| 718 | defer wb.deinit(testing.allocator); | ||
| 719 | { | ||
| 720 | var fwd = wb.iterator(testr); | ||
| 721 | var this_word: ?Word = fwd.next(); | ||
| 722 | |||
| 723 | while (this_word) |this| : (this_word = fwd.next()) { | ||
| 724 | var back = fwd.reverseIterator(); | ||
| 725 | const that_word = back.prev(); | ||
| 726 | if (that_word) |that| { | ||
| 727 | try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); | ||
| 728 | } else { | ||
| 729 | try testing.expect(false); | ||
| 730 | } | ||
| 731 | } | ||
| 732 | } | ||
| 733 | { | ||
| 734 | var back = wb.reverseIterator(testr); | ||
| 735 | var this_word: ?Word = back.prev(); | ||
| 736 | |||
| 737 | while (this_word) |this| : (this_word = back.prev()) { | ||
| 738 | var fwd = back.forwardIterator(); | ||
| 739 | const that_word = fwd.next(); | ||
| 740 | if (that_word) |that| { | ||
| 741 | try testing.expectEqualStrings(this.bytes(testr), that.bytes(testr)); | ||
| 742 | } else { | ||
| 743 | try testing.expect(false); | ||
| 744 | } | ||
| 745 | } | ||
| 746 | } | ||
| 747 | } | ||
| 748 | |||
| 749 | fn testAllocations(allocator: Allocator) !void { | ||
| 750 | const wb = try Words.init(allocator); | ||
| 751 | wb.deinit(allocator); | ||
| 752 | } | ||
| 753 | |||
| 754 | test "allocation safety" { | ||
| 755 | try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); | ||
| 756 | } | ||
| 757 | |||
| 758 | const std = @import("std"); | ||
| 759 | const builtin = @import("builtin"); | ||
| 760 | const compress = std.compress; | ||
| 761 | const mem = std.mem; | ||
| 762 | const Allocator = mem.Allocator; | ||
| 763 | const assert = std.debug.assert; | ||
| 764 | const testing = std.testing; | ||
| 765 | |||
| 766 | const uoffset = code_point.uoffset; | ||
| 767 | |||
| 768 | const code_point = @import("code_point"); | ||
| 769 | const CodepointIterator = code_point.Iterator; | ||
| 770 | const ReverseCodepointIterator = code_point.ReverseIterator; | ||
| 771 | const CodePoint = code_point.CodePoint; | ||
| 772 | |||
| 773 | const ext_pict = @import("micro_runeset.zig").Extended_Pictographic; | ||
diff --git a/src/code_point.zig b/src/code_point.zig index fe7ad6e..7a638af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -4,18 +4,33 @@ | |||
| 4 | //! Represents invalid data according to the Replacement of Maximal | 4 | //! Represents invalid data according to the Replacement of Maximal |
| 5 | //! Subparts algorithm. | 5 | //! Subparts algorithm. |
| 6 | 6 | ||
| 7 | pub const uoffset = if (@import("config").fat_offset) u64 else u32; | ||
| 8 | |||
| 7 | /// `CodePoint` represents a Unicode code point by its code, | 9 | /// `CodePoint` represents a Unicode code point by its code, |
| 8 | /// length, and offset in the source bytes. | 10 | /// length, and offset in the source bytes. |
| 9 | pub const CodePoint = struct { | 11 | pub const CodePoint = struct { |
| 10 | code: u21, | 12 | code: u21, |
| 11 | len: u3, | 13 | len: u3, |
| 12 | offset: u32, | 14 | offset: uoffset, |
| 15 | |||
| 16 | /// Return the slice of this codepoint, given the original string. | ||
| 17 | pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { | ||
| 18 | return str[cp.offset..][0..cp.len]; | ||
| 19 | } | ||
| 20 | |||
| 21 | pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 22 | try writer.print("CodePoint '{u}' .{{ ", .{cp.code}); | ||
| 23 | try writer.print( | ||
| 24 | ".code = 0x{x}, .offset = {d}, .len = {d} }}", | ||
| 25 | .{ cp.code, cp.offset, cp.len }, | ||
| 26 | ); | ||
| 27 | } | ||
| 13 | }; | 28 | }; |
| 14 | 29 | ||
| 15 | /// This function is deprecated and will be removed in a later release. | 30 | /// This function is deprecated and will be removed in a later release. |
| 16 | /// Use `decodeAtIndex` or `decodeAtCursor`. | 31 | /// Use `decodeAtIndex` or `decodeAtCursor`. |
| 17 | pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { | 32 | pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { |
| 18 | var off: u32 = 0; | 33 | var off: uoffset = 0; |
| 19 | var maybe_code = decodeAtCursor(bytes, &off); | 34 | var maybe_code = decodeAtCursor(bytes, &off); |
| 20 | if (maybe_code) |*code| { | 35 | if (maybe_code) |*code| { |
| 21 | code.offset = offset; | 36 | code.offset = offset; |
| @@ -24,15 +39,23 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { | |||
| 24 | return null; | 39 | return null; |
| 25 | } | 40 | } |
| 26 | 41 | ||
| 42 | /// Return the codepoint at `index`, even if `index` is in the middle | ||
| 43 | /// of that codepoint. | ||
| 44 | pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { | ||
| 45 | var idx = index; | ||
| 46 | while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} | ||
| 47 | return decodeAtIndex(bytes, idx); | ||
| 48 | } | ||
| 49 | |||
| 27 | /// Decode the CodePoint, if any, at `bytes[idx]`. | 50 | /// Decode the CodePoint, if any, at `bytes[idx]`. |
| 28 | pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { | 51 | pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { |
| 29 | var off = idx; | 52 | var off = index; |
| 30 | return decodeAtCursor(bytes, &off); | 53 | return decodeAtCursor(bytes, &off); |
| 31 | } | 54 | } |
| 32 | 55 | ||
| 33 | /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the | 56 | /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the |
| 34 | /// cursor will point at the next potential codepoint index. | 57 | /// cursor will point at the next potential codepoint index. |
| 35 | pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | 58 | pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { |
| 36 | // EOS | 59 | // EOS |
| 37 | if (cursor.* >= bytes.len) return null; | 60 | if (cursor.* >= bytes.len) return null; |
| 38 | 61 | ||
| @@ -98,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 98 | } | 121 | } |
| 99 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | 122 | if (st == RUNE_REJECT or cursor.* == bytes.len) { |
| 100 | @branchHint(.cold); | 123 | @branchHint(.cold); |
| 124 | // This, and the branch below, detect truncation, the | ||
| 125 | // only invalid state handled differently by the Maximal | ||
| 126 | // Subparts algorithm. | ||
| 101 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | 127 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { |
| 102 | cursor.* -= 2; // +1 | 128 | cursor.* -= 2; // +1 |
| 103 | return .{ | 129 | return .{ |
| @@ -148,7 +174,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 148 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | 174 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| 149 | pub const Iterator = struct { | 175 | pub const Iterator = struct { |
| 150 | bytes: []const u8, | 176 | bytes: []const u8, |
| 151 | i: u32 = 0, | 177 | i: uoffset = 0, |
| 152 | 178 | ||
| 153 | pub fn init(bytes: []const u8) Iterator { | 179 | pub fn init(bytes: []const u8) Iterator { |
| 154 | return .{ .bytes = bytes, .i = 0 }; | 180 | return .{ .bytes = bytes, .i = 0 }; |
| @@ -158,10 +184,19 @@ pub const Iterator = struct { | |||
| 158 | return decodeAtCursor(self.bytes, &self.i); | 184 | return decodeAtCursor(self.bytes, &self.i); |
| 159 | } | 185 | } |
| 160 | 186 | ||
| 161 | pub fn peek(self: *Iterator) ?CodePoint { | 187 | pub fn peek(iter: *Iterator) ?CodePoint { |
| 162 | const saved_i = self.i; | 188 | const saved_i = iter.i; |
| 163 | defer self.i = saved_i; | 189 | defer iter.i = saved_i; |
| 164 | return self.next(); | 190 | return iter.next(); |
| 191 | } | ||
| 192 | |||
| 193 | /// Create a backward iterator at this point. It will repeat | ||
| 194 | /// the last CodePoint seen. | ||
| 195 | pub fn reverseIterator(iter: *const Iterator) ReverseIterator { | ||
| 196 | if (iter.i == iter.bytes.len) { | ||
| 197 | return .init(iter.bytes); | ||
| 198 | } | ||
| 199 | return .{ .i = iter.i, .bytes = iter.bytes }; | ||
| 165 | } | 200 | } |
| 166 | }; | 201 | }; |
| 167 | 202 | ||
| @@ -233,6 +268,55 @@ const class_mask: [12]u8 = .{ | |||
| 233 | 0, | 268 | 0, |
| 234 | }; | 269 | }; |
| 235 | 270 | ||
| 271 | pub const ReverseIterator = struct { | ||
| 272 | bytes: []const u8, | ||
| 273 | i: ?uoffset, | ||
| 274 | |||
| 275 | pub fn init(str: []const u8) ReverseIterator { | ||
| 276 | var r_iter: ReverseIterator = undefined; | ||
| 277 | r_iter.bytes = str; | ||
| 278 | r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); | ||
| 279 | return r_iter; | ||
| 280 | } | ||
| 281 | |||
| 282 | pub fn prev(iter: *ReverseIterator) ?CodePoint { | ||
| 283 | if (iter.i == null) return null; | ||
| 284 | var i_prev = iter.i.?; | ||
| 285 | |||
| 286 | while (i_prev > 0) : (i_prev -= 1) { | ||
| 287 | if (!followbyte(iter.bytes[i_prev])) break; | ||
| 288 | } | ||
| 289 | |||
| 290 | if (i_prev > 0) | ||
| 291 | iter.i = i_prev - 1 | ||
| 292 | else | ||
| 293 | iter.i = null; | ||
| 294 | |||
| 295 | return decode(iter.bytes[i_prev..], i_prev); | ||
| 296 | } | ||
| 297 | |||
| 298 | pub fn peek(iter: *ReverseIterator) ?CodePoint { | ||
| 299 | const saved_i = iter.i; | ||
| 300 | defer iter.i = saved_i; | ||
| 301 | return iter.prev(); | ||
| 302 | } | ||
| 303 | |||
| 304 | /// Create a forward iterator at this point. It will repeat the | ||
| 305 | /// last CodePoint seen. | ||
| 306 | pub fn forwardIterator(iter: *const ReverseIterator) Iterator { | ||
| 307 | if (iter.i) |i| { | ||
| 308 | var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; | ||
| 309 | _ = fwd.next(); | ||
| 310 | return fwd; | ||
| 311 | } | ||
| 312 | return .{ .i = 0, .bytes = iter.bytes }; | ||
| 313 | } | ||
| 314 | }; | ||
| 315 | |||
| 316 | inline fn followbyte(b: u8) bool { | ||
| 317 | return 0x80 <= b and b <= 0xbf; | ||
| 318 | } | ||
| 319 | |||
| 236 | test "decode" { | 320 | test "decode" { |
| 237 | const bytes = "🌩️"; | 321 | const bytes = "🌩️"; |
| 238 | const res = decode(bytes, 0); | 322 | const res = decode(bytes, 0); |
| @@ -246,7 +330,7 @@ test "decode" { | |||
| 246 | } | 330 | } |
| 247 | } | 331 | } |
| 248 | 332 | ||
| 249 | test "peek" { | 333 | test Iterator { |
| 250 | var iter = Iterator{ .bytes = "Hi" }; | 334 | var iter = Iterator{ .bytes = "Hi" }; |
| 251 | 335 | ||
| 252 | try expectEqual(@as(u21, 'H'), iter.next().?.code); | 336 | try expectEqual(@as(u21, 'H'), iter.next().?.code); |
| @@ -256,6 +340,54 @@ test "peek" { | |||
| 256 | try expectEqual(@as(?CodePoint, null), iter.next()); | 340 | try expectEqual(@as(?CodePoint, null), iter.next()); |
| 257 | } | 341 | } |
| 258 | 342 | ||
| 343 | const code_point = @This(); | ||
| 344 | |||
| 345 | // Keep this in sync with the README | ||
| 346 | test "Code point iterator" { | ||
| 347 | const str = "Hi 😊"; | ||
| 348 | var iter: code_point.Iterator = .init(str); | ||
| 349 | var i: usize = 0; | ||
| 350 | |||
| 351 | while (iter.next()) |cp| : (i += 1) { | ||
| 352 | // The `code` field is the actual code point scalar as a `u21`. | ||
| 353 | if (i == 0) try expect(cp.code == 'H'); | ||
| 354 | if (i == 1) try expect(cp.code == 'i'); | ||
| 355 | if (i == 2) try expect(cp.code == ' '); | ||
| 356 | |||
| 357 | if (i == 3) { | ||
| 358 | try expect(cp.code == '😊'); | ||
| 359 | // The `offset` field is the byte offset in the | ||
| 360 | // source string. | ||
| 361 | try expect(cp.offset == 3); | ||
| 362 | try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); | ||
| 363 | // The `len` field is the length in bytes of the | ||
| 364 | // code point in the source string. | ||
| 365 | try expect(cp.len == 4); | ||
| 366 | // There is also a 'cursor' decode, like so: | ||
| 367 | { | ||
| 368 | var cursor = cp.offset; | ||
| 369 | try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); | ||
| 370 | // Which advances the cursor variable to the next possible | ||
| 371 | // offset, in this case, `str.len`. Don't forget to account | ||
| 372 | // for this possibility! | ||
| 373 | try expectEqual(cp.offset + cp.len, cursor); | ||
| 374 | } | ||
| 375 | // There's also this, for when you aren't sure if you have the | ||
| 376 | // correct start for a code point: | ||
| 377 | try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); | ||
| 378 | } | ||
| 379 | // Reverse iteration is also an option: | ||
| 380 | var r_iter: code_point.ReverseIterator = .init(str); | ||
| 381 | // Both iterators can be peeked: | ||
| 382 | try expectEqual('😊', r_iter.peek().?.code); | ||
| 383 | try expectEqual('😊', r_iter.prev().?.code); | ||
| 384 | // Both kinds of iterators can be reversed: | ||
| 385 | var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); | ||
| 386 | // This will always return the last codepoint from | ||
| 387 | // the prior iterator, _if_ it yielded one: | ||
| 388 | try expectEqual('😊', fwd_iter.next().?.code); | ||
| 389 | } | ||
| 390 | } | ||
| 259 | test "overlongs" { | 391 | test "overlongs" { |
| 260 | // None of these should equal `/`, all should be byte-for-byte | 392 | // None of these should equal `/`, all should be byte-for-byte |
| 261 | // handled as replacement characters. | 393 | // handled as replacement characters. |
| @@ -346,6 +478,50 @@ test "truncation" { | |||
| 346 | } | 478 | } |
| 347 | } | 479 | } |
| 348 | 480 | ||
| 481 | test ReverseIterator { | ||
| 482 | { | ||
| 483 | var r_iter: ReverseIterator = .init("ABC"); | ||
| 484 | try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); | ||
| 485 | try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); | ||
| 486 | try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); | ||
| 487 | try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); | ||
| 488 | try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); | ||
| 489 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 490 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 491 | } | ||
| 492 | { | ||
| 493 | var r_iter: ReverseIterator = .init("∅δq🦾ă"); | ||
| 494 | try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code); | ||
| 495 | try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code); | ||
| 496 | try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); | ||
| 497 | try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code); | ||
| 498 | try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code); | ||
| 499 | try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); | ||
| 500 | try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); | ||
| 501 | try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code); | ||
| 502 | try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); | ||
| 503 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 504 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 505 | } | ||
| 506 | { | ||
| 507 | var r_iter: ReverseIterator = .init("123"); | ||
| 508 | try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); | ||
| 509 | try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); | ||
| 510 | try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); | ||
| 511 | var iter = r_iter.forwardIterator(); | ||
| 512 | try testing.expectEqual(@as(u21, '1'), iter.next().?.code); | ||
| 513 | try testing.expectEqual(@as(u21, '2'), iter.next().?.code); | ||
| 514 | try testing.expectEqual(@as(u21, '3'), iter.next().?.code); | ||
| 515 | r_iter = iter.reverseIterator(); | ||
| 516 | try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); | ||
| 517 | try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); | ||
| 518 | iter = r_iter.forwardIterator(); | ||
| 519 | r_iter = iter.reverseIterator(); | ||
| 520 | try testing.expectEqual(@as(u21, '2'), iter.next().?.code); | ||
| 521 | try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); | ||
| 522 | } | ||
| 523 | } | ||
| 524 | |||
| 349 | const std = @import("std"); | 525 | const std = @import("std"); |
| 350 | const testing = std.testing; | 526 | const testing = std.testing; |
| 351 | const expect = testing.expect; | 527 | const expect = testing.expect; |
diff --git a/src/micro_runeset.zig b/src/micro_runeset.zig new file mode 100644 index 0000000..80ce4bf --- /dev/null +++ b/src/micro_runeset.zig | |||
| @@ -0,0 +1,207 @@ | |||
| 1 | //! Minimal RuneSet implementation | ||
| 2 | //! | ||
| 3 | //! This is copied from the full RuneSet module, so that `zg` doesn't | ||
| 4 | //! depend on it. There's one spot in the WordBreak algorithm which | ||
| 5 | //! needs to identify the emoji Extended_Pictographic property, which | ||
| 6 | //! is not otherwise used in ZG. The Runeset is 89 words, while the | ||
| 7 | //! trie lookup used throughout ZG would be much larger. | ||
| 8 | //! | ||
| 9 | //! The RuneSet is borrowed from Runicode, which encodes Unicode things | ||
| 10 | //! in RuneSet form. This will need updating for each version of Unicode. | ||
| 11 | |||
| 12 | pub const Extended_Pictographic = RuneSet{ .body = &.{ 0x0, 0x0, 0x1000c00000004, 0x1f, 0x420000000000, 0x30107fc8d053, 0x401, 0x80000000, 0xffff0fffafffffff, 0x2800000, 0x2001000000000000, 0x210000, 0x180000e0, 0x30000000000000, 0x8001000200e00000, 0xf800b85090, 0x1801022057ff3f, 0xffffffffffffffff, 0xffffffffffff003f, 0xffffffffffffffff, 0xfffffffffff7ffbf, 0x7800000000000001, 0x400c0000000000, 0x4, 0x70ffe0000008000, 0x100, 0x1000c000000, 0x60003f00000, 0x200000400000000, 0x200, 0x1000000000000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x80000000e000, 0xc003f00000000000, 0xffffe00007fe4000, 0x3fffffffff, 0xf7fc80000400fffe, 0xfffffffffffffe00, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff, 0xffffffffffffffc0, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xfff0000000000000, 0xffffffffffe00000, 0xf000, 0xfc00ff00, 0xffffc0000000ff00, 0xffffffffffffffff, 0xf7fffffffffff000, 0xffffffffffffffbf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3fffffffffffffff } }; | ||
| 13 | |||
| 14 | // Meaningful names for the T1 slots | ||
| 15 | const LOW = 0; | ||
| 16 | const HI = 1; | ||
| 17 | const LEAD = 2; | ||
| 18 | const T4_OFF = 3; | ||
| 19 | |||
| 20 | /// Minimum Viable Runeset. Must be statically created, strictly boolean matching. | ||
| 21 | pub const RuneSet = struct { | ||
| 22 | body: []const u64, | ||
| 23 | |||
| 24 | // Returns whether the slice is a match. This assumes the validity of the | ||
| 25 | // string, which can be ensured by, in particular, deriving it from a CodePoint. | ||
| 26 | pub fn isMatch(runeset: RuneSet, str: []const u8) bool { | ||
| 27 | const set = runeset.body; | ||
| 28 | const a = codeunit(str[0]); | ||
| 29 | switch (a.kind) { | ||
| 30 | .follow => unreachable, | ||
| 31 | .low => { | ||
| 32 | const mask = toMask(set[LOW]); | ||
| 33 | if (mask.isIn(a)) | ||
| 34 | return true | ||
| 35 | else | ||
| 36 | return false; | ||
| 37 | }, | ||
| 38 | .hi => { | ||
| 39 | const mask = toMask(set[HI]); | ||
| 40 | if (mask.isIn(a)) | ||
| 41 | return true | ||
| 42 | else | ||
| 43 | return false; | ||
| 44 | }, | ||
| 45 | .lead => { | ||
| 46 | const nB = a.nMultiBytes().?; | ||
| 47 | const a_mask = toMask(set[LEAD]); | ||
| 48 | if (!a_mask.isIn(a)) return false; | ||
| 49 | const b = codeunit(str[1]); | ||
| 50 | const b_loc = 4 + a_mask.lowerThan(a).?; | ||
| 51 | const b_mask = toMask(set[b_loc]); | ||
| 52 | if (!b_mask.isIn(b)) return false; | ||
| 53 | if (nB == 2) return true; | ||
| 54 | const t3_off = 4 + @popCount(set[LEAD]); | ||
| 55 | const c = codeunit(str[2]); | ||
| 56 | // Slice is safe because we know the T2 span has at least one word. | ||
| 57 | const c_off = b_mask.higherThan(b).? + popCountSlice(set[b_loc + 1 .. t3_off]); | ||
| 58 | const c_loc = t3_off + c_off; | ||
| 59 | const c_mask = toMask(set[c_loc]); | ||
| 60 | if (!c_mask.isIn(c)) return false; | ||
| 61 | if (nB == 3) return true; | ||
| 62 | const d_off = c_mask.lowerThan(c).? + popCountSlice(set[t3_off..c_loc]); | ||
| 63 | const d_loc = set[T4_OFF] + d_off; | ||
| 64 | const d = codeunit(str[3]); | ||
| 65 | const d_mask = toMask(set[d_loc]); | ||
| 66 | if (d_mask.isIn(d)) return true else return false; | ||
| 67 | }, | ||
| 68 | } | ||
| 69 | } | ||
| 70 | }; | ||
| 71 | |||
| 72 | /// Kinds of most significant bits in UTF-8 | ||
| 73 | const RuneKind = enum(u2) { | ||
| 74 | low, | ||
| 75 | hi, | ||
| 76 | follow, | ||
| 77 | lead, | ||
| 78 | }; | ||
| 79 | |||
| 80 | /// Packed `u8` struct representing one codeunit of UTF-8. | ||
| 81 | const CodeUnit = packed struct(u8) { | ||
| 82 | body: u6, | ||
| 83 | kind: RuneKind, | ||
| 84 | |||
| 85 | /// Mask to check presence | ||
| 86 | pub inline fn inMask(self: *const CodeUnit) u64 { | ||
| 87 | return @as(u64, 1) << self.body; | ||
| 88 | } | ||
| 89 | |||
| 90 | // TODO consider an nMultiBytesFast, for the cases where we | ||
| 91 | // know that invalid lead bytes are never present (such as in set) | ||
| 92 | // operations, where we may assume that (and will assert that) the | ||
| 93 | // LEAD mask contains no such bytes. | ||
| 94 | |||
| 95 | /// Number of bytes in known multi-byte rune. | ||
| 96 | /// | ||
| 97 | /// Caller guarantees that the CodeUnit is a lead byte | ||
| 98 | /// of a multi-byte rune: `cu.kind == .lead`. | ||
| 99 | /// | ||
| 100 | /// Invalid lead bytes will return null. | ||
| 101 | pub inline fn nMultiBytes(self: *const CodeUnit) ?u8 { | ||
| 102 | return switch (self.body) { | ||
| 103 | // 0 and 1 are invalid for overlong reasons, | ||
| 104 | // but RuneSet supports overlong encodings | ||
| 105 | 0...31 => 2, | ||
| 106 | 32...47 => 3, | ||
| 107 | 48...55 => 4, | ||
| 108 | // Wasted space 56...61 is due entirely to Microsoft's | ||
| 109 | // lack of vision and insistence on a substandard | ||
| 110 | // and utterly inadequate encoding for Unicode | ||
| 111 | // "64k should be enough for anyone" <spits> | ||
| 112 | 56...63 => null, | ||
| 113 | }; | ||
| 114 | } | ||
| 115 | |||
| 116 | /// Given a valid lead byte, return the number of bytes that should | ||
| 117 | /// make up the code unit sequence. Will return `null` if the lead | ||
| 118 | /// byte is invalid. | ||
| 119 | pub inline fn nBytes(self: *const CodeUnit) ?u8 { | ||
| 120 | switch (self.kind) { | ||
| 121 | .low, .hi => return 1, | ||
| 122 | .lead => return self.nMultiBytes(), | ||
| 123 | .follow => return null, | ||
| 124 | } | ||
| 125 | } | ||
| 126 | |||
| 127 | /// Mask off all bits >= cu.body | ||
| 128 | pub inline fn hiMask(self: *const CodeUnit) u64 { | ||
| 129 | return (@as(u64, 1) << self.body) - 1; | ||
| 130 | } | ||
| 131 | |||
| 132 | /// Mask off all bits <= cu.body | ||
| 133 | pub inline fn lowMask(self: *const CodeUnit) u64 { | ||
| 134 | if (self.body == 63) | ||
| 135 | return 0 | ||
| 136 | else | ||
| 137 | return ~((@as(u64, 1) << (self.body + 1)) - 1); | ||
| 138 | } | ||
| 139 | |||
| 140 | /// Cast the `CodeUnit` to its backing `u8`. | ||
| 141 | pub inline fn byte(self: *const CodeUnit) u8 { | ||
| 142 | return @bitCast(self.*); | ||
| 143 | } | ||
| 144 | }; | ||
| 145 | |||
| 146 | /// Cast raw byte to CodeUnit | ||
| 147 | inline fn codeunit(b: u8) CodeUnit { | ||
| 148 | return @bitCast(b); | ||
| 149 | } | ||
| 150 | |||
| 151 | inline fn toMask(w: u64) Mask { | ||
| 152 | return Mask.toMask(w); | ||
| 153 | } | ||
| 154 | |||
| 155 | /// Bitmask for runesets | ||
| 156 | /// | ||
| 157 | /// We define our own bitset, because the operations we need to | ||
| 158 | /// perform only overlap with IntegerBitSet for trivial one-liners, | ||
| 159 | /// and furthermore, we need nondestructive versions of the basic | ||
| 160 | /// operations, which aren't a part of the IntegerBitSet interface. | ||
| 161 | /// | ||
| 162 | /// Note that Masks do not track which kind of byte they apply to, | ||
| 163 | /// since they will be stored as ordinary u64s. User code must | ||
| 164 | /// ensure that CodeUnits tested against a Mask are of the appropriate | ||
| 165 | /// type, and otherwise valid for the test performed. | ||
| 166 | /// | ||
| 167 | const Mask = struct { | ||
| 168 | m: u64, | ||
| 169 | |||
| 170 | pub fn toMask(w: u64) Mask { | ||
| 171 | return Mask{ .m = w }; | ||
| 172 | } | ||
| 173 | |||
| 174 | /// Test if a CodeUnit's low bytes are present in mask | ||
| 175 | pub inline fn isIn(self: Mask, cu: CodeUnit) bool { | ||
| 176 | return self.m | cu.inMask() == self.m; | ||
| 177 | } | ||
| 178 | |||
| 179 | /// Return number of bytes lower than cu.body in mask, | ||
| 180 | /// if cu inhabits the mask. Otherwise return null. | ||
| 181 | pub inline fn lowerThan(self: Mask, cu: CodeUnit) ?u64 { | ||
| 182 | if (self.isIn(cu)) { | ||
| 183 | const m = cu.hiMask(); | ||
| 184 | return @popCount(self.m & m); | ||
| 185 | } else { | ||
| 186 | return null; | ||
| 187 | } | ||
| 188 | } | ||
| 189 | |||
| 190 | /// Return number of bytes higher than cu.body in mask, | ||
| 191 | /// if cu inhabits the mask. Otherwise return null. | ||
| 192 | pub inline fn higherThan(self: Mask, cu: CodeUnit) ?u64 { | ||
| 193 | if (self.isIn(cu)) { | ||
| 194 | const m = cu.lowMask(); | ||
| 195 | return @popCount(self.m & m); | ||
| 196 | } else { | ||
| 197 | return null; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | }; | ||
| 201 | |||
| 202 | /// Sum of @popCount of all words in region. | ||
| 203 | inline fn popCountSlice(region: []const u64) usize { | ||
| 204 | var ct: usize = 0; | ||
| 205 | for (region) |w| ct += @popCount(w); | ||
| 206 | return ct; | ||
| 207 | } | ||
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 2249007..ae177a9 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -1,43 +1,4 @@ | |||
| 1 | const std = @import("std"); | 1 | const dbg_print = false; |
| 2 | const fmt = std.fmt; | ||
| 3 | const fs = std.fs; | ||
| 4 | const io = std.io; | ||
| 5 | const heap = std.heap; | ||
| 6 | const mem = std.mem; | ||
| 7 | const testing = std.testing; | ||
| 8 | const unicode = std.unicode; | ||
| 9 | |||
| 10 | const grapheme = @import("Graphemes"); | ||
| 11 | const Grapheme = @import("Graphemes").Grapheme; | ||
| 12 | const Graphemes = @import("Graphemes"); | ||
| 13 | const GraphemeIterator = @import("Graphemes").Iterator; | ||
| 14 | const Normalize = @import("Normalize"); | ||
| 15 | |||
| 16 | comptime { | ||
| 17 | testing.refAllDecls(grapheme); | ||
| 18 | } | ||
| 19 | |||
| 20 | test "Iterator.peek" { | ||
| 21 | const peek_seq = "aΔ👨🏻🌾→"; | ||
| 22 | const data = try Graphemes.init(std.testing.allocator); | ||
| 23 | defer data.deinit(std.testing.allocator); | ||
| 24 | |||
| 25 | var iter = data.iterator(peek_seq); | ||
| 26 | const peek_a = iter.peek().?; | ||
| 27 | const next_a = iter.next().?; | ||
| 28 | try std.testing.expectEqual(peek_a, next_a); | ||
| 29 | try std.testing.expectEqualStrings("a", peek_a.bytes(peek_seq)); | ||
| 30 | const peek_d1 = iter.peek().?; | ||
| 31 | const peek_d2 = iter.peek().?; | ||
| 32 | try std.testing.expectEqual(peek_d1, peek_d2); | ||
| 33 | const next_d = iter.next().?; | ||
| 34 | try std.testing.expectEqual(peek_d2, next_d); | ||
| 35 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 36 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 37 | try std.testing.expectEqual(null, iter.peek()); | ||
| 38 | try std.testing.expectEqual(null, iter.peek()); | ||
| 39 | try std.testing.expectEqual(iter.peek(), iter.next()); | ||
| 40 | } | ||
| 41 | 2 | ||
| 42 | test "Unicode normalization tests" { | 3 | test "Unicode normalization tests" { |
| 43 | var arena = heap.ArenaAllocator.init(testing.allocator); | 4 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| @@ -50,16 +11,14 @@ test "Unicode normalization tests" { | |||
| 50 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 11 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
| 51 | defer file.close(); | 12 | defer file.close(); |
| 52 | var buf_reader = io.bufferedReader(file.reader()); | 13 | var buf_reader = io.bufferedReader(file.reader()); |
| 53 | const input_stream = buf_reader.reader(); | 14 | var input_stream = buf_reader.reader(); |
| 54 | 15 | ||
| 55 | var line_no: usize = 0; | ||
| 56 | var buf: [4096]u8 = undefined; | 16 | var buf: [4096]u8 = undefined; |
| 57 | var cp_buf: [4]u8 = undefined; | 17 | var cp_buf: [4]u8 = undefined; |
| 58 | 18 | ||
| 59 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { | 19 | var line_iter: IterRead = .{ .read = &input_stream }; |
| 60 | line_no += 1; | 20 | |
| 61 | // Skip comments or empty lines. | 21 | while (try line_iter.next(&buf)) |line| { |
| 62 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | ||
| 63 | // Iterate over fields. | 22 | // Iterate over fields. |
| 64 | var fields = mem.splitScalar(u8, line, ';'); | 23 | var fields = mem.splitScalar(u8, line, ';'); |
| 65 | var field_index: usize = 0; | 24 | var field_index: usize = 0; |
| @@ -80,7 +39,7 @@ test "Unicode normalization tests" { | |||
| 80 | 39 | ||
| 81 | input = try i_buf.toOwnedSlice(); | 40 | input = try i_buf.toOwnedSlice(); |
| 82 | } else if (field_index == 1) { | 41 | } else if (field_index == 1) { |
| 83 | //debug.print("\n*** {s} ***\n", .{line}); | 42 | if (dbg_print) debug.print("\n*** {s} ***\n", .{line}); |
| 84 | // NFC, time to test. | 43 | // NFC, time to test. |
| 85 | var w_buf = std.ArrayList(u8).init(allocator); | 44 | var w_buf = std.ArrayList(u8).init(allocator); |
| 86 | defer w_buf.deinit(); | 45 | defer w_buf.deinit(); |
| @@ -162,20 +121,17 @@ test "Segmentation GraphemeIterator" { | |||
| 162 | var buf_reader = std.io.bufferedReader(file.reader()); | 121 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 163 | var input_stream = buf_reader.reader(); | 122 | var input_stream = buf_reader.reader(); |
| 164 | 123 | ||
| 165 | const data = try Graphemes.init(allocator); | 124 | const graph = try Graphemes.init(allocator); |
| 166 | defer data.deinit(allocator); | 125 | defer graph.deinit(allocator); |
| 167 | 126 | ||
| 168 | var buf: [4096]u8 = undefined; | 127 | var buf: [4096]u8 = undefined; |
| 169 | var line_no: usize = 1; | 128 | var line_iter: IterRead = .{ .read = &input_stream }; |
| 170 | |||
| 171 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | ||
| 172 | // Skip comments or empty lines. | ||
| 173 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 174 | 129 | ||
| 130 | while (try line_iter.next(&buf)) |raw| { | ||
| 175 | // Clean up. | 131 | // Clean up. |
| 176 | var line = std.mem.trimLeft(u8, raw, "÷ "); | 132 | var line = std.mem.trimLeft(u8, raw, "÷ "); |
| 177 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | 133 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { |
| 178 | line = line[0..octo]; | 134 | line = line[0..final]; |
| 179 | } | 135 | } |
| 180 | // Iterate over fields. | 136 | // Iterate over fields. |
| 181 | var want = std.ArrayList(Grapheme).init(allocator); | 137 | var want = std.ArrayList(Grapheme).init(allocator); |
| @@ -185,12 +141,12 @@ test "Segmentation GraphemeIterator" { | |||
| 185 | defer all_bytes.deinit(); | 141 | defer all_bytes.deinit(); |
| 186 | 142 | ||
| 187 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); | 143 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); |
| 188 | var bytes_index: u32 = 0; | 144 | var bytes_index: uoffset = 0; |
| 189 | 145 | ||
| 190 | while (graphemes.next()) |field| { | 146 | while (graphemes.next()) |field| { |
| 191 | var code_points = std.mem.splitScalar(u8, field, ' '); | 147 | var code_points = std.mem.splitScalar(u8, field, ' '); |
| 192 | var cp_buf: [4]u8 = undefined; | 148 | var cp_buf: [4]u8 = undefined; |
| 193 | var cp_index: u32 = 0; | 149 | var cp_index: uoffset = 0; |
| 194 | var gc_len: u8 = 0; | 150 | var gc_len: u8 = 0; |
| 195 | 151 | ||
| 196 | while (code_points.next()) |code_point| { | 152 | while (code_points.next()) |code_point| { |
| @@ -206,16 +162,324 @@ test "Segmentation GraphemeIterator" { | |||
| 206 | bytes_index += cp_index; | 162 | bytes_index += cp_index; |
| 207 | } | 163 | } |
| 208 | 164 | ||
| 209 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | 165 | const this_str = all_bytes.items; |
| 210 | var iter = data.iterator(all_bytes.items); | 166 | |
| 167 | { | ||
| 168 | var iter = graph.iterator(this_str); | ||
| 169 | |||
| 170 | // Check. | ||
| 171 | for (want.items, 1..) |want_gc, idx| { | ||
| 172 | const got_gc = (iter.next()).?; | ||
| 173 | try std.testing.expectEqualStrings( | ||
| 174 | want_gc.bytes(this_str), | ||
| 175 | got_gc.bytes(this_str), | ||
| 176 | ); | ||
| 177 | for (got_gc.offset..got_gc.offset + got_gc.len) |i| { | ||
| 178 | const this_gc = graph.graphemeAtIndex(this_str, i); | ||
| 179 | std.testing.expectEqualSlices( | ||
| 180 | u8, | ||
| 181 | got_gc.bytes(this_str), | ||
| 182 | this_gc.bytes(this_str), | ||
| 183 | ) catch |err| { | ||
| 184 | debug.print("Wrong grapheme on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); | ||
| 185 | return err; | ||
| 186 | }; | ||
| 187 | } | ||
| 188 | var after_iter = graph.iterateAfterGrapheme(this_str, got_gc); | ||
| 189 | if (after_iter.next()) |next_gc| { | ||
| 190 | if (iter.peek()) |next_peek| { | ||
| 191 | std.testing.expectEqualSlices( | ||
| 192 | u8, | ||
| 193 | next_gc.bytes(this_str), | ||
| 194 | next_peek.bytes(this_str), | ||
| 195 | ) catch |err| { | ||
| 196 | debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, idx }); | ||
| 197 | return err; | ||
| 198 | }; | ||
| 199 | } else { | ||
| 200 | debug.print("Mismatch: peek missing, next found, line {d} #{d}\n", .{ line_iter.line, idx }); | ||
| 201 | try testing.expect(false); | ||
| 202 | } | ||
| 203 | } else { | ||
| 204 | try testing.expectEqual(null, iter.peek()); | ||
| 205 | } | ||
| 206 | } | ||
| 207 | } | ||
| 208 | { | ||
| 209 | var iter = graph.reverseIterator(this_str); | ||
| 210 | |||
| 211 | // Check. | ||
| 212 | var i: usize = want.items.len; | ||
| 213 | while (i > 0) { | ||
| 214 | i -= 1; | ||
| 215 | const want_gc = want.items[i]; | ||
| 216 | const got_gc = iter.prev() orelse { | ||
| 217 | std.debug.print( | ||
| 218 | "line {d} grapheme {d}: expected {any} found null\n", | ||
| 219 | .{ line_iter.line, i, want_gc }, | ||
| 220 | ); | ||
| 221 | return error.TestExpectedEqual; | ||
| 222 | }; | ||
| 223 | std.testing.expectEqualStrings( | ||
| 224 | want_gc.bytes(this_str), | ||
| 225 | got_gc.bytes(this_str), | ||
| 226 | ) catch |err| { | ||
| 227 | std.debug.print( | ||
| 228 | "line {d} grapheme {d}: expected {any} found {any}\n", | ||
| 229 | .{ line_iter.line, i, want_gc, got_gc }, | ||
| 230 | ); | ||
| 231 | return err; | ||
| 232 | }; | ||
| 233 | var before_iter = graph.iterateBeforeGrapheme(this_str, got_gc); | ||
| 234 | if (before_iter.prev()) |prev_gc| { | ||
| 235 | if (iter.peek()) |prev_peek| { | ||
| 236 | std.testing.expectEqualSlices( | ||
| 237 | u8, | ||
| 238 | prev_gc.bytes(this_str), | ||
| 239 | prev_peek.bytes(this_str), | ||
| 240 | ) catch |err| { | ||
| 241 | debug.print("Peeks differ on line {d} #{d} \n", .{ line_iter.line, i }); | ||
| 242 | return err; | ||
| 243 | }; | ||
| 244 | } else { | ||
| 245 | debug.print("Mismatch: peek missing, prev found, line {d} #{d}\n", .{ line_iter.line, i }); | ||
| 246 | try testing.expect(false); | ||
| 247 | } | ||
| 248 | } else { | ||
| 249 | try testing.expectEqual(null, iter.peek()); | ||
| 250 | } | ||
| 251 | } | ||
| 252 | } | ||
| 253 | } | ||
| 254 | } | ||
| 255 | |||
| 256 | test "Segmentation Word Iterator" { | ||
| 257 | const allocator = std.testing.allocator; | ||
| 258 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); | ||
| 259 | defer file.close(); | ||
| 260 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 261 | var input_stream = buf_reader.reader(); | ||
| 262 | |||
| 263 | const wb = try Words.init(allocator); | ||
| 264 | defer wb.deinit(allocator); | ||
| 265 | |||
| 266 | var buf: [4096]u8 = undefined; | ||
| 267 | var line_iter: IterRead = .{ .read = &input_stream }; | ||
| 268 | |||
| 269 | while (try line_iter.next(&buf)) |raw| { | ||
| 270 | // Clean up. | ||
| 271 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 272 | if (std.mem.indexOf(u8, line, " ÷\t")) |final| { | ||
| 273 | line = line[0..final]; | ||
| 274 | } | ||
| 275 | // Iterate over fields. | ||
| 276 | var want = std.ArrayList(Word).init(allocator); | ||
| 277 | defer want.deinit(); | ||
| 278 | |||
| 279 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 280 | defer all_bytes.deinit(); | ||
| 281 | |||
| 282 | var words = std.mem.splitSequence(u8, line, " ÷ "); | ||
| 283 | var bytes_index: uoffset = 0; | ||
| 284 | |||
| 285 | while (words.next()) |field| { | ||
| 286 | var code_points = std.mem.splitScalar(u8, field, ' '); | ||
| 287 | var cp_buf: [4]u8 = undefined; | ||
| 288 | var cp_index: uoffset = 0; | ||
| 289 | var gc_len: u8 = 0; | ||
| 211 | 290 | ||
| 212 | // Check. | 291 | while (code_points.next()) |code_point| { |
| 213 | for (want.items) |want_gc| { | 292 | if (std.mem.eql(u8, code_point, "×")) continue; |
| 214 | const got_gc = (iter.next()).?; | 293 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); |
| 215 | try std.testing.expectEqualStrings( | 294 | const len = try unicode.utf8Encode(cp, &cp_buf); |
| 216 | want_gc.bytes(all_bytes.items), | 295 | try all_bytes.appendSlice(cp_buf[0..len]); |
| 217 | got_gc.bytes(all_bytes.items), | 296 | cp_index += len; |
| 218 | ); | 297 | gc_len += len; |
| 298 | } | ||
| 299 | |||
| 300 | try want.append(Word{ .len = gc_len, .offset = bytes_index }); | ||
| 301 | bytes_index += cp_index; | ||
| 302 | } | ||
| 303 | const this_str = all_bytes.items; | ||
| 304 | |||
| 305 | { | ||
| 306 | var iter = wb.iterator(this_str); | ||
| 307 | var peeked: ?Word = iter.peek(); | ||
| 308 | |||
| 309 | // Check. | ||
| 310 | for (want.items, 1..) |want_word, idx| { | ||
| 311 | const got_word = (iter.next()).?; | ||
| 312 | std.testing.expectEqualStrings( | ||
| 313 | want_word.bytes(this_str), | ||
| 314 | got_word.bytes(this_str), | ||
| 315 | ) catch |err| { | ||
| 316 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx }); | ||
| 317 | return err; | ||
| 318 | }; | ||
| 319 | std.testing.expectEqualStrings( | ||
| 320 | peeked.?.bytes(this_str), | ||
| 321 | got_word.bytes(this_str), | ||
| 322 | ) catch |err| { | ||
| 323 | debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx }); | ||
| 324 | return err; | ||
| 325 | }; | ||
| 326 | var r_iter = iter.reverseIterator(); | ||
| 327 | const if_r_word = r_iter.prev(); | ||
| 328 | if (if_r_word) |r_word| { | ||
| 329 | std.testing.expectEqualStrings( | ||
| 330 | want_word.bytes(this_str), | ||
| 331 | r_word.bytes(this_str), | ||
| 332 | ) catch |err| { | ||
| 333 | debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); | ||
| 334 | return err; | ||
| 335 | }; | ||
| 336 | } else { | ||
| 337 | try testing.expect(false); | ||
| 338 | } | ||
| 339 | var peek_iter = wb.iterateAfterWord(this_str, got_word); | ||
| 340 | const peek_1 = peek_iter.next(); | ||
| 341 | if (peek_1) |p1| { | ||
| 342 | const peek_2 = iter.peek(); | ||
| 343 | if (peek_2) |p2| { | ||
| 344 | std.testing.expectEqualSlices( | ||
| 345 | u8, | ||
| 346 | p1.bytes(this_str), | ||
| 347 | p2.bytes(this_str), | ||
| 348 | ) catch |err| { | ||
| 349 | debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); | ||
| 350 | return err; | ||
| 351 | }; | ||
| 352 | } else { | ||
| 353 | try testing.expect(false); | ||
| 354 | } | ||
| 355 | } else { | ||
| 356 | try testing.expectEqual(null, iter.peek()); | ||
| 357 | } | ||
| 358 | for (got_word.offset..got_word.offset + got_word.len) |i| { | ||
| 359 | const this_word = wb.wordAtIndex(this_str, i); | ||
| 360 | std.testing.expectEqualSlices( | ||
| 361 | u8, | ||
| 362 | got_word.bytes(this_str), | ||
| 363 | this_word.bytes(this_str), | ||
| 364 | ) catch |err| { | ||
| 365 | debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx, i }); | ||
| 366 | return err; | ||
| 367 | }; | ||
| 368 | } | ||
| 369 | peeked = iter.peek(); | ||
| 370 | } | ||
| 371 | } | ||
| 372 | { | ||
| 373 | var r_iter = wb.reverseIterator(this_str); | ||
| 374 | var peeked: ?Word = r_iter.peek(); | ||
| 375 | var idx = want.items.len - 1; | ||
| 376 | |||
| 377 | while (true) : (idx -= 1) { | ||
| 378 | const want_word = want.items[idx]; | ||
| 379 | const got_word = r_iter.prev().?; | ||
| 380 | std.testing.expectEqualSlices( | ||
| 381 | u8, | ||
| 382 | want_word.bytes(this_str), | ||
| 383 | got_word.bytes(this_str), | ||
| 384 | ) catch |err| { | ||
| 385 | debug.print("Error on line {d}, #{d}\n", .{ line_iter.line, idx + 1 }); | ||
| 386 | return err; | ||
| 387 | }; | ||
| 388 | std.testing.expectEqualStrings( | ||
| 389 | peeked.?.bytes(this_str), | ||
| 390 | got_word.bytes(this_str), | ||
| 391 | ) catch |err| { | ||
| 392 | debug.print("Peek != word on line {d} #{d}\n", .{ line_iter.line, idx + 1 }); | ||
| 393 | return err; | ||
| 394 | }; | ||
| 395 | var f_iter = r_iter.forwardIterator(); | ||
| 396 | const if_f_word = f_iter.next(); | ||
| 397 | if (if_f_word) |f_word| { | ||
| 398 | std.testing.expectEqualStrings( | ||
| 399 | want_word.bytes(this_str), | ||
| 400 | f_word.bytes(this_str), | ||
| 401 | ) catch |err| { | ||
| 402 | debug.print("Reversal Error on line {d}, #{d}\n", .{ line_iter.line, idx }); | ||
| 403 | return err; | ||
| 404 | }; | ||
| 405 | } else { | ||
| 406 | try testing.expect(false); | ||
| 407 | } | ||
| 408 | var peek_iter = wb.iterateBeforeWord(this_str, got_word); | ||
| 409 | const peek_1 = peek_iter.prev(); | ||
| 410 | if (peek_1) |p1| { | ||
| 411 | const peek_2 = r_iter.peek(); | ||
| 412 | if (peek_2) |p2| { | ||
| 413 | std.testing.expectEqualSlices( | ||
| 414 | u8, | ||
| 415 | p1.bytes(this_str), | ||
| 416 | p2.bytes(this_str), | ||
| 417 | ) catch |err| { | ||
| 418 | debug.print("Bad peek on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, idx }); | ||
| 419 | return err; | ||
| 420 | }; | ||
| 421 | } else { | ||
| 422 | try testing.expect(false); | ||
| 423 | } | ||
| 424 | } else { | ||
| 425 | try testing.expectEqual(null, r_iter.peek()); | ||
| 426 | } | ||
| 427 | for (got_word.offset..got_word.offset + got_word.len) |i| { | ||
| 428 | const this_word = wb.wordAtIndex(this_str, i); | ||
| 429 | std.testing.expectEqualSlices( | ||
| 430 | u8, | ||
| 431 | got_word.bytes(this_str), | ||
| 432 | this_word.bytes(this_str), | ||
| 433 | ) catch |err| { | ||
| 434 | debug.print("Wrong word on line {d} #{d} offset {d}\n", .{ line_iter.line, idx + 1, i }); | ||
| 435 | return err; | ||
| 436 | }; | ||
| 437 | } | ||
| 438 | peeked = r_iter.peek(); | ||
| 439 | if (idx == 0) break; | ||
| 440 | } | ||
| 219 | } | 441 | } |
| 220 | } | 442 | } |
| 221 | } | 443 | } |
| 444 | |||
| 445 | const IterRead = struct { | ||
| 446 | read: *Reader, | ||
| 447 | line: usize = 0, | ||
| 448 | |||
| 449 | pub fn next(iter: *IterRead, buf: []u8) !?[]const u8 { | ||
| 450 | defer iter.line += 1; | ||
| 451 | const maybe_line = try iter.read.readUntilDelimiterOrEof(buf, '#'); | ||
| 452 | if (maybe_line) |this_line| { | ||
| 453 | try iter.read.skipUntilDelimiterOrEof('\n'); | ||
| 454 | if (this_line.len == 0 or this_line[0] == '@') { | ||
| 455 | // comment, next line | ||
| 456 | return iter.next(buf); | ||
| 457 | } else { | ||
| 458 | return this_line; | ||
| 459 | } | ||
| 460 | } else { | ||
| 461 | return null; | ||
| 462 | } | ||
| 463 | } | ||
| 464 | }; | ||
| 465 | |||
| 466 | const std = @import("std"); | ||
| 467 | const fmt = std.fmt; | ||
| 468 | const fs = std.fs; | ||
| 469 | const io = std.io; | ||
| 470 | const Reader = io.BufferedReader(4096, fs.File.Reader).Reader; | ||
| 471 | const heap = std.heap; | ||
| 472 | const mem = std.mem; | ||
| 473 | const debug = std.debug; | ||
| 474 | const testing = std.testing; | ||
| 475 | const unicode = std.unicode; | ||
| 476 | |||
| 477 | const uoffset = @FieldType(Word, "offset"); | ||
| 478 | |||
| 479 | const Grapheme = @import("Graphemes").Grapheme; | ||
| 480 | const Graphemes = @import("Graphemes"); | ||
| 481 | const GraphemeIterator = @import("Graphemes").Iterator; | ||
| 482 | const Normalize = @import("Normalize"); | ||
| 483 | |||
| 484 | const Words = @import("Words"); | ||
| 485 | const Word = Words.Word; | ||