diff options
| author | 2026-02-04 15:31:50 -0500 | |
|---|---|---|
| committer | 2026-02-04 15:31:50 -0500 | |
| commit | 5199401c536d0b0032c5908c55d5c0bb34b76d12 (patch) | |
| tree | 13a4cec36ebe49733197ab4a0df6ee232345c34a /src | |
| parent | Port DisplayWidth (diff) | |
| download | zg-5199401c536d0b0032c5908c55d5c0bb34b76d12.tar.gz zg-5199401c536d0b0032c5908c55d5c0bb34b76d12.tar.xz zg-5199401c536d0b0032c5908c55d5c0bb34b76d12.zip | |
Convert Words module to no-allocation
Diffstat (limited to 'src')
| -rw-r--r-- | src/Words.zig | 215 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 14 |
2 files changed, 85 insertions, 144 deletions
diff --git a/src/Words.zig b/src/Words.zig index ce3203f..aeb25d1 100644 --- a/src/Words.zig +++ b/src/Words.zig | |||
| @@ -3,6 +3,8 @@ | |||
| 3 | //! https://www.unicode.org/reports/tr29/#Word_Boundaries | 3 | //! https://www.unicode.org/reports/tr29/#Word_Boundaries |
| 4 | //! | 4 | //! |
| 5 | 5 | ||
| 6 | const Words = @This(); | ||
| 7 | |||
| 6 | const WordBreakProperty = enum(u5) { | 8 | const WordBreakProperty = enum(u5) { |
| 7 | none, | 9 | none, |
| 8 | Double_Quote, | 10 | Double_Quote, |
| @@ -25,30 +27,18 @@ const WordBreakProperty = enum(u5) { | |||
| 25 | WSegSpace, | 27 | WSegSpace, |
| 26 | }; | 28 | }; |
| 27 | 29 | ||
| 28 | s1: []u16 = undefined, | 30 | const Data = struct { |
| 29 | s2: []u5 = undefined, | 31 | s1: []const u16 = undefined, |
| 30 | 32 | s2: []const u5 = undefined, | |
| 31 | const Words = @This(); | 33 | }; |
| 32 | |||
| 33 | pub fn init(allocator: Allocator) Allocator.Error!Words { | ||
| 34 | var wb: Words = undefined; | ||
| 35 | try wb.setup(allocator); | ||
| 36 | return wb; | ||
| 37 | } | ||
| 38 | 34 | ||
| 39 | pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { | 35 | const wbp = display_width: { |
| 40 | wb.setupImpl(allocator) catch |err| { | 36 | const data = @import("wbp"); |
| 41 | switch (err) { | 37 | break :display_width Data{ |
| 42 | error.OutOfMemory => |e| return e, | 38 | .s1 = &data.s1, |
| 43 | else => unreachable, | 39 | .s2 = &data.s2, |
| 44 | } | ||
| 45 | }; | 40 | }; |
| 46 | } | 41 | }; |
| 47 | |||
| 48 | pub fn deinit(words: *const Words, allocator: mem.Allocator) void { | ||
| 49 | allocator.free(words.s1); | ||
| 50 | allocator.free(words.s2); | ||
| 51 | } | ||
| 52 | 42 | ||
| 53 | /// Represents a Unicode word span, as an offset into the source string | 43 | /// Represents a Unicode word span, as an offset into the source string |
| 54 | /// and the length of the word. | 44 | /// and the length of the word. |
| @@ -63,32 +53,32 @@ pub const Word = struct { | |||
| 63 | }; | 53 | }; |
| 64 | 54 | ||
| 65 | /// Returns the word break property type for `cp`. | 55 | /// Returns the word break property type for `cp`. |
| 66 | pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { | 56 | pub fn breakProperty(cp: u21) WordBreakProperty { |
| 67 | return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); | 57 | return @enumFromInt(wbp.s2[wbp.s1[cp >> 8] + (cp & 0xff)]); |
| 68 | } | 58 | } |
| 69 | 59 | ||
| 70 | /// Convenience function for working with CodePoints | 60 | /// Convenience function for working with CodePoints |
| 71 | fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { | 61 | fn breakProp(point: CodePoint) WordBreakProperty { |
| 72 | return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); | 62 | return @enumFromInt(wbp.s2[wbp.s1[point.code >> 8] + (point.code & 0xff)]); |
| 73 | } | 63 | } |
| 74 | 64 | ||
| 75 | /// Returns the Word at the given index. Asserts that the index is less than | 65 | /// Returns the Word at the given index. Asserts that the index is less than |
| 76 | /// `string.len`, and that the string is not empty. Always returns a word. | 66 | /// `string.len`, and that the string is not empty. Always returns a word. |
| 77 | /// The index does not have to be the start of a codepoint in the word. | 67 | /// The index does not have to be the start of a codepoint in the word. |
| 78 | pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { | 68 | pub fn wordAtIndex(string: []const u8, index: usize) Word { |
| 79 | assert(index < string.len and string.len > 0); | 69 | assert(index < string.len and string.len > 0); |
| 80 | var iter_back: ReverseIterator = reverseFromIndex(words, string, index); | 70 | var iter_back: ReverseIterator = reverseFromIndex(string, index); |
| 81 | const first_back = iter_back.prev(); | 71 | const first_back = iter_back.prev(); |
| 82 | if (first_back) |back| { | 72 | if (first_back) |back| { |
| 83 | if (back.offset == 0) { | 73 | if (back.offset == 0) { |
| 84 | var iter_fwd = words.iterator(string); | 74 | var iter_fwd = Words.iterator(string); |
| 85 | while (iter_fwd.next()) |word| { | 75 | while (iter_fwd.next()) |word| { |
| 86 | if (word.offset <= index and index < word.offset + word.len) | 76 | if (word.offset <= index and index < word.offset + word.len) |
| 87 | return word; | 77 | return word; |
| 88 | } | 78 | } |
| 89 | } | 79 | } |
| 90 | } else { | 80 | } else { |
| 91 | var iter_fwd = words.iterator(string); | 81 | var iter_fwd = Words.iterator(string); |
| 92 | while (iter_fwd.next()) |word| { | 82 | while (iter_fwd.next()) |word| { |
| 93 | if (word.offset <= index and index < word.offset + word.len) | 83 | if (word.offset <= index and index < word.offset + word.len) |
| 94 | return word; | 84 | return word; |
| @@ -114,23 +104,23 @@ pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { | |||
| 114 | } | 104 | } |
| 115 | 105 | ||
| 116 | /// Returns an iterator over words in `slice`. | 106 | /// Returns an iterator over words in `slice`. |
| 117 | pub fn iterator(words: *const Words, slice: []const u8) Iterator { | 107 | pub fn iterator(slice: []const u8) Iterator { |
| 118 | return Iterator.init(words, slice); | 108 | return Iterator.init(slice); |
| 119 | } | 109 | } |
| 120 | 110 | ||
| 121 | /// Returns a reverse iterator over the words in `slice`. | 111 | /// Returns a reverse iterator over the words in `slice`. |
| 122 | pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { | 112 | pub fn reverseIterator(slice: []const u8) ReverseIterator { |
| 123 | return ReverseIterator.init(words, slice); | 113 | return ReverseIterator.init(slice); |
| 124 | } | 114 | } |
| 125 | 115 | ||
| 126 | /// Returns an iterator after the `word` in `slice`. | 116 | /// Returns an iterator after the `word` in `slice`. |
| 127 | pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { | 117 | pub fn iterateAfterWord(slice: []const u8, word: Word) Iterator { |
| 128 | return forwardFromIndex(words, slice, word.offset + word.len); | 118 | return forwardFromIndex(slice, word.offset + word.len); |
| 129 | } | 119 | } |
| 130 | 120 | ||
| 131 | /// Returns a reverse iterator before the `word` in `slice`. | 121 | /// Returns a reverse iterator before the `word` in `slice`. |
| 132 | pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { | 122 | pub fn iterateBeforeWord(slice: []const u8, word: Word) ReverseIterator { |
| 133 | return reverseFromIndex(words, slice, word.offset); | 123 | return reverseFromIndex(slice, word.offset); |
| 134 | } | 124 | } |
| 135 | 125 | ||
| 136 | /// An iterator, forward, over all words in a provided string. | 126 | /// An iterator, forward, over all words in a provided string. |
| @@ -138,11 +128,10 @@ pub const Iterator = struct { | |||
| 138 | this: ?CodePoint = null, | 128 | this: ?CodePoint = null, |
| 139 | that: ?CodePoint = null, | 129 | that: ?CodePoint = null, |
| 140 | cp_iter: CodepointIterator, | 130 | cp_iter: CodepointIterator, |
| 141 | wb: *const Words, | ||
| 142 | 131 | ||
| 143 | /// Assumes `str` is valid UTF-8. | 132 | /// Assumes `str` is valid UTF-8. |
| 144 | pub fn init(words: *const Words, str: []const u8) Iterator { | 133 | pub fn init(str: []const u8) Iterator { |
| 145 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; | 134 | var wb_iter: Iterator = .{ .cp_iter = .init(str) }; |
| 146 | wb_iter.advance(); | 135 | wb_iter.advance(); |
| 147 | return wb_iter; | 136 | return wb_iter; |
| 148 | } | 137 | } |
| @@ -166,7 +155,6 @@ pub const Iterator = struct { | |||
| 166 | if (iter.cp_iter.peek()) |_| | 155 | if (iter.cp_iter.peek()) |_| |
| 167 | _ = cp_it.prev(); | 156 | _ = cp_it.prev(); |
| 168 | return .{ | 157 | return .{ |
| 169 | .wb = iter.wb, | ||
| 170 | .before = cp_it.prev(), | 158 | .before = cp_it.prev(), |
| 171 | .after = iter.that, | 159 | .after = iter.that, |
| 172 | .cp_iter = cp_it, | 160 | .cp_iter = cp_it, |
| @@ -194,8 +182,8 @@ pub const Iterator = struct { | |||
| 194 | const this = iter.this.?; | 182 | const this = iter.this.?; |
| 195 | word_len += this.len; | 183 | word_len += this.len; |
| 196 | if (iter.that) |that| { | 184 | if (iter.that) |that| { |
| 197 | const this_p = iter.wb.breakProp(this); | 185 | const this_p = Words.breakProp(this); |
| 198 | const that_p = iter.wb.breakProp(that); | 186 | const that_p = Words.breakProp(that); |
| 199 | if (!isIgnorable(this_p)) { | 187 | if (!isIgnorable(this_p)) { |
| 200 | last_last_p = last_p; | 188 | last_last_p = last_p; |
| 201 | last_p = this_p; | 189 | last_p = this_p; |
| @@ -223,7 +211,7 @@ pub const Iterator = struct { | |||
| 223 | if (isMidVal(that_p)) { | 211 | if (isMidVal(that_p)) { |
| 224 | const next_val = iter.peekPast(); | 212 | const next_val = iter.peekPast(); |
| 225 | if (next_val) |next_cp| { | 213 | if (next_val) |next_cp| { |
| 226 | const next_p = iter.wb.breakProp(next_cp); | 214 | const next_p = Words.breakProp(next_cp); |
| 227 | if (isAHLetter(next_p)) { | 215 | if (isAHLetter(next_p)) { |
| 228 | continue :scan; | 216 | continue :scan; |
| 229 | } | 217 | } |
| @@ -241,7 +229,7 @@ pub const Iterator = struct { | |||
| 241 | if (that_p == .Double_Quote) { | 229 | if (that_p == .Double_Quote) { |
| 242 | const next_val = iter.peekPast(); | 230 | const next_val = iter.peekPast(); |
| 243 | if (next_val) |next_cp| { | 231 | if (next_val) |next_cp| { |
| 244 | const next_p = iter.wb.breakProp(next_cp); | 232 | const next_p = Words.breakProp(next_cp); |
| 245 | if (next_p == .Hebrew_Letter) { | 233 | if (next_p == .Hebrew_Letter) { |
| 246 | continue :scan; | 234 | continue :scan; |
| 247 | } | 235 | } |
| @@ -264,7 +252,7 @@ pub const Iterator = struct { | |||
| 264 | if (last_p == .Numeric and isMidNum(that_p)) { | 252 | if (last_p == .Numeric and isMidNum(that_p)) { |
| 265 | const next_val = iter.peekPast(); | 253 | const next_val = iter.peekPast(); |
| 266 | if (next_val) |next_cp| { | 254 | if (next_val) |next_cp| { |
| 267 | const next_p = iter.wb.breakProp(next_cp); | 255 | const next_p = Words.breakProp(next_cp); |
| 268 | if (next_p == .Numeric) { | 256 | if (next_p == .Numeric) { |
| 269 | continue :scan; | 257 | continue :scan; |
| 270 | } | 258 | } |
| @@ -308,7 +296,7 @@ pub const Iterator = struct { | |||
| 308 | const save_cp = iter.cp_iter; | 296 | const save_cp = iter.cp_iter; |
| 309 | defer iter.cp_iter = save_cp; | 297 | defer iter.cp_iter = save_cp; |
| 310 | while (iter.cp_iter.peek()) |peeked| { | 298 | while (iter.cp_iter.peek()) |peeked| { |
| 311 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | 299 | if (!isIgnorable(Words.breakProp(peeked))) return peeked; |
| 312 | _ = iter.cp_iter.next(); | 300 | _ = iter.cp_iter.next(); |
| 313 | } | 301 | } |
| 314 | return null; | 302 | return null; |
| @@ -320,12 +308,11 @@ pub const ReverseIterator = struct { | |||
| 320 | after: ?CodePoint = null, | 308 | after: ?CodePoint = null, |
| 321 | before: ?CodePoint = null, | 309 | before: ?CodePoint = null, |
| 322 | cp_iter: ReverseCodepointIterator, | 310 | cp_iter: ReverseCodepointIterator, |
| 323 | wb: *const Words, | ||
| 324 | flags: usize = 0, | 311 | flags: usize = 0, |
| 325 | 312 | ||
| 326 | /// Assumes `str` is valid UTF-8. | 313 | /// Assumes `str` is valid UTF-8. |
| 327 | pub fn init(words: *const Words, str: []const u8) ReverseIterator { | 314 | pub fn init(str: []const u8) ReverseIterator { |
| 328 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; | 315 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str) }; |
| 329 | wb_iter.advance(); | 316 | wb_iter.advance(); |
| 330 | return wb_iter; | 317 | return wb_iter; |
| 331 | } | 318 | } |
| @@ -347,7 +334,6 @@ pub const ReverseIterator = struct { | |||
| 347 | if (iter.before) |_| | 334 | if (iter.before) |_| |
| 348 | _ = cp_it.next(); | 335 | _ = cp_it.next(); |
| 349 | return .{ | 336 | return .{ |
| 350 | .wb = iter.wb, | ||
| 351 | .this = cp_it.next(), | 337 | .this = cp_it.next(), |
| 352 | .that = iter.after, | 338 | .that = iter.after, |
| 353 | .cp_iter = cp_it, | 339 | .cp_iter = cp_it, |
| @@ -375,8 +361,8 @@ pub const ReverseIterator = struct { | |||
| 375 | word_len += after.len; | 361 | word_len += after.len; |
| 376 | if (iter.before) |before| { | 362 | if (iter.before) |before| { |
| 377 | var sneak = sneaky(iter); // 'sneaks' past ignorables | 363 | var sneak = sneaky(iter); // 'sneaks' past ignorables |
| 378 | const after_p = iter.wb.breakProp(after); | 364 | const after_p = Words.breakProp(after); |
| 379 | var before_p = iter.wb.breakProp(before); | 365 | var before_p = Words.breakProp(before); |
| 380 | if (!isIgnorable(after_p)) { | 366 | if (!isIgnorable(after_p)) { |
| 381 | last_last_p = last_p; | 367 | last_last_p = last_p; |
| 382 | last_p = after_p; | 368 | last_p = after_p; |
| @@ -397,7 +383,7 @@ pub const ReverseIterator = struct { | |||
| 397 | if (isIgnorable(before_p)) { | 383 | if (isIgnorable(before_p)) { |
| 398 | const maybe_before = sneak.prev(); | 384 | const maybe_before = sneak.prev(); |
| 399 | if (maybe_before) |valid_before| { | 385 | if (maybe_before) |valid_before| { |
| 400 | before_p = iter.wb.breakProp(valid_before); | 386 | before_p = Words.breakProp(valid_before); |
| 401 | } else if (!isIgnorable(after_p)) { | 387 | } else if (!isIgnorable(after_p)) { |
| 402 | // We're done | 388 | // We're done |
| 403 | break :scan; | 389 | break :scan; |
| @@ -416,7 +402,7 @@ pub const ReverseIterator = struct { | |||
| 416 | if (isMidVal(before_p) and isAHLetter(last_p)) { | 402 | if (isMidVal(before_p) and isAHLetter(last_p)) { |
| 417 | const prev_val = sneak.peek(); | 403 | const prev_val = sneak.peek(); |
| 418 | if (prev_val) |prev_cp| { | 404 | if (prev_val) |prev_cp| { |
| 419 | const prev_p = iter.wb.breakProp(prev_cp); | 405 | const prev_p = Words.breakProp(prev_cp); |
| 420 | if (isAHLetter(prev_p)) { | 406 | if (isAHLetter(prev_p)) { |
| 421 | continue :scan; | 407 | continue :scan; |
| 422 | } | 408 | } |
| @@ -432,7 +418,7 @@ pub const ReverseIterator = struct { | |||
| 432 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { | 418 | if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { |
| 433 | const prev_val = sneak.peek(); | 419 | const prev_val = sneak.peek(); |
| 434 | if (prev_val) |prev_cp| { | 420 | if (prev_val) |prev_cp| { |
| 435 | const prev_p = iter.wb.breakProp(prev_cp); | 421 | const prev_p = Words.breakProp(prev_cp); |
| 436 | if (prev_p == .Hebrew_Letter) { | 422 | if (prev_p == .Hebrew_Letter) { |
| 437 | continue :scan; | 423 | continue :scan; |
| 438 | } | 424 | } |
| @@ -448,7 +434,7 @@ pub const ReverseIterator = struct { | |||
| 448 | if (isMidNum(before_p) and last_p == .Numeric) { | 434 | if (isMidNum(before_p) and last_p == .Numeric) { |
| 449 | const prev_val = sneak.peek(); | 435 | const prev_val = sneak.peek(); |
| 450 | if (prev_val) |prev_cp| { | 436 | if (prev_val) |prev_cp| { |
| 451 | const prev_p = iter.wb.breakProp(prev_cp); | 437 | const prev_p = Words.breakProp(prev_cp); |
| 452 | if (prev_p == .Numeric) { | 438 | if (prev_p == .Numeric) { |
| 453 | continue :scan; | 439 | continue :scan; |
| 454 | } | 440 | } |
| @@ -491,7 +477,7 @@ pub const ReverseIterator = struct { | |||
| 491 | return Word{ .len = word_len, .offset = word_end - word_len }; | 477 | return Word{ .len = word_len, .offset = word_end - word_len }; |
| 492 | } | 478 | } |
| 493 | 479 | ||
| 494 | pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | 480 | pub fn format(iter: ReverseIterator, writer: anytype) !void { |
| 495 | try writer.print( | 481 | try writer.print( |
| 496 | "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", | 482 | "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", |
| 497 | .{ iter.before, iter.after, iter.flags }, | 483 | .{ iter.before, iter.after, iter.flags }, |
| @@ -502,7 +488,7 @@ pub const ReverseIterator = struct { | |||
| 502 | const save_cp = iter.cp_iter; | 488 | const save_cp = iter.cp_iter; |
| 503 | defer iter.cp_iter = save_cp; | 489 | defer iter.cp_iter = save_cp; |
| 504 | while (iter.cp_iter.peek()) |peeked| { | 490 | while (iter.cp_iter.peek()) |peeked| { |
| 505 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | 491 | if (!isIgnorable(Words.breakProp(peeked))) return peeked; |
| 506 | _ = iter.cp_iter.prev(); | 492 | _ = iter.cp_iter.prev(); |
| 507 | } | 493 | } |
| 508 | return null; | 494 | return null; |
| @@ -517,13 +503,12 @@ pub const ReverseIterator = struct { | |||
| 517 | //| Implementation Details | 503 | //| Implementation Details |
| 518 | 504 | ||
| 519 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | 505 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. |
| 520 | fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { | 506 | fn reverseFromIndex(string: []const u8, index: usize) ReverseIterator { |
| 521 | var idx: uoffset = @intCast(index); | 507 | var idx: uoffset = @intCast(index); |
| 522 | // Find the next lead byte: | 508 | // Find the next lead byte: |
| 523 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | 509 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} |
| 524 | if (idx == string.len) return words.reverseIterator(string); | 510 | if (idx == string.len) return Words.reverseIterator(string); |
| 525 | var iter: ReverseIterator = undefined; | 511 | var iter: ReverseIterator = undefined; |
| 526 | iter.wb = words; | ||
| 527 | iter.flags = 0; | 512 | iter.flags = 0; |
| 528 | // We need to populate the CodePoints, and the codepoint iterator. | 513 | // We need to populate the CodePoints, and the codepoint iterator. |
| 529 | // Consider "abc| def" with the cursor as |. | 514 | // Consider "abc| def" with the cursor as |. |
| @@ -536,20 +521,18 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever | |||
| 536 | return iter; | 521 | return iter; |
| 537 | } | 522 | } |
| 538 | 523 | ||
| 539 | fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { | 524 | fn forwardFromIndex(string: []const u8, index: usize) Iterator { |
| 540 | var idx: uoffset = @intCast(index); | 525 | var idx: uoffset = @intCast(index); |
| 541 | if (idx == string.len) { | 526 | if (idx == string.len) { |
| 542 | return .{ | 527 | return .{ |
| 543 | .cp_iter = .{ .bytes = string, .i = idx }, | 528 | .cp_iter = .{ .bytes = string, .i = idx }, |
| 544 | .this = null, | 529 | .this = null, |
| 545 | .that = null, | 530 | .that = null, |
| 546 | .wb = words, | ||
| 547 | }; | 531 | }; |
| 548 | } | 532 | } |
| 549 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} | 533 | while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} |
| 550 | if (idx == 0) return words.iterator(string); | 534 | if (idx == 0) return Words.iterator(string); |
| 551 | var iter: Iterator = undefined; | 535 | var iter: Iterator = undefined; |
| 552 | iter.wb = words; | ||
| 553 | // We need to populate the CodePoints, and the codepoint iterator. | 536 | // We need to populate the CodePoints, and the codepoint iterator. |
| 554 | // Consider "abc |def" with the cursor as |. | 537 | // Consider "abc |def" with the cursor as |. |
| 555 | // We need `this` to be ` ` and `that` to be 'd', | 538 | // We need `this` to be ` ` and `that` to be 'd', |
| @@ -565,18 +548,17 @@ fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Itera | |||
| 565 | } | 548 | } |
| 566 | 549 | ||
| 567 | fn sneaky(iter: *const ReverseIterator) SneakIterator { | 550 | fn sneaky(iter: *const ReverseIterator) SneakIterator { |
| 568 | return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; | 551 | return .{ .cp_iter = iter.cp_iter }; |
| 569 | } | 552 | } |
| 570 | 553 | ||
| 571 | const SneakIterator = struct { | 554 | const SneakIterator = struct { |
| 572 | cp_iter: ReverseCodepointIterator, | 555 | cp_iter: ReverseCodepointIterator, |
| 573 | wb: *const Words, | ||
| 574 | 556 | ||
| 575 | fn peek(iter: *SneakIterator) ?CodePoint { | 557 | fn peek(iter: *SneakIterator) ?CodePoint { |
| 576 | const save_cp = iter.cp_iter; | 558 | const save_cp = iter.cp_iter; |
| 577 | defer iter.cp_iter = save_cp; | 559 | defer iter.cp_iter = save_cp; |
| 578 | while (iter.cp_iter.peek()) |peeked| { | 560 | while (iter.cp_iter.peek()) |peeked| { |
| 579 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | 561 | if (!isIgnorable(Words.breakProp(peeked))) return peeked; |
| 580 | _ = iter.cp_iter.prev(); | 562 | _ = iter.cp_iter.prev(); |
| 581 | } | 563 | } |
| 582 | return null; | 564 | return null; |
| @@ -587,7 +569,7 @@ const SneakIterator = struct { | |||
| 587 | const save_cp = iter.cp_iter; | 569 | const save_cp = iter.cp_iter; |
| 588 | defer iter.cp_iter = save_cp; | 570 | defer iter.cp_iter = save_cp; |
| 589 | while (iter.cp_iter.prev()) |cp| { | 571 | while (iter.cp_iter.prev()) |cp| { |
| 590 | const prop = iter.wb.breakProp(cp); | 572 | const prop = Words.breakProp(cp); |
| 591 | if (isIgnorable(prop)) continue; | 573 | if (isIgnorable(prop)) continue; |
| 592 | if (prop == .Regional_Indicator) { | 574 | if (prop == .Regional_Indicator) { |
| 593 | flags += 1; | 575 | flags += 1; |
| @@ -598,73 +580,49 @@ const SneakIterator = struct { | |||
| 598 | 580 | ||
| 599 | fn prev(iter: *SneakIterator) ?CodePoint { | 581 | fn prev(iter: *SneakIterator) ?CodePoint { |
| 600 | while (iter.cp_iter.prev()) |peeked| { | 582 | while (iter.cp_iter.prev()) |peeked| { |
| 601 | if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; | 583 | if (!isIgnorable(Words.breakProp(peeked))) return peeked; |
| 602 | } | 584 | } |
| 603 | return null; | 585 | return null; |
| 604 | } | 586 | } |
| 605 | }; | 587 | }; |
| 606 | 588 | ||
| 607 | inline fn setupImpl(wb: *Words, allocator: Allocator) !void { | ||
| 608 | const in_bytes = @embedFile("wbp"); | ||
| 609 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 610 | var reader = in_fbs.reader(); | ||
| 611 | |||
| 612 | const endian = builtin.cpu.arch.endian(); | ||
| 613 | |||
| 614 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 615 | wb.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 616 | errdefer allocator.free(wb.s1); | ||
| 617 | for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); | ||
| 618 | |||
| 619 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 620 | wb.s2 = try allocator.alloc(u5, stage_2_len); | ||
| 621 | errdefer allocator.free(wb.s2); | ||
| 622 | for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 623 | var count_0: usize = 0; | ||
| 624 | for (wb.s2) |nyb| { | ||
| 625 | if (nyb == 0) count_0 += 1; | ||
| 626 | } | ||
| 627 | } | ||
| 628 | |||
| 629 | //| Predicates | 589 | //| Predicates |
| 630 | 590 | ||
| 631 | inline fn isNewline(wbp: WordBreakProperty) bool { | 591 | inline fn isNewline(w_prop: WordBreakProperty) bool { |
| 632 | return wbp == .CR or wbp == .LF or wbp == .Newline; | 592 | return w_prop == .CR or w_prop == .LF or w_prop == .Newline; |
| 633 | } | 593 | } |
| 634 | 594 | ||
| 635 | inline fn isIgnorable(wbp: WordBreakProperty) bool { | 595 | inline fn isIgnorable(w_prop: WordBreakProperty) bool { |
| 636 | return switch (wbp) { | 596 | return switch (w_prop) { |
| 637 | .Format, .Extend, .ZWJ => true, | 597 | .Format, .Extend, .ZWJ => true, |
| 638 | else => false, | 598 | else => false, |
| 639 | }; | 599 | }; |
| 640 | } | 600 | } |
| 641 | 601 | ||
| 642 | inline fn isAHLetter(wbp: WordBreakProperty) bool { | 602 | inline fn isAHLetter(w_prop: WordBreakProperty) bool { |
| 643 | return wbp == .ALetter or wbp == .Hebrew_Letter; | 603 | return w_prop == .ALetter or w_prop == .Hebrew_Letter; |
| 644 | } | 604 | } |
| 645 | 605 | ||
| 646 | inline fn isMidVal(wbp: WordBreakProperty) bool { | 606 | inline fn isMidVal(w_prop: WordBreakProperty) bool { |
| 647 | return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; | 607 | return w_prop == .MidLetter or w_prop == .MidNumLet or w_prop == .Single_Quote; |
| 648 | } | 608 | } |
| 649 | 609 | ||
| 650 | inline fn isMidNum(wbp: WordBreakProperty) bool { | 610 | inline fn isMidNum(w_prop: WordBreakProperty) bool { |
| 651 | return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; | 611 | return w_prop == .MidNum or w_prop == .MidNumLet or w_prop == .Single_Quote; |
| 652 | } | 612 | } |
| 653 | 613 | ||
| 654 | inline fn isExtensible(wbp: WordBreakProperty) bool { | 614 | inline fn isExtensible(w_prop: WordBreakProperty) bool { |
| 655 | return switch (wbp) { | 615 | return switch (w_prop) { |
| 656 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, | 616 | .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, |
| 657 | else => false, | 617 | else => false, |
| 658 | }; | 618 | }; |
| 659 | } | 619 | } |
| 660 | 620 | ||
| 661 | test "Word Break Properties" { | 621 | test "Word Break Properties" { |
| 662 | const wb = try Words.init(testing.allocator); | 622 | try testing.expectEqual(.CR, Words.breakProperty('\r')); |
| 663 | defer wb.deinit(testing.allocator); | 623 | try testing.expectEqual(.LF, Words.breakProperty('\n')); |
| 664 | try testing.expectEqual(.CR, wb.breakProperty('\r')); | 624 | try testing.expectEqual(.Hebrew_Letter, Words.breakProperty('ש')); |
| 665 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | 625 | try testing.expectEqual(.Katakana, Words.breakProperty('\u{30ff}')); |
| 666 | try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); | ||
| 667 | try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); | ||
| 668 | } | 626 | } |
| 669 | 627 | ||
| 670 | test "ext_pict" { | 628 | test "ext_pict" { |
| @@ -673,16 +631,14 @@ test "ext_pict" { | |||
| 673 | } | 631 | } |
| 674 | 632 | ||
| 675 | test "Words" { | 633 | test "Words" { |
| 676 | const wb = try Words.init(testing.allocator); | ||
| 677 | defer wb.deinit(testing.allocator); | ||
| 678 | const word_str = "Metonym Μετωνύμιο メトニム"; | 634 | const word_str = "Metonym Μετωνύμιο メトニム"; |
| 679 | var w_iter = wb.iterator(word_str); | 635 | var w_iter = Words.iterator(word_str); |
| 680 | try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); | 636 | try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); |
| 681 | // Spaces are "words" too! | 637 | // Spaces are "words" too! |
| 682 | try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); | 638 | try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); |
| 683 | const in_greek = w_iter.next().?; | 639 | const in_greek = w_iter.next().?; |
| 684 | for (in_greek.offset..in_greek.offset + in_greek.len) |i| { | 640 | for (in_greek.offset..in_greek.offset + in_greek.len) |i| { |
| 685 | const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); | 641 | const at_index = Words.wordAtIndex(word_str, i).bytes(word_str); |
| 686 | try testing.expectEqualStrings("Μετωνύμιο", at_index); | 642 | try testing.expectEqualStrings("Μετωνύμιο", at_index); |
| 687 | } | 643 | } |
| 688 | _ = w_iter.next(); | 644 | _ = w_iter.next(); |
| @@ -690,32 +646,28 @@ test "Words" { | |||
| 690 | } | 646 | } |
| 691 | 647 | ||
| 692 | test wordAtIndex { | 648 | test wordAtIndex { |
| 693 | const wb = try Words.init(testing.allocator); | ||
| 694 | defer wb.deinit(testing.allocator); | ||
| 695 | const t_string = "first second third"; | 649 | const t_string = "first second third"; |
| 696 | const second = wb.wordAtIndex(t_string, 8); | 650 | const second = Words.wordAtIndex(t_string, 8); |
| 697 | try testing.expectEqualStrings("second", second.bytes(t_string)); | 651 | try testing.expectEqualStrings("second", second.bytes(t_string)); |
| 698 | const third = wb.wordAtIndex(t_string, 14); | 652 | const third = Words.wordAtIndex(t_string, 14); |
| 699 | try testing.expectEqualStrings("third", third.bytes(t_string)); | 653 | try testing.expectEqualStrings("third", third.bytes(t_string)); |
| 700 | { | 654 | { |
| 701 | const first = wb.wordAtIndex(t_string, 3); | 655 | const first = Words.wordAtIndex(t_string, 3); |
| 702 | try testing.expectEqualStrings("first", first.bytes(t_string)); | 656 | try testing.expectEqualStrings("first", first.bytes(t_string)); |
| 703 | } | 657 | } |
| 704 | { | 658 | { |
| 705 | const first = wb.wordAtIndex(t_string, 0); | 659 | const first = Words.wordAtIndex(t_string, 0); |
| 706 | try testing.expectEqualStrings("first", first.bytes(t_string)); | 660 | try testing.expectEqualStrings("first", first.bytes(t_string)); |
| 707 | } | 661 | } |
| 708 | const last = wb.wordAtIndex(t_string, 14); | 662 | const last = Words.wordAtIndex(t_string, 14); |
| 709 | try testing.expectEqualStrings("third", last.bytes(t_string)); | 663 | try testing.expectEqualStrings("third", last.bytes(t_string)); |
| 710 | } | 664 | } |
| 711 | 665 | ||
| 712 | const testr = "don't a:ka fin!"; | 666 | const testr = "don't a:ka fin!"; |
| 713 | 667 | ||
| 714 | test "reversal" { | 668 | test "reversal" { |
| 715 | const wb = try Words.init(testing.allocator); | ||
| 716 | defer wb.deinit(testing.allocator); | ||
| 717 | { | 669 | { |
| 718 | var fwd = wb.iterator(testr); | 670 | var fwd = Words.iterator(testr); |
| 719 | var this_word: ?Word = fwd.next(); | 671 | var this_word: ?Word = fwd.next(); |
| 720 | 672 | ||
| 721 | while (this_word) |this| : (this_word = fwd.next()) { | 673 | while (this_word) |this| : (this_word = fwd.next()) { |
| @@ -729,7 +681,7 @@ test "reversal" { | |||
| 729 | } | 681 | } |
| 730 | } | 682 | } |
| 731 | { | 683 | { |
| 732 | var back = wb.reverseIterator(testr); | 684 | var back = Words.reverseIterator(testr); |
| 733 | var this_word: ?Word = back.prev(); | 685 | var this_word: ?Word = back.prev(); |
| 734 | 686 | ||
| 735 | while (this_word) |this| : (this_word = back.prev()) { | 687 | while (this_word) |this| : (this_word = back.prev()) { |
| @@ -744,15 +696,6 @@ test "reversal" { | |||
| 744 | } | 696 | } |
| 745 | } | 697 | } |
| 746 | 698 | ||
| 747 | fn testAllocations(allocator: Allocator) !void { | ||
| 748 | const wb = try Words.init(allocator); | ||
| 749 | wb.deinit(allocator); | ||
| 750 | } | ||
| 751 | |||
| 752 | test "allocation safety" { | ||
| 753 | try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); | ||
| 754 | } | ||
| 755 | |||
| 756 | const std = @import("std"); | 699 | const std = @import("std"); |
| 757 | const builtin = @import("builtin"); | 700 | const builtin = @import("builtin"); |
| 758 | const compress = std.compress; | 701 | const compress = std.compress; |
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 946c197..50b8824 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -255,8 +255,6 @@ test "Segmentation GraphemeIterator" { | |||
| 255 | test "Segmentation Word Iterator" { | 255 | test "Segmentation Word Iterator" { |
| 256 | const allocator = std.testing.allocator; | 256 | const allocator = std.testing.allocator; |
| 257 | var reader = std.io.Reader.fixed(@embedFile("WordBreakTest.txt")); | 257 | var reader = std.io.Reader.fixed(@embedFile("WordBreakTest.txt")); |
| 258 | const wb = try Words.init(allocator); | ||
| 259 | defer wb.deinit(allocator); | ||
| 260 | 258 | ||
| 261 | var line_iter: IterRead = .{ .read = &reader }; | 259 | var line_iter: IterRead = .{ .read = &reader }; |
| 262 | 260 | ||
| @@ -297,7 +295,7 @@ test "Segmentation Word Iterator" { | |||
| 297 | const this_str = all_bytes.items; | 295 | const this_str = all_bytes.items; |
| 298 | 296 | ||
| 299 | { | 297 | { |
| 300 | var iter = wb.iterator(this_str); | 298 | var iter = Words.iterator(this_str); |
| 301 | var peeked: ?Word = iter.peek(); | 299 | var peeked: ?Word = iter.peek(); |
| 302 | 300 | ||
| 303 | // Check. | 301 | // Check. |
| @@ -330,7 +328,7 @@ test "Segmentation Word Iterator" { | |||
| 330 | } else { | 328 | } else { |
| 331 | try testing.expect(false); | 329 | try testing.expect(false); |
| 332 | } | 330 | } |
| 333 | var peek_iter = wb.iterateAfterWord(this_str, got_word); | 331 | var peek_iter = Words.iterateAfterWord(this_str, got_word); |
| 334 | const peek_1 = peek_iter.next(); | 332 | const peek_1 = peek_iter.next(); |
| 335 | if (peek_1) |p1| { | 333 | if (peek_1) |p1| { |
| 336 | const peek_2 = iter.peek(); | 334 | const peek_2 = iter.peek(); |
| @@ -350,7 +348,7 @@ test "Segmentation Word Iterator" { | |||
| 350 | try testing.expectEqual(null, iter.peek()); | 348 | try testing.expectEqual(null, iter.peek()); |
| 351 | } | 349 | } |
| 352 | for (got_word.offset..got_word.offset + got_word.len) |i| { | 350 | for (got_word.offset..got_word.offset + got_word.len) |i| { |
| 353 | const this_word = wb.wordAtIndex(this_str, i); | 351 | const this_word = Words.wordAtIndex(this_str, i); |
| 354 | std.testing.expectEqualSlices( | 352 | std.testing.expectEqualSlices( |
| 355 | u8, | 353 | u8, |
| 356 | got_word.bytes(this_str), | 354 | got_word.bytes(this_str), |
| @@ -364,7 +362,7 @@ test "Segmentation Word Iterator" { | |||
| 364 | } | 362 | } |
| 365 | } | 363 | } |
| 366 | { | 364 | { |
| 367 | var r_iter = wb.reverseIterator(this_str); | 365 | var r_iter = Words.reverseIterator(this_str); |
| 368 | var peeked: ?Word = r_iter.peek(); | 366 | var peeked: ?Word = r_iter.peek(); |
| 369 | var idx = want.items.len - 1; | 367 | var idx = want.items.len - 1; |
| 370 | 368 | ||
| @@ -399,7 +397,7 @@ test "Segmentation Word Iterator" { | |||
| 399 | } else { | 397 | } else { |
| 400 | try testing.expect(false); | 398 | try testing.expect(false); |
| 401 | } | 399 | } |
| 402 | var peek_iter = wb.iterateBeforeWord(this_str, got_word); | 400 | var peek_iter = Words.iterateBeforeWord(this_str, got_word); |
| 403 | const peek_1 = peek_iter.prev(); | 401 | const peek_1 = peek_iter.prev(); |
| 404 | if (peek_1) |p1| { | 402 | if (peek_1) |p1| { |
| 405 | const peek_2 = r_iter.peek(); | 403 | const peek_2 = r_iter.peek(); |
| @@ -419,7 +417,7 @@ test "Segmentation Word Iterator" { | |||
| 419 | try testing.expectEqual(null, r_iter.peek()); | 417 | try testing.expectEqual(null, r_iter.peek()); |
| 420 | } | 418 | } |
| 421 | for (got_word.offset..got_word.offset + got_word.len) |i| { | 419 | for (got_word.offset..got_word.offset + got_word.len) |i| { |
| 422 | const this_word = wb.wordAtIndex(this_str, i); | 420 | const this_word = Words.wordAtIndex(this_str, i); |
| 423 | std.testing.expectEqualSlices( | 421 | std.testing.expectEqualSlices( |
| 424 | u8, | 422 | u8, |
| 425 | got_word.bytes(this_str), | 423 | got_word.bytes(this_str), |