From 5199401c536d0b0032c5908c55d5c0bb34b76d12 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 15:31:50 -0500 Subject: Convert Words module to no-allocation --- src/Words.zig | 215 +++++++++++++++++++++------------------------------------- 1 file changed, 79 insertions(+), 136 deletions(-) (limited to 'src/Words.zig') diff --git a/src/Words.zig b/src/Words.zig index ce3203f..aeb25d1 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -3,6 +3,8 @@ //! https://www.unicode.org/reports/tr29/#Word_Boundaries //! +const Words = @This(); + const WordBreakProperty = enum(u5) { none, Double_Quote, @@ -25,30 +27,18 @@ const WordBreakProperty = enum(u5) { WSegSpace, }; -s1: []u16 = undefined, -s2: []u5 = undefined, - -const Words = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!Words { - var wb: Words = undefined; - try wb.setup(allocator); - return wb; -} +const Data = struct { + s1: []const u16 = undefined, + s2: []const u5 = undefined, +}; -pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { - wb.setupImpl(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } +const wbp = display_width: { + const data = @import("wbp"); + break :display_width Data{ + .s1 = &data.s1, + .s2 = &data.s2, }; -} - -pub fn deinit(words: *const Words, allocator: mem.Allocator) void { - allocator.free(words.s1); - allocator.free(words.s2); -} +}; /// Represents a Unicode word span, as an offset into the source string /// and the length of the word. @@ -63,32 +53,32 @@ pub const Word = struct { }; /// Returns the word break property type for `cp`. -pub fn breakProperty(words: *const Words, cp: u21) WordBreakProperty { - return @enumFromInt(words.s2[words.s1[cp >> 8] + (cp & 0xff)]); +pub fn breakProperty(cp: u21) WordBreakProperty { + return @enumFromInt(wbp.s2[wbp.s1[cp >> 8] + (cp & 0xff)]); } /// Convenience function for working with CodePoints -fn breakProp(words: *const Words, point: CodePoint) WordBreakProperty { - return @enumFromInt(words.s2[words.s1[point.code >> 8] + (point.code & 0xff)]); +fn breakProp(point: CodePoint) WordBreakProperty { + return @enumFromInt(wbp.s2[wbp.s1[point.code >> 8] + (point.code & 0xff)]); } /// Returns the Word at the given index. Asserts that the index is less than /// `string.len`, and that the string is not empty. Always returns a word. /// The index does not have to be the start of a codepoint in the word. -pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { +pub fn wordAtIndex(string: []const u8, index: usize) Word { assert(index < string.len and string.len > 0); - var iter_back: ReverseIterator = reverseFromIndex(words, string, index); + var iter_back: ReverseIterator = reverseFromIndex(string, index); const first_back = iter_back.prev(); if (first_back) |back| { if (back.offset == 0) { - var iter_fwd = words.iterator(string); + var iter_fwd = Words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; } } } else { - var iter_fwd = words.iterator(string); + var iter_fwd = Words.iterator(string); while (iter_fwd.next()) |word| { if (word.offset <= index and index < word.offset + word.len) return word; @@ -114,23 +104,23 @@ pub fn wordAtIndex(words: *const Words, string: []const u8, index: usize) Word { } /// Returns an iterator over words in `slice`. -pub fn iterator(words: *const Words, slice: []const u8) Iterator { - return Iterator.init(words, slice); +pub fn iterator(slice: []const u8) Iterator { + return Iterator.init(slice); } /// Returns a reverse iterator over the words in `slice`. -pub fn reverseIterator(words: *const Words, slice: []const u8) ReverseIterator { - return ReverseIterator.init(words, slice); +pub fn reverseIterator(slice: []const u8) ReverseIterator { + return ReverseIterator.init(slice); } /// Returns an iterator after the `word` in `slice`. -pub fn iterateAfterWord(words: *const Words, slice: []const u8, word: Word) Iterator { - return forwardFromIndex(words, slice, word.offset + word.len); +pub fn iterateAfterWord(slice: []const u8, word: Word) Iterator { + return forwardFromIndex(slice, word.offset + word.len); } /// Returns a reverse iterator before the `word` in `slice`. -pub fn iterateBeforeWord(words: *const Words, slice: []const u8, word: Word) ReverseIterator { - return reverseFromIndex(words, slice, word.offset); +pub fn iterateBeforeWord(slice: []const u8, word: Word) ReverseIterator { + return reverseFromIndex(slice, word.offset); } /// An iterator, forward, over all words in a provided string. @@ -138,11 +128,10 @@ pub const Iterator = struct { this: ?CodePoint = null, that: ?CodePoint = null, cp_iter: CodepointIterator, - wb: *const Words, /// Assumes `str` is valid UTF-8. - pub fn init(words: *const Words, str: []const u8) Iterator { - var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = words }; + pub fn init(str: []const u8) Iterator { + var wb_iter: Iterator = .{ .cp_iter = .init(str) }; wb_iter.advance(); return wb_iter; } @@ -166,7 +155,6 @@ pub const Iterator = struct { if (iter.cp_iter.peek()) |_| _ = cp_it.prev(); return .{ - .wb = iter.wb, .before = cp_it.prev(), .after = iter.that, .cp_iter = cp_it, @@ -194,8 +182,8 @@ pub const Iterator = struct { const this = iter.this.?; word_len += this.len; if (iter.that) |that| { - const this_p = iter.wb.breakProp(this); - const that_p = iter.wb.breakProp(that); + const this_p = Words.breakProp(this); + const that_p = Words.breakProp(that); if (!isIgnorable(this_p)) { last_last_p = last_p; last_p = this_p; @@ -223,7 +211,7 @@ pub const Iterator = struct { if (isMidVal(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); + const next_p = Words.breakProp(next_cp); if (isAHLetter(next_p)) { continue :scan; } @@ -241,7 +229,7 @@ pub const Iterator = struct { if (that_p == .Double_Quote) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); + const next_p = Words.breakProp(next_cp); if (next_p == .Hebrew_Letter) { continue :scan; } @@ -264,7 +252,7 @@ pub const Iterator = struct { if (last_p == .Numeric and isMidNum(that_p)) { const next_val = iter.peekPast(); if (next_val) |next_cp| { - const next_p = iter.wb.breakProp(next_cp); + const next_p = Words.breakProp(next_cp); if (next_p == .Numeric) { continue :scan; } @@ -308,7 +296,7 @@ pub const Iterator = struct { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + if (!isIgnorable(Words.breakProp(peeked))) return peeked; _ = iter.cp_iter.next(); } return null; @@ -320,12 +308,11 @@ pub const ReverseIterator = struct { after: ?CodePoint = null, before: ?CodePoint = null, cp_iter: ReverseCodepointIterator, - wb: *const Words, flags: usize = 0, /// Assumes `str` is valid UTF-8. - pub fn init(words: *const Words, str: []const u8) ReverseIterator { - var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = words }; + pub fn init(str: []const u8) ReverseIterator { + var wb_iter: ReverseIterator = .{ .cp_iter = .init(str) }; wb_iter.advance(); return wb_iter; } @@ -347,7 +334,6 @@ pub const ReverseIterator = struct { if (iter.before) |_| _ = cp_it.next(); return .{ - .wb = iter.wb, .this = cp_it.next(), .that = iter.after, .cp_iter = cp_it, @@ -375,8 +361,8 @@ pub const ReverseIterator = struct { word_len += after.len; if (iter.before) |before| { var sneak = sneaky(iter); // 'sneaks' past ignorables - const after_p = iter.wb.breakProp(after); - var before_p = iter.wb.breakProp(before); + const after_p = Words.breakProp(after); + var before_p = Words.breakProp(before); if (!isIgnorable(after_p)) { last_last_p = last_p; last_p = after_p; @@ -397,7 +383,7 @@ pub const ReverseIterator = struct { if (isIgnorable(before_p)) { const maybe_before = sneak.prev(); if (maybe_before) |valid_before| { - before_p = iter.wb.breakProp(valid_before); + before_p = Words.breakProp(valid_before); } else if (!isIgnorable(after_p)) { // We're done break :scan; @@ -416,7 +402,7 @@ pub const ReverseIterator = struct { if (isMidVal(before_p) and isAHLetter(last_p)) { const prev_val = sneak.peek(); if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); + const prev_p = Words.breakProp(prev_cp); if (isAHLetter(prev_p)) { continue :scan; } @@ -432,7 +418,7 @@ pub const ReverseIterator = struct { if (before_p == .Double_Quote and last_p == .Hebrew_Letter) { const prev_val = sneak.peek(); if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); + const prev_p = Words.breakProp(prev_cp); if (prev_p == .Hebrew_Letter) { continue :scan; } @@ -448,7 +434,7 @@ pub const ReverseIterator = struct { if (isMidNum(before_p) and last_p == .Numeric) { const prev_val = sneak.peek(); if (prev_val) |prev_cp| { - const prev_p = iter.wb.breakProp(prev_cp); + const prev_p = Words.breakProp(prev_cp); if (prev_p == .Numeric) { continue :scan; } @@ -491,7 +477,7 @@ pub const ReverseIterator = struct { return Word{ .len = word_len, .offset = word_end - word_len }; } - pub fn format(iter: ReverseIterator, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + pub fn format(iter: ReverseIterator, writer: anytype) !void { try writer.print( "ReverseIterator {{ .before = {any}, .after = {any}, .flags = {d} }}", .{ iter.before, iter.after, iter.flags }, @@ -502,7 +488,7 @@ pub const ReverseIterator = struct { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + if (!isIgnorable(Words.breakProp(peeked))) return peeked; _ = iter.cp_iter.prev(); } return null; @@ -517,13 +503,12 @@ pub const ReverseIterator = struct { //| Implementation Details /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. -fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { +fn reverseFromIndex(string: []const u8, index: usize) ReverseIterator { var idx: uoffset = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} - if (idx == string.len) return words.reverseIterator(string); + if (idx == string.len) return Words.reverseIterator(string); var iter: ReverseIterator = undefined; - iter.wb = words; iter.flags = 0; // We need to populate the CodePoints, and the codepoint iterator. // Consider "abc| def" with the cursor as |. @@ -536,20 +521,18 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever return iter; } -fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { +fn forwardFromIndex(string: []const u8, index: usize) Iterator { var idx: uoffset = @intCast(index); if (idx == string.len) { return .{ .cp_iter = .{ .bytes = string, .i = idx }, .this = null, .that = null, - .wb = words, }; } while (idx > 0 and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx -= 1) {} - if (idx == 0) return words.iterator(string); + if (idx == 0) return Words.iterator(string); var iter: Iterator = undefined; - iter.wb = words; // We need to populate the CodePoints, and the codepoint iterator. // Consider "abc |def" with the cursor as |. // We need `this` to be ` ` and `that` to be 'd', @@ -565,18 +548,17 @@ fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Itera } fn sneaky(iter: *const ReverseIterator) SneakIterator { - return .{ .cp_iter = iter.cp_iter, .wb = iter.wb }; + return .{ .cp_iter = iter.cp_iter }; } const SneakIterator = struct { cp_iter: ReverseCodepointIterator, - wb: *const Words, fn peek(iter: *SneakIterator) ?CodePoint { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.peek()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + if (!isIgnorable(Words.breakProp(peeked))) return peeked; _ = iter.cp_iter.prev(); } return null; @@ -587,7 +569,7 @@ const SneakIterator = struct { const save_cp = iter.cp_iter; defer iter.cp_iter = save_cp; while (iter.cp_iter.prev()) |cp| { - const prop = iter.wb.breakProp(cp); + const prop = Words.breakProp(cp); if (isIgnorable(prop)) continue; if (prop == .Regional_Indicator) { flags += 1; @@ -598,73 +580,49 @@ const SneakIterator = struct { fn prev(iter: *SneakIterator) ?CodePoint { while (iter.cp_iter.prev()) |peeked| { - if (!isIgnorable(iter.wb.breakProp(peeked))) return peeked; + if (!isIgnorable(Words.breakProp(peeked))) return peeked; } return null; } }; -inline fn setupImpl(wb: *Words, allocator: Allocator) !void { - const in_bytes = @embedFile("wbp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - const stage_1_len: u16 = try reader.readInt(u16, endian); - wb.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(wb.s1); - for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - wb.s2 = try allocator.alloc(u5, stage_2_len); - errdefer allocator.free(wb.s2); - for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); - var count_0: usize = 0; - for (wb.s2) |nyb| { - if (nyb == 0) count_0 += 1; - } -} - //| Predicates -inline fn isNewline(wbp: WordBreakProperty) bool { - return wbp == .CR or wbp == .LF or wbp == .Newline; +inline fn isNewline(w_prop: WordBreakProperty) bool { + return w_prop == .CR or w_prop == .LF or w_prop == .Newline; } -inline fn isIgnorable(wbp: WordBreakProperty) bool { - return switch (wbp) { +inline fn isIgnorable(w_prop: WordBreakProperty) bool { + return switch (w_prop) { .Format, .Extend, .ZWJ => true, else => false, }; } -inline fn isAHLetter(wbp: WordBreakProperty) bool { - return wbp == .ALetter or wbp == .Hebrew_Letter; +inline fn isAHLetter(w_prop: WordBreakProperty) bool { + return w_prop == .ALetter or w_prop == .Hebrew_Letter; } -inline fn isMidVal(wbp: WordBreakProperty) bool { - return wbp == .MidLetter or wbp == .MidNumLet or wbp == .Single_Quote; +inline fn isMidVal(w_prop: WordBreakProperty) bool { + return w_prop == .MidLetter or w_prop == .MidNumLet or w_prop == .Single_Quote; } -inline fn isMidNum(wbp: WordBreakProperty) bool { - return wbp == .MidNum or wbp == .MidNumLet or wbp == .Single_Quote; +inline fn isMidNum(w_prop: WordBreakProperty) bool { + return w_prop == .MidNum or w_prop == .MidNumLet or w_prop == .Single_Quote; } -inline fn isExtensible(wbp: WordBreakProperty) bool { - return switch (wbp) { +inline fn isExtensible(w_prop: WordBreakProperty) bool { + return switch (w_prop) { .ALetter, .Hebrew_Letter, .Katakana, .Numeric, .ExtendNumLet => true, else => false, }; } test "Word Break Properties" { - const wb = try Words.init(testing.allocator); - defer wb.deinit(testing.allocator); - try testing.expectEqual(.CR, wb.breakProperty('\r')); - try testing.expectEqual(.LF, wb.breakProperty('\n')); - try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); - try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); + try testing.expectEqual(.CR, Words.breakProperty('\r')); + try testing.expectEqual(.LF, Words.breakProperty('\n')); + try testing.expectEqual(.Hebrew_Letter, Words.breakProperty('ש')); + try testing.expectEqual(.Katakana, Words.breakProperty('\u{30ff}')); } test "ext_pict" { @@ -673,16 +631,14 @@ test "ext_pict" { } test "Words" { - const wb = try Words.init(testing.allocator); - defer wb.deinit(testing.allocator); const word_str = "Metonym Μετωνύμιο メトニム"; - var w_iter = wb.iterator(word_str); + var w_iter = Words.iterator(word_str); try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); // Spaces are "words" too! try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); const in_greek = w_iter.next().?; for (in_greek.offset..in_greek.offset + in_greek.len) |i| { - const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + const at_index = Words.wordAtIndex(word_str, i).bytes(word_str); try testing.expectEqualStrings("Μετωνύμιο", at_index); } _ = w_iter.next(); @@ -690,32 +646,28 @@ test "Words" { } test wordAtIndex { - const wb = try Words.init(testing.allocator); - defer wb.deinit(testing.allocator); const t_string = "first second third"; - const second = wb.wordAtIndex(t_string, 8); + const second = Words.wordAtIndex(t_string, 8); try testing.expectEqualStrings("second", second.bytes(t_string)); - const third = wb.wordAtIndex(t_string, 14); + const third = Words.wordAtIndex(t_string, 14); try testing.expectEqualStrings("third", third.bytes(t_string)); { - const first = wb.wordAtIndex(t_string, 3); + const first = Words.wordAtIndex(t_string, 3); try testing.expectEqualStrings("first", first.bytes(t_string)); } { - const first = wb.wordAtIndex(t_string, 0); + const first = Words.wordAtIndex(t_string, 0); try testing.expectEqualStrings("first", first.bytes(t_string)); } - const last = wb.wordAtIndex(t_string, 14); + const last = Words.wordAtIndex(t_string, 14); try testing.expectEqualStrings("third", last.bytes(t_string)); } const testr = "don't a:ka fin!"; test "reversal" { - const wb = try Words.init(testing.allocator); - defer wb.deinit(testing.allocator); { - var fwd = wb.iterator(testr); + var fwd = Words.iterator(testr); var this_word: ?Word = fwd.next(); while (this_word) |this| : (this_word = fwd.next()) { @@ -729,7 +681,7 @@ test "reversal" { } } { - var back = wb.reverseIterator(testr); + var back = Words.reverseIterator(testr); var this_word: ?Word = back.prev(); while (this_word) |this| : (this_word = back.prev()) { @@ -744,15 +696,6 @@ test "reversal" { } } -fn testAllocations(allocator: Allocator) !void { - const wb = try Words.init(allocator); - wb.deinit(allocator); -} - -test "allocation safety" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); -} - const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; -- cgit v1.2.3