diff options
| author | 2025-05-16 12:06:36 -0400 | |
|---|---|---|
| committer | 2025-05-16 12:06:36 -0400 | |
| commit | aa20bebade8eeb3ca75199dc252feb3edb203fb1 (patch) | |
| tree | 2e832616bbf554ca3a20588d050c0dc764f4cf65 /src | |
| parent | Move WordBreak to Words (diff) | |
| download | zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.gz zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.tar.xz zg-aa20bebade8eeb3ca75199dc252feb3edb203fb1.zip | |
Words module
In keeping with the new nomenclature, we're calling the module "Words",
not "WordBreak". The latter is Unicode jargon; the module provides word
iterators. Words are the figure, word breaks are the ground.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Words.zig | 42 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 6 |
2 files changed, 24 insertions, 24 deletions
diff --git a/src/Words.zig b/src/Words.zig index 6a532f5..565a2fb 100644 --- a/src/Words.zig +++ b/src/Words.zig | |||
| @@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) { | |||
| 25 | s1: []u16 = undefined, | 25 | s1: []u16 = undefined, |
| 26 | s2: []u5 = undefined, | 26 | s2: []u5 = undefined, |
| 27 | 27 | ||
| 28 | const WordBreak = @This(); | 28 | const Words = @This(); |
| 29 | 29 | ||
| 30 | pub fn init(allocator: Allocator) Allocator.Error!WordBreak { | 30 | pub fn init(allocator: Allocator) Allocator.Error!Words { |
| 31 | var wb: WordBreak = undefined; | 31 | var wb: Words = undefined; |
| 32 | try wb.setup(allocator); | 32 | try wb.setup(allocator); |
| 33 | return wb; | 33 | return wb; |
| 34 | } | 34 | } |
| 35 | 35 | ||
| 36 | pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | 36 | pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { |
| 37 | wb.setupImpl(allocator) catch |err| { | 37 | wb.setupImpl(allocator) catch |err| { |
| 38 | switch (err) { | 38 | switch (err) { |
| 39 | error.OutOfMemory => |e| return e, | 39 | error.OutOfMemory => |e| return e, |
| @@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | |||
| 42 | }; | 42 | }; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | 45 | pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { |
| 46 | allocator.free(wordbreak.s1); | 46 | allocator.free(wordbreak.s1); |
| 47 | allocator.free(wordbreak.s2); | 47 | allocator.free(wordbreak.s2); |
| 48 | } | 48 | } |
| @@ -60,19 +60,19 @@ pub const Word = struct { | |||
| 60 | }; | 60 | }; |
| 61 | 61 | ||
| 62 | /// Returns the word break property type for `cp`. | 62 | /// Returns the word break property type for `cp`. |
| 63 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | 63 | pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { |
| 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | /// Convenience function for working with CodePoints | 67 | /// Convenience function for working with CodePoints |
| 68 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | 68 | fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { |
| 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | /// Returns the Word at the given index. Asserts that the index is less than | 72 | /// Returns the Word at the given index. Asserts that the index is less than |
| 73 | /// `string.len`, and that the string is not empty. Always returns a word. | 73 | /// `string.len`, and that the string is not empty. Always returns a word. |
| 74 | /// The index does not have to be the start of a codepoint in the word. | 74 | /// The index does not have to be the start of a codepoint in the word. |
| 75 | pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | 75 | pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { |
| 76 | assert(index < string.len and string.len > 0); | 76 | assert(index < string.len and string.len > 0); |
| 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); | 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); |
| 78 | const first_back = iter_back.prev(); | 78 | const first_back = iter_back.prev(); |
| @@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize | |||
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | /// Returns an iterator over words in `slice`. | 120 | /// Returns an iterator over words in `slice`. |
| 121 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | 121 | pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { |
| 122 | return Iterator.init(wordbreak, slice); | 122 | return Iterator.init(wordbreak, slice); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | /// Returns a reverse iterator over the words in `slice`. | 125 | /// Returns a reverse iterator over the words in `slice`. |
| 126 | pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { | 126 | pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { |
| 127 | return ReverseIterator.init(wordbreak, slice); | 127 | return ReverseIterator.init(wordbreak, slice); |
| 128 | } | 128 | } |
| 129 | 129 | ||
| @@ -132,10 +132,10 @@ pub const Iterator = struct { | |||
| 132 | this: ?CodePoint = null, | 132 | this: ?CodePoint = null, |
| 133 | that: ?CodePoint = null, | 133 | that: ?CodePoint = null, |
| 134 | cp_iter: CodepointIterator, | 134 | cp_iter: CodepointIterator, |
| 135 | wb: *const WordBreak, | 135 | wb: *const Words, |
| 136 | 136 | ||
| 137 | /// Assumes `str` is valid UTF-8. | 137 | /// Assumes `str` is valid UTF-8. |
| 138 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 138 | pub fn init(wb: *const Words, str: []const u8) Iterator { |
| 139 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; | 139 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 140 | wb_iter.advance(); | 140 | wb_iter.advance(); |
| 141 | return wb_iter; | 141 | return wb_iter; |
| @@ -314,11 +314,11 @@ pub const ReverseIterator = struct { | |||
| 314 | after: ?CodePoint = null, | 314 | after: ?CodePoint = null, |
| 315 | before: ?CodePoint = null, | 315 | before: ?CodePoint = null, |
| 316 | cp_iter: ReverseCodepointIterator, | 316 | cp_iter: ReverseCodepointIterator, |
| 317 | wb: *const WordBreak, | 317 | wb: *const Words, |
| 318 | flags: usize = 0, | 318 | flags: usize = 0, |
| 319 | 319 | ||
| 320 | /// Assumes `str` is valid UTF-8. | 320 | /// Assumes `str` is valid UTF-8. |
| 321 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { | 321 | pub fn init(wb: *const Words, str: []const u8) ReverseIterator { |
| 322 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; | 322 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 323 | wb_iter.advance(); | 323 | wb_iter.advance(); |
| 324 | return wb_iter; | 324 | return wb_iter; |
| @@ -511,7 +511,7 @@ pub const ReverseIterator = struct { | |||
| 511 | //| Implementation Details | 511 | //| Implementation Details |
| 512 | 512 | ||
| 513 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | 513 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. |
| 514 | fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { | 514 | fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { |
| 515 | var idx: u32 = @intCast(index); | 515 | var idx: u32 = @intCast(index); |
| 516 | // Find the next lead byte: | 516 | // Find the next lead byte: |
| 517 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | 517 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} |
| @@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator { | |||
| 536 | 536 | ||
| 537 | const SneakIterator = struct { | 537 | const SneakIterator = struct { |
| 538 | cp_iter: ReverseCodepointIterator, | 538 | cp_iter: ReverseCodepointIterator, |
| 539 | wb: *const WordBreak, | 539 | wb: *const Words, |
| 540 | 540 | ||
| 541 | fn peek(iter: *SneakIterator) ?CodePoint { | 541 | fn peek(iter: *SneakIterator) ?CodePoint { |
| 542 | const save_cp = iter.cp_iter; | 542 | const save_cp = iter.cp_iter; |
| @@ -570,7 +570,7 @@ const SneakIterator = struct { | |||
| 570 | } | 570 | } |
| 571 | }; | 571 | }; |
| 572 | 572 | ||
| 573 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | 573 | inline fn setupImpl(wb: *Words, allocator: Allocator) !void { |
| 574 | const decompressor = compress.flate.inflate.decompressor; | 574 | const decompressor = compress.flate.inflate.decompressor; |
| 575 | const in_bytes = @embedFile("wbp"); | 575 | const in_bytes = @embedFile("wbp"); |
| 576 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 576 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool { | |||
| 627 | } | 627 | } |
| 628 | 628 | ||
| 629 | test "Word Break Properties" { | 629 | test "Word Break Properties" { |
| 630 | const wb = try WordBreak.init(testing.allocator); | 630 | const wb = try Words.init(testing.allocator); |
| 631 | defer wb.deinit(testing.allocator); | 631 | defer wb.deinit(testing.allocator); |
| 632 | try testing.expectEqual(.CR, wb.breakProperty('\r')); | 632 | try testing.expectEqual(.CR, wb.breakProperty('\r')); |
| 633 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | 633 | try testing.expectEqual(.LF, wb.breakProperty('\n')); |
| @@ -641,7 +641,7 @@ test "ext_pict" { | |||
| 641 | } | 641 | } |
| 642 | 642 | ||
| 643 | test wordAtIndex { | 643 | test wordAtIndex { |
| 644 | const wb = try WordBreak.init(testing.allocator); | 644 | const wb = try Words.init(testing.allocator); |
| 645 | defer wb.deinit(testing.allocator); | 645 | defer wb.deinit(testing.allocator); |
| 646 | const t_string = "first second third"; | 646 | const t_string = "first second third"; |
| 647 | const second = wb.wordAtIndex(t_string, 8); | 647 | const second = wb.wordAtIndex(t_string, 8); |
| @@ -663,7 +663,7 @@ test wordAtIndex { | |||
| 663 | const testr = "don't a:ka fin!"; | 663 | const testr = "don't a:ka fin!"; |
| 664 | 664 | ||
| 665 | test "reversal" { | 665 | test "reversal" { |
| 666 | const wb = try WordBreak.init(testing.allocator); | 666 | const wb = try Words.init(testing.allocator); |
| 667 | defer wb.deinit(testing.allocator); | 667 | defer wb.deinit(testing.allocator); |
| 668 | { | 668 | { |
| 669 | var fwd = wb.iterator(testr); | 669 | var fwd = wb.iterator(testr); |
| @@ -696,7 +696,7 @@ test "reversal" { | |||
| 696 | } | 696 | } |
| 697 | 697 | ||
| 698 | fn testAllocations(allocator: Allocator) !void { | 698 | fn testAllocations(allocator: Allocator) !void { |
| 699 | const wb = try WordBreak.init(allocator); | 699 | const wb = try Words.init(allocator); |
| 700 | wb.deinit(allocator); | 700 | wb.deinit(allocator); |
| 701 | } | 701 | } |
| 702 | 702 | ||
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7139d4c..18f1814 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -211,7 +211,7 @@ test "Segmentation Word Iterator" { | |||
| 211 | var buf_reader = std.io.bufferedReader(file.reader()); | 211 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 212 | var input_stream = buf_reader.reader(); | 212 | var input_stream = buf_reader.reader(); |
| 213 | 213 | ||
| 214 | const wb = try WordBreak.init(allocator); | 214 | const wb = try Words.init(allocator); |
| 215 | defer wb.deinit(allocator); | 215 | defer wb.deinit(allocator); |
| 216 | 216 | ||
| 217 | var buf: [4096]u8 = undefined; | 217 | var buf: [4096]u8 = undefined; |
| @@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes"); | |||
| 392 | const GraphemeIterator = @import("Graphemes").Iterator; | 392 | const GraphemeIterator = @import("Graphemes").Iterator; |
| 393 | const Normalize = @import("Normalize"); | 393 | const Normalize = @import("Normalize"); |
| 394 | 394 | ||
| 395 | const WordBreak = @import("WordBreak"); | 395 | const Words = @import("Words"); |
| 396 | const Word = WordBreak.Word; | 396 | const Word = Words.Word; |