diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Words.zig | 42 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 6 |
2 files changed, 24 insertions, 24 deletions
diff --git a/src/Words.zig b/src/Words.zig index 6a532f5..565a2fb 100644 --- a/src/Words.zig +++ b/src/Words.zig | |||
| @@ -25,15 +25,15 @@ const WordBreakProperty = enum(u5) { | |||
| 25 | s1: []u16 = undefined, | 25 | s1: []u16 = undefined, |
| 26 | s2: []u5 = undefined, | 26 | s2: []u5 = undefined, |
| 27 | 27 | ||
| 28 | const WordBreak = @This(); | 28 | const Words = @This(); |
| 29 | 29 | ||
| 30 | pub fn init(allocator: Allocator) Allocator.Error!WordBreak { | 30 | pub fn init(allocator: Allocator) Allocator.Error!Words { |
| 31 | var wb: WordBreak = undefined; | 31 | var wb: Words = undefined; |
| 32 | try wb.setup(allocator); | 32 | try wb.setup(allocator); |
| 33 | return wb; | 33 | return wb; |
| 34 | } | 34 | } |
| 35 | 35 | ||
| 36 | pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | 36 | pub fn setup(wb: *Words, allocator: Allocator) Allocator.Error!void { |
| 37 | wb.setupImpl(allocator) catch |err| { | 37 | wb.setupImpl(allocator) catch |err| { |
| 38 | switch (err) { | 38 | switch (err) { |
| 39 | error.OutOfMemory => |e| return e, | 39 | error.OutOfMemory => |e| return e, |
| @@ -42,7 +42,7 @@ pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { | |||
| 42 | }; | 42 | }; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { | 45 | pub fn deinit(wordbreak: *const Words, allocator: mem.Allocator) void { |
| 46 | allocator.free(wordbreak.s1); | 46 | allocator.free(wordbreak.s1); |
| 47 | allocator.free(wordbreak.s2); | 47 | allocator.free(wordbreak.s2); |
| 48 | } | 48 | } |
| @@ -60,19 +60,19 @@ pub const Word = struct { | |||
| 60 | }; | 60 | }; |
| 61 | 61 | ||
| 62 | /// Returns the word break property type for `cp`. | 62 | /// Returns the word break property type for `cp`. |
| 63 | pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { | 63 | pub fn breakProperty(wordbreak: *const Words, cp: u21) WordBreakProperty { |
| 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); | 64 | return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | /// Convenience function for working with CodePoints | 67 | /// Convenience function for working with CodePoints |
| 68 | fn breakProp(wb: *const WordBreak, point: CodePoint) WordBreakProperty { | 68 | fn breakProp(wb: *const Words, point: CodePoint) WordBreakProperty { |
| 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); | 69 | return @enumFromInt(wb.s2[wb.s1[point.code >> 8] + (point.code & 0xff)]); |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | /// Returns the Word at the given index. Asserts that the index is less than | 72 | /// Returns the Word at the given index. Asserts that the index is less than |
| 73 | /// `string.len`, and that the string is not empty. Always returns a word. | 73 | /// `string.len`, and that the string is not empty. Always returns a word. |
| 74 | /// The index does not have to be the start of a codepoint in the word. | 74 | /// The index does not have to be the start of a codepoint in the word. |
| 75 | pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize) Word { | 75 | pub fn wordAtIndex(wordbreak: *const Words, string: []const u8, index: usize) Word { |
| 76 | assert(index < string.len and string.len > 0); | 76 | assert(index < string.len and string.len > 0); |
| 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); | 77 | var iter_back: ReverseIterator = initAtIndex(wordbreak, string, index); |
| 78 | const first_back = iter_back.prev(); | 78 | const first_back = iter_back.prev(); |
| @@ -118,12 +118,12 @@ pub fn wordAtIndex(wordbreak: *const WordBreak, string: []const u8, index: usize | |||
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | /// Returns an iterator over words in `slice`. | 120 | /// Returns an iterator over words in `slice`. |
| 121 | pub fn iterator(wordbreak: *const WordBreak, slice: []const u8) Iterator { | 121 | pub fn iterator(wordbreak: *const Words, slice: []const u8) Iterator { |
| 122 | return Iterator.init(wordbreak, slice); | 122 | return Iterator.init(wordbreak, slice); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | /// Returns a reverse iterator over the words in `slice`. | 125 | /// Returns a reverse iterator over the words in `slice`. |
| 126 | pub fn reverseIterator(wordbreak: *const WordBreak, slice: []const u8) ReverseIterator { | 126 | pub fn reverseIterator(wordbreak: *const Words, slice: []const u8) ReverseIterator { |
| 127 | return ReverseIterator.init(wordbreak, slice); | 127 | return ReverseIterator.init(wordbreak, slice); |
| 128 | } | 128 | } |
| 129 | 129 | ||
| @@ -132,10 +132,10 @@ pub const Iterator = struct { | |||
| 132 | this: ?CodePoint = null, | 132 | this: ?CodePoint = null, |
| 133 | that: ?CodePoint = null, | 133 | that: ?CodePoint = null, |
| 134 | cp_iter: CodepointIterator, | 134 | cp_iter: CodepointIterator, |
| 135 | wb: *const WordBreak, | 135 | wb: *const Words, |
| 136 | 136 | ||
| 137 | /// Assumes `str` is valid UTF-8. | 137 | /// Assumes `str` is valid UTF-8. |
| 138 | pub fn init(wb: *const WordBreak, str: []const u8) Iterator { | 138 | pub fn init(wb: *const Words, str: []const u8) Iterator { |
| 139 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; | 139 | var wb_iter: Iterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 140 | wb_iter.advance(); | 140 | wb_iter.advance(); |
| 141 | return wb_iter; | 141 | return wb_iter; |
| @@ -314,11 +314,11 @@ pub const ReverseIterator = struct { | |||
| 314 | after: ?CodePoint = null, | 314 | after: ?CodePoint = null, |
| 315 | before: ?CodePoint = null, | 315 | before: ?CodePoint = null, |
| 316 | cp_iter: ReverseCodepointIterator, | 316 | cp_iter: ReverseCodepointIterator, |
| 317 | wb: *const WordBreak, | 317 | wb: *const Words, |
| 318 | flags: usize = 0, | 318 | flags: usize = 0, |
| 319 | 319 | ||
| 320 | /// Assumes `str` is valid UTF-8. | 320 | /// Assumes `str` is valid UTF-8. |
| 321 | pub fn init(wb: *const WordBreak, str: []const u8) ReverseIterator { | 321 | pub fn init(wb: *const Words, str: []const u8) ReverseIterator { |
| 322 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; | 322 | var wb_iter: ReverseIterator = .{ .cp_iter = .init(str), .wb = wb }; |
| 323 | wb_iter.advance(); | 323 | wb_iter.advance(); |
| 324 | return wb_iter; | 324 | return wb_iter; |
| @@ -511,7 +511,7 @@ pub const ReverseIterator = struct { | |||
| 511 | //| Implementation Details | 511 | //| Implementation Details |
| 512 | 512 | ||
| 513 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. | 513 | /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. |
| 514 | fn initAtIndex(wb: *const WordBreak, string: []const u8, index: usize) ReverseIterator { | 514 | fn initAtIndex(wb: *const Words, string: []const u8, index: usize) ReverseIterator { |
| 515 | var idx: u32 = @intCast(index); | 515 | var idx: u32 = @intCast(index); |
| 516 | // Find the next lead byte: | 516 | // Find the next lead byte: |
| 517 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} | 517 | while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} |
| @@ -536,7 +536,7 @@ fn sneaky(iter: *const ReverseIterator) SneakIterator { | |||
| 536 | 536 | ||
| 537 | const SneakIterator = struct { | 537 | const SneakIterator = struct { |
| 538 | cp_iter: ReverseCodepointIterator, | 538 | cp_iter: ReverseCodepointIterator, |
| 539 | wb: *const WordBreak, | 539 | wb: *const Words, |
| 540 | 540 | ||
| 541 | fn peek(iter: *SneakIterator) ?CodePoint { | 541 | fn peek(iter: *SneakIterator) ?CodePoint { |
| 542 | const save_cp = iter.cp_iter; | 542 | const save_cp = iter.cp_iter; |
| @@ -570,7 +570,7 @@ const SneakIterator = struct { | |||
| 570 | } | 570 | } |
| 571 | }; | 571 | }; |
| 572 | 572 | ||
| 573 | inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { | 573 | inline fn setupImpl(wb: *Words, allocator: Allocator) !void { |
| 574 | const decompressor = compress.flate.inflate.decompressor; | 574 | const decompressor = compress.flate.inflate.decompressor; |
| 575 | const in_bytes = @embedFile("wbp"); | 575 | const in_bytes = @embedFile("wbp"); |
| 576 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 576 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -627,7 +627,7 @@ inline fn isExtensible(wbp: WordBreakProperty) bool { | |||
| 627 | } | 627 | } |
| 628 | 628 | ||
| 629 | test "Word Break Properties" { | 629 | test "Word Break Properties" { |
| 630 | const wb = try WordBreak.init(testing.allocator); | 630 | const wb = try Words.init(testing.allocator); |
| 631 | defer wb.deinit(testing.allocator); | 631 | defer wb.deinit(testing.allocator); |
| 632 | try testing.expectEqual(.CR, wb.breakProperty('\r')); | 632 | try testing.expectEqual(.CR, wb.breakProperty('\r')); |
| 633 | try testing.expectEqual(.LF, wb.breakProperty('\n')); | 633 | try testing.expectEqual(.LF, wb.breakProperty('\n')); |
| @@ -641,7 +641,7 @@ test "ext_pict" { | |||
| 641 | } | 641 | } |
| 642 | 642 | ||
| 643 | test wordAtIndex { | 643 | test wordAtIndex { |
| 644 | const wb = try WordBreak.init(testing.allocator); | 644 | const wb = try Words.init(testing.allocator); |
| 645 | defer wb.deinit(testing.allocator); | 645 | defer wb.deinit(testing.allocator); |
| 646 | const t_string = "first second third"; | 646 | const t_string = "first second third"; |
| 647 | const second = wb.wordAtIndex(t_string, 8); | 647 | const second = wb.wordAtIndex(t_string, 8); |
| @@ -663,7 +663,7 @@ test wordAtIndex { | |||
| 663 | const testr = "don't a:ka fin!"; | 663 | const testr = "don't a:ka fin!"; |
| 664 | 664 | ||
| 665 | test "reversal" { | 665 | test "reversal" { |
| 666 | const wb = try WordBreak.init(testing.allocator); | 666 | const wb = try Words.init(testing.allocator); |
| 667 | defer wb.deinit(testing.allocator); | 667 | defer wb.deinit(testing.allocator); |
| 668 | { | 668 | { |
| 669 | var fwd = wb.iterator(testr); | 669 | var fwd = wb.iterator(testr); |
| @@ -696,7 +696,7 @@ test "reversal" { | |||
| 696 | } | 696 | } |
| 697 | 697 | ||
| 698 | fn testAllocations(allocator: Allocator) !void { | 698 | fn testAllocations(allocator: Allocator) !void { |
| 699 | const wb = try WordBreak.init(allocator); | 699 | const wb = try Words.init(allocator); |
| 700 | wb.deinit(allocator); | 700 | wb.deinit(allocator); |
| 701 | } | 701 | } |
| 702 | 702 | ||
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7139d4c..18f1814 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -211,7 +211,7 @@ test "Segmentation Word Iterator" { | |||
| 211 | var buf_reader = std.io.bufferedReader(file.reader()); | 211 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 212 | var input_stream = buf_reader.reader(); | 212 | var input_stream = buf_reader.reader(); |
| 213 | 213 | ||
| 214 | const wb = try WordBreak.init(allocator); | 214 | const wb = try Words.init(allocator); |
| 215 | defer wb.deinit(allocator); | 215 | defer wb.deinit(allocator); |
| 216 | 216 | ||
| 217 | var buf: [4096]u8 = undefined; | 217 | var buf: [4096]u8 = undefined; |
| @@ -392,5 +392,5 @@ const Graphemes = @import("Graphemes"); | |||
| 392 | const GraphemeIterator = @import("Graphemes").Iterator; | 392 | const GraphemeIterator = @import("Graphemes").Iterator; |
| 393 | const Normalize = @import("Normalize"); | 393 | const Normalize = @import("Normalize"); |
| 394 | 394 | ||
| 395 | const WordBreak = @import("WordBreak"); | 395 | const Words = @import("Words"); |
| 396 | const Word = WordBreak.Word; | 396 | const Word = Words.Word; |