diff options
Diffstat (limited to 'src/WordBreak.zig')
| -rw-r--r-- | src/WordBreak.zig | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig new file mode 100644 index 0000000..9044740 --- /dev/null +++ b/src/WordBreak.zig | |||
| @@ -0,0 +1,102 @@ | |||
| 1 | //! Word Breaking Algorithm. | ||
| 2 | |||
/// Word break property value stored per code point.
///
/// Tag order is significant: `breakProperty` turns the raw integer read from
/// the stage-2 table into this enum with `@enumFromInt`, so each tag's
/// position in this list is its encoded value (`none` is 0, `WSegSpace` is 18).
/// Do not reorder tags without regenerating the embedded table.
///
/// NOTE(review): the tag names appear to mirror the Word_Break property
/// values of Unicode UAX #29 — confirm against the table generator.
const WordBreakProperty = enum(u5) {
    /// Encoded as 0.
    none,

    Double_Quote,
    Single_Quote,
    Hebrew_Letter,

    CR,
    LF,
    Newline,

    Extend,
    Regional_Indicator,
    Format,
    Katakana,
    ALetter,

    MidLetter,
    MidNum,
    MidNumLet,

    Numeric,
    ExtendNumLet,
    ZWJ,
    WSegSpace,
};
| 24 | |||
/// Stage-1 table. `breakProperty` indexes it with `cp >> 8`; each entry is a
/// base offset into `s2`. Allocated in `setupImpl`, released in `deinit`.
s1: []u16 = undefined,

/// Stage-2 table of raw `WordBreakProperty` values. `breakProperty` reads the
/// entry at `s1[cp >> 8] + (cp & 0xff)`. Allocated in `setupImpl`, released in
/// `deinit`.
s2: []u5 = undefined,

/// This file is itself the `WordBreak` struct.
const WordBreak = @This();
| 29 | |||
/// Creates a `WordBreak` whose lookup tables are allocated from `allocator`.
///
/// The only error returned is `error.OutOfMemory`. On success the tables stay
/// allocated until `deinit` is called with the same allocator.
pub fn init(allocator: Allocator) Allocator.Error!WordBreak {
    // Left undefined on purpose: `setup` fills in every field.
    var result: WordBreak = undefined;
    try result.setup(allocator);
    return result;
}
| 35 | |||
/// Loads the lookup tables into an existing `WordBreak`.
///
/// Narrows the inferred error set of `setupImpl` down to `Allocator.Error`:
/// an out-of-memory failure is passed on to the caller, and every other error
/// is declared `unreachable`.
pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
    wb.setupImpl(allocator) catch |err| switch (err) {
        error.OutOfMemory => |oom| return oom,
        // Any non-allocation failure would come from reading the embedded
        // data, which this function treats as impossible.
        else => unreachable,
    };
}
| 44 | |||
/// Inflates the embedded `wbp` blob and fills both lookup tables from it.
///
/// Stream layout, as read here: a `u16` entry count followed by that many
/// `u16` stage-1 entries, then a `u16` entry count followed by that many
/// one-byte stage-2 entries. Integers are read in the target's native byte
/// order.
///
/// `wb` is written only once both tables have been read completely, so a
/// failure part-way through never leaves `wb` pointing at memory that the
/// `errdefer`s below have already freed.
///
/// Errors: `error.OutOfMemory` from `allocator`, plus whatever the
/// decompressing reader can return.
inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("wbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    const stage_1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(stage_1);
    for (stage_1) |*entry| entry.* = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    const stage_2 = try allocator.alloc(u5, stage_2_len);
    errdefer allocator.free(stage_2);
    // Each property is stored as a whole byte; `@intCast` asserts it fits `u5`.
    for (stage_2) |*entry| entry.* = @intCast(try reader.readInt(u8, endian));

    // Nothing below can fail, so ownership passes to `wb` here.
    wb.s1 = stage_1;
    wb.s2 = stage_2;
}
| 68 | |||
/// Releases both lookup tables.
///
/// `allocator` must be the allocator the tables were created with, i.e. the
/// one given to `init` or `setup`. The slices must not be used afterwards.
pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
    const stage_1 = wordbreak.s1;
    const stage_2 = wordbreak.s2;
    allocator.free(stage_1);
    allocator.free(stage_2);
}
| 73 | |||
/// Looks up the word break property of code point `cp`.
///
/// Two-step table walk: the bits of `cp` above the low byte pick a stage-1
/// entry, which is the base offset of a run in stage 2; the low byte of `cp`
/// selects the entry within that run.
pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
    const block_base = wordbreak.s1[cp >> 8];
    const within_block = cp & 0xff;
    const raw = wordbreak.s2[block_base + within_block];
    return @enumFromInt(raw);
}
| 78 | |||
test "Word Break Properties" {
    const wb = try WordBreak.init(testing.allocator);
    defer wb.deinit(testing.allocator);

    // Spot checks: one code point per expected property, checked in order.
    const Case = struct { expected: WordBreakProperty, cp: u21 };
    const cases = [_]Case{
        .{ .expected = .CR, .cp = '\r' },
        .{ .expected = .LF, .cp = '\n' },
        .{ .expected = .Hebrew_Letter, .cp = 'ש' },
        .{ .expected = .Katakana, .cp = '\u{30ff}' },
    };

    for (cases) |case| {
        try testing.expectEqual(case.expected, wb.breakProperty(case.cp));
    }
}
| 87 | |||
/// Body for the allocation-failure test: builds a `WordBreak` from
/// `allocator` and tears it down again straight away.
fn testAllocations(allocator: Allocator) !void {
    const word_break = try WordBreak.init(allocator);
    defer word_break.deinit(allocator);
}
| 92 | |||
test "allocation safety" {
    // Exercises `testAllocations` under induced allocation failures, backed
    // by the leak-checking test allocator.
    const backing = testing.allocator;
    try testing.checkAllAllocationFailures(backing, testAllocations, .{});
}
| 96 | |||
| 97 | const std = @import("std"); | ||
| 98 | const builtin = @import("builtin"); | ||
| 99 | const compress = std.compress; | ||
| 100 | const mem = std.mem; | ||
| 101 | const Allocator = mem.Allocator; | ||
| 102 | const testing = std.testing; | ||