summary refs log tree commit diff
path: root/src/WordBreak.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/WordBreak.zig')
-rw-r--r-- src/WordBreak.zig 102
1 files changed, 102 insertions, 0 deletions
diff --git a/src/WordBreak.zig b/src/WordBreak.zig
new file mode 100644
index 0000000..9044740
--- /dev/null
+++ b/src/WordBreak.zig
@@ -0,0 +1,102 @@
1//! Word Breaking Algorithm.
2
/// Word break property assigned to a code point.
///
/// `breakProperty` turns a stage-2 table entry back into this enum with
/// `@enumFromInt`, so each member's integer tag is part of the data format:
/// the tags are spelled out explicitly and must stay in step with the
/// embedded table.
///
/// NOTE(review): member names presumably mirror the Unicode Word_Break
/// property values (UAX #29) — confirm against the data generator.
const WordBreakProperty = enum(u5) {
    none = 0,
    Double_Quote = 1,
    Single_Quote = 2,
    Hebrew_Letter = 3,
    CR = 4,
    LF = 5,
    Newline = 6,
    Extend = 7,
    Regional_Indicator = 8,
    Format = 9,
    Katakana = 10,
    ALetter = 11,
    MidLetter = 12,
    MidNum = 13,
    MidNumLet = 14,
    Numeric = 15,
    ExtendNumLet = 16,
    ZWJ = 17,
    WSegSpace = 18,
};
24
/// This file is itself the `WordBreak` struct.
const WordBreak = @This();

/// Stage-1 table, indexed by `cp >> 8`; each entry is the base offset that
/// `breakProperty` adds to `cp & 0xff` to index `s2`.
/// Allocated by `setup` and released by `deinit`.
s1: []u16 = undefined,
/// Stage-2 table holding the integer tags of `WordBreakProperty`.
/// Allocated by `setup` and released by `deinit`.
s2: []u5 = undefined,
29
/// Creates a `WordBreak` whose lookup tables are allocated with `allocator`.
///
/// Returns `error.OutOfMemory` if an allocation fails. On success the caller
/// owns the tables and releases them with `deinit`.
pub fn init(allocator: Allocator) Allocator.Error!WordBreak {
    // Left undefined on purpose: `setup` assigns both table fields.
    var instance: WordBreak = undefined;
    try WordBreak.setup(&instance, allocator);
    return instance;
}
35
/// Fills in the lookup tables of `wb` in place, allocating with `allocator`.
///
/// Only `error.OutOfMemory` is reported to the caller.
pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void {
    wb.setupImpl(allocator) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
        // Every other error from `setupImpl` is treated as impossible here;
        // the input is the table embedded at compile time.
        else => unreachable,
    };
}
44
/// Decompresses the embedded `wbp` data and loads it into `wb.s1` and `wb.s2`.
///
/// Stream layout as consumed here: a `u16` stage-1 length, that many `u16`
/// entries, a `u16` stage-2 length, then that many one-byte entries which are
/// narrowed to `u5`.
///
/// On error, any table already allocated by this call is freed before
/// returning. The error set is inferred; `setup` narrows it for callers.
inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("wbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    // NOTE(review): integers are read in the target CPU's native byte order;
    // presumably the data generator writes in the same order — confirm for
    // cross-compiled builds.
    const endian = builtin.cpu.arch.endian();

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    wb.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(wb.s1);
    for (wb.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    wb.s2 = try allocator.alloc(u5, stage_2_len);
    errdefer allocator.free(wb.s2);
    // Each stored byte is expected to fit in `u5`; `@intCast` asserts that.
    for (wb.s2) |*entry| entry.* = @intCast(try reader.readInt(u8, endian));
}
68
/// Frees both lookup tables with `allocator`.
///
/// The slices are passed straight to `allocator.free`, so `allocator` should
/// be the one they were allocated with.
pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void {
    const stage_1 = wordbreak.s1;
    const stage_2 = wordbreak.s2;
    allocator.free(stage_1);
    allocator.free(stage_2);
}
73
/// Returns the word break property for the code point `cp`.
///
/// Two-stage lookup: the high bits (`cp >> 8`) select a base offset in `s1`,
/// and the low byte (`cp & 0xff`) is added to it to index `s2`. No explicit
/// range check is made on `cp` beyond ordinary slice indexing.
pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty {
    const block_start = wordbreak.s1[cp >> 8];
    const offset_in_block = cp & 0xff;
    const raw_tag = wordbreak.s2[block_start + offset_in_block];
    return @enumFromInt(raw_tag);
}
78
test "Word Break Properties" {
    const wb = try WordBreak.init(testing.allocator);
    defer wb.deinit(testing.allocator);

    // Each case pairs a code point with the property the table should give it.
    const Case = struct { cp: u21, expected: WordBreakProperty };
    const cases = [_]Case{
        .{ .cp = '\r', .expected = .CR },
        .{ .cp = '\n', .expected = .LF },
        .{ .cp = 'ש', .expected = .Hebrew_Letter },
        .{ .cp = '\u{30ff}', .expected = .Katakana },
    };
    for (cases) |case| {
        try testing.expectEqual(case.expected, wb.breakProperty(case.cp));
    }
}
87
/// Body run by the allocation-failure test: builds a `WordBreak` with
/// `allocator` and tears it down again.
fn testAllocations(allocator: Allocator) !void {
    const instance = try WordBreak.init(allocator);
    defer instance.deinit(allocator);
}
92
test "allocation safety" {
    // Runs `testAllocations` under the standard library's allocation-failure
    // checker, backed by the testing allocator.
    return testing.checkAllAllocationFailures(
        testing.allocator,
        testAllocations,
        .{},
    );
}
96
97const std = @import("std");
98const builtin = @import("builtin");
99const compress = std.compress;
100const mem = std.mem;
101const Allocator = mem.Allocator;
102const testing = std.testing;