From e3dbcc70688321e48ac31599105c51edac2736af Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Sun, 11 May 2025 16:30:47 -0400 Subject: Add WordBreakPropertyData Passes some simple lookup tests. --- src/WordBreak.zig | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/WordBreak.zig (limited to 'src/WordBreak.zig') diff --git a/src/WordBreak.zig b/src/WordBreak.zig new file mode 100644 index 0000000..9044740 --- /dev/null +++ b/src/WordBreak.zig @@ -0,0 +1,102 @@ +//! Word Breaking Algorithm. + +const WordBreakProperty = enum(u5) { + none, + Double_Quote, + Single_Quote, + Hebrew_Letter, + CR, + LF, + Newline, + Extend, + Regional_Indicator, + Format, + Katakana, + ALetter, + MidLetter, + MidNum, + MidNumLet, + Numeric, + ExtendNumLet, + ZWJ, + WSegSpace, +}; + +s1: []u16 = undefined, +s2: []u5 = undefined, + +const WordBreak = @This(); + +pub fn init(allocator: Allocator) Allocator.Error!WordBreak { + var wb: WordBreak = undefined; + try wb.setup(allocator); + return wb; +} + +pub fn setup(wb: *WordBreak, allocator: Allocator) Allocator.Error!void { + wb.setupImpl(allocator) catch |err| { + switch (err) { + error.OutOfMemory => |e| return e, + else => unreachable, + } + }; +} + +inline fn setupImpl(wb: *WordBreak, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("wbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + const stage_1_len: u16 = try reader.readInt(u16, endian); + wb.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(wb.s1); + for (0..stage_1_len) |i| wb.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + wb.s2 = try allocator.alloc(u5, stage_2_len); + errdefer allocator.free(wb.s2); + for (0..stage_2_len) |i| wb.s2[i] = @intCast(try reader.readInt(u8, endian)); + var count_0: usize = 0; + for (wb.s2) |nyb| { + if (nyb == 0) count_0 += 1; + } +} + +pub fn deinit(wordbreak: *const WordBreak, allocator: mem.Allocator) void { + allocator.free(wordbreak.s1); + allocator.free(wordbreak.s2); +} + +/// Returns the word break property type for `cp`. +pub fn breakProperty(wordbreak: *const WordBreak, cp: u21) WordBreakProperty { + return @enumFromInt(wordbreak.s2[wordbreak.s1[cp >> 8] + (cp & 0xff)]); +} + +test "Word Break Properties" { + const wb = try WordBreak.init(testing.allocator); + defer wb.deinit(testing.allocator); + try testing.expectEqual(.CR, wb.breakProperty('\r')); + try testing.expectEqual(.LF, wb.breakProperty('\n')); + try testing.expectEqual(.Hebrew_Letter, wb.breakProperty('ש')); + try testing.expectEqual(.Katakana, wb.breakProperty('\u{30ff}')); +} + +fn testAllocations(allocator: Allocator) !void { + const wb = try WordBreak.init(allocator); + wb.deinit(allocator); +} + +test "allocation safety" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); +} + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const Allocator = mem.Allocator; +const testing = std.testing; -- cgit v1.2.3