From bfb31cbc33716220b42bb398471840a4fbed0d89 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Mon, 12 Feb 2024 10:51:34 -0400 Subject: Using Trie super slow --- src/Grapheme.zig | 7 ++++-- src/gbp.zig | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 6 ++++- src/trie.zig | 49 +++++++++++++++++++++++++++++++++-------- 4 files changed, 117 insertions(+), 12 deletions(-) create mode 100644 src/gbp.zig (limited to 'src') diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a8a7638..73f6d57 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -1,13 +1,15 @@ //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. const std = @import("std"); +const mem = std.mem; const unicode = std.unicode; const CodePoint = @import("ziglyph").CodePoint; const CodePointIterator = CodePoint.CodePointIterator; const emoji = @import("ziglyph").emoji; -const gbp = @import("gbp"); +// const gbp = @import("gbp"); +const gbp = @import("gbp.zig"); pub const Grapheme = @This(); @@ -32,7 +34,8 @@ pub const GraphemeIterator = struct { const Self = @This(); /// Assumes `src` is valid UTF-8. - pub fn init(str: []const u8) Self { + pub fn init(allocator: mem.Allocator, str: []const u8) !Self { + try gbp.init(allocator); var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; self.buf[1] = self.cp_iter.next(); diff --git a/src/gbp.zig b/src/gbp.zig new file mode 100644 index 0000000..fa4ad54 --- /dev/null +++ b/src/gbp.zig @@ -0,0 +1,67 @@ +const std = @import("std"); +const mem = std.mem; + +const gbp = @import("ziglyph").grapheme_break; +const Trie = @import("trie.zig").Trie; +const Prop = @import("trie.zig").Prop; + +var trie: Trie = undefined; + +pub fn init(allocator: mem.Allocator) !void { + trie = .{ .allocator = allocator, .root = .{} }; + + for ('\u{0}'..'\u{10ffff}') |i| { + const cp: u21 = @intCast(i); + const prop = Prop.forCodePoint(cp); + if (prop == .none) continue; + try trie.put(cp, prop); + } + + const prop = Prop.forCodePoint('\u{10ffff}'); + if (prop == .none) return; + try trie.put('\u{10ffff}', prop); +} + +inline fn getProp(cp: u21) Prop { + return if (trie.get(cp)) |prop| prop else .none; +} + +pub inline fn isControl(cp: u21) bool { + return getProp(cp) == .control; +} + +pub inline fn isExtend(cp: u21) bool { + return getProp(cp) == .extend; +} + +pub inline fn isL(cp: u21) bool { + return getProp(cp) == .hangul_l; +} +pub inline fn isLv(cp: u21) bool { + return getProp(cp) == .hangul_lv; +} +pub inline fn isLvt(cp: u21) bool { + return getProp(cp) == .hangul_lvt; +} +pub inline fn isV(cp: u21) bool { + return getProp(cp) == .hangul_v; +} +pub inline fn isT(cp: u21) bool { + return getProp(cp) == .hangul_t; +} + +pub inline fn isPrepend(cp: u21) bool { + return getProp(cp) == .prepend; +} + +pub inline fn isRegionalIndicator(cp: u21) bool { + return getProp(cp) == .regional; +} + +pub inline fn isSpacingmark(cp: u21) bool { + return getProp(cp) == .spacing; +} + +pub inline fn isZwj(cp: u21) bool { + return getProp(cp) == .zwj; +} diff --git a/src/main.zig b/src/main.zig index b517641..6cc9fe4 100644 --- a/src/main.zig +++ b/src/main.zig @@ -5,8 +5,12 @@ const GraphemeIterator = @import("Grapheme.zig").GraphemeIterator; const input = @embedFile("lang_mix.txt"); pub fn main() !void { + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena.deinit(); + const allocator = arena.allocator(); + var result: usize = 0; - var iter = GraphemeIterator.init(input); + var iter = try GraphemeIterator.init(allocator, input); var timer = try std.time.Timer.start(); diff --git a/src/trie.zig b/src/trie.zig index ee77954..8d2f258 100644 --- a/src/trie.zig +++ b/src/trie.zig @@ -1,11 +1,42 @@ const std = @import("std"); const mem = std.mem; -pub const Color = enum { red, blue }; +const gbp = @import("ziglyph").grapheme_break; + +pub const Prop = enum { + none, + control, + extend, + hangul_l, + hangul_lv, + hangul_lvt, + hangul_v, + hangul_t, + prepend, + regional, + spacing, + zwj, + + pub fn forCodePoint(cp: u21) Prop { + if (gbp.isControl(cp)) return .control; + if (gbp.isExtend(cp)) return .extend; + if (gbp.isL(cp)) return .hangul_l; + if (gbp.isLv(cp)) return .hangul_lv; + if (gbp.isLvt(cp)) return .hangul_lvt; + if (gbp.isT(cp)) return .hangul_t; + if (gbp.isV(cp)) return .hangul_v; + if (gbp.isPrepend(cp)) return .prepend; + if (gbp.isRegionalIndicator(cp)) return .regional; + if (gbp.isSpacingmark(cp)) return .spacing; + if (gbp.isZwj(cp)) return .zwj; + + return .none; + } +}; pub const Node = struct { children: [256]?*Node = [_]?*Node{null} ** 256, - value: ?Color = null, + value: ?Prop = null, }; pub const Trie = struct { @@ -26,7 +57,7 @@ pub const Trie = struct { bytes[0..]; } - pub fn put(self: *Trie, cp: u24, v: Color) !void { + pub fn put(self: *Trie, cp: u24, v: Prop) !void { const s = asBytes(cp); var current: *Node = &self.root; @@ -49,7 +80,7 @@ pub const Trie = struct { } } - pub fn get(self: Trie, cp: u24) ?Color { + pub fn get(self: Trie, cp: u24) ?Prop { const s = asBytes(cp); var current = &self.root; @@ -73,9 +104,9 @@ test "Trie works" { const cp_2: u21 = '\u{10ff}'; const cp_3: u21 = '\u{10}'; - try trie.put(cp_1, .red); - try trie.put(cp_3, .blue); - try std.testing.expectEqual(@as(?Color, .red), trie.get(cp_1)); - try std.testing.expectEqual(@as(?Color, null), trie.get(cp_2)); - try std.testing.expectEqual(@as(?Color, .blue), trie.get(cp_3)); + try trie.put(cp_1, .control); + try trie.put(cp_3, .zwj); + try std.testing.expectEqual(@as(?Prop, .control), trie.get(cp_1)); + try std.testing.expectEqual(@as(?Prop, null), trie.get(cp_2)); + try std.testing.expectEqual(@as(?Prop, .zwj), trie.get(cp_3)); } -- cgit v1.2.3