From bfb31cbc33716220b42bb398471840a4fbed0d89 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Mon, 12 Feb 2024 10:51:34 -0400 Subject: Using Trie super slow --- build.zig | 20 ++++++++--------- src/Grapheme.zig | 7 ++++-- src/gbp.zig | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 6 ++++- src/trie.zig | 49 +++++++++++++++++++++++++++++++++-------- 5 files changed, 127 insertions(+), 22 deletions(-) create mode 100644 src/gbp.zig diff --git a/build.zig b/build.zig index bb0a33b..af6ce10 100644 --- a/build.zig +++ b/build.zig @@ -6,15 +6,15 @@ pub fn build(b: *std.Build) void { const ziglyph = b.dependency("ziglyph", .{}); - const gen_exe = b.addExecutable(.{ - .name = "gen", - .root_source_file = .{ .path = "src/gbp_gen.zig" }, - .target = target, - .optimize = optimize, - }); - gen_exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); - const run_gen_exe = b.addRunArtifact(gen_exe); - const gen_out = run_gen_exe.addOutputFileArg("gbp.zig"); + // const gen_exe = b.addExecutable(.{ + // .name = "gen", + // .root_source_file = .{ .path = "src/gbp_gen.zig" }, + // .target = target, + // .optimize = optimize, + // }); + // gen_exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); + // const run_gen_exe = b.addRunArtifact(gen_exe); + // const gen_out = run_gen_exe.addOutputFileArg("gbp.zig"); const exe = b.addExecutable(.{ .name = "zgbench", @@ -23,7 +23,7 @@ pub fn build(b: *std.Build) void { .optimize = optimize, }); exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); - exe.root_module.addAnonymousImport("gbp", .{ .root_source_file = gen_out }); + // exe.root_module.addAnonymousImport("gbp", .{ .root_source_file = gen_out }); b.installArtifact(exe); const run_cmd = b.addRunArtifact(exe); diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a8a7638..73f6d57 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -1,13 +1,15 @@ //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. const std = @import("std"); +const mem = std.mem; const unicode = std.unicode; const CodePoint = @import("ziglyph").CodePoint; const CodePointIterator = CodePoint.CodePointIterator; const emoji = @import("ziglyph").emoji; -const gbp = @import("gbp"); +// const gbp = @import("gbp"); +const gbp = @import("gbp.zig"); pub const Grapheme = @This(); @@ -32,7 +34,8 @@ pub const GraphemeIterator = struct { const Self = @This(); /// Assumes `src` is valid UTF-8. - pub fn init(str: []const u8) Self { + pub fn init(allocator: mem.Allocator, str: []const u8) !Self { + try gbp.init(allocator); var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; self.buf[1] = self.cp_iter.next(); diff --git a/src/gbp.zig b/src/gbp.zig new file mode 100644 index 0000000..fa4ad54 --- /dev/null +++ b/src/gbp.zig @@ -0,0 +1,67 @@ +const std = @import("std"); +const mem = std.mem; + +const gbp = @import("ziglyph").grapheme_break; +const Trie = @import("trie.zig").Trie; +const Prop = @import("trie.zig").Prop; + +var trie: Trie = undefined; + +pub fn init(allocator: mem.Allocator) !void { + trie = .{ .allocator = allocator, .root = .{} }; + + for ('\u{0}'..'\u{10ffff}') |i| { + const cp: u21 = @intCast(i); + const prop = Prop.forCodePoint(cp); + if (prop == .none) continue; + try trie.put(cp, prop); + } + + const prop = Prop.forCodePoint('\u{10ffff}'); + if (prop == .none) return; + try trie.put('\u{10ffff}', prop); +} + +inline fn getProp(cp: u21) Prop { + return if (trie.get(cp)) |prop| prop else .none; +} + +pub inline fn isControl(cp: u21) bool { + return getProp(cp) == .control; +} + +pub inline fn isExtend(cp: u21) bool { + return getProp(cp) == .extend; +} + +pub inline fn isL(cp: u21) bool { + return getProp(cp) == .hangul_l; +} +pub inline fn isLv(cp: u21) bool { + return getProp(cp) == .hangul_lv; +} +pub inline fn isLvt(cp: u21) bool { + return getProp(cp) == .hangul_lvt; +} +pub inline fn isV(cp: u21) bool { + return getProp(cp) == .hangul_v; +} +pub inline fn isT(cp: u21) bool { + return getProp(cp) == .hangul_t; +} + +pub inline fn isPrepend(cp: u21) bool { + return getProp(cp) == .prepend; +} + +pub inline fn isRegionalIndicator(cp: u21) bool { + return getProp(cp) == .regional; +} + +pub inline fn isSpacingmark(cp: u21) bool { + return getProp(cp) == .spacing; +} + +pub inline fn isZwj(cp: u21) bool { + return getProp(cp) == .zwj; +} diff --git a/src/main.zig b/src/main.zig index b517641..6cc9fe4 100644 --- a/src/main.zig +++ b/src/main.zig @@ -5,8 +5,12 @@ const GraphemeIterator = @import("Grapheme.zig").GraphemeIterator; const input = @embedFile("lang_mix.txt"); pub fn main() !void { + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena.deinit(); + const allocator = arena.allocator(); + var result: usize = 0; - var iter = GraphemeIterator.init(input); + var iter = try GraphemeIterator.init(allocator, input); var timer = try std.time.Timer.start(); diff --git a/src/trie.zig b/src/trie.zig index ee77954..8d2f258 100644 --- a/src/trie.zig +++ b/src/trie.zig @@ -1,11 +1,42 @@ const std = @import("std"); const mem = std.mem; -pub const Color = enum { red, blue }; +const gbp = @import("ziglyph").grapheme_break; + +pub const Prop = enum { + none, + control, + extend, + hangul_l, + hangul_lv, + hangul_lvt, + hangul_v, + hangul_t, + prepend, + regional, + spacing, + zwj, + + pub fn forCodePoint(cp: u21) Prop { + if (gbp.isControl(cp)) return .control; + if (gbp.isExtend(cp)) return .extend; + if (gbp.isL(cp)) return .hangul_l; + if (gbp.isLv(cp)) return .hangul_lv; + if (gbp.isLvt(cp)) return .hangul_lvt; + if (gbp.isT(cp)) return .hangul_t; + if (gbp.isV(cp)) return .hangul_v; + if (gbp.isPrepend(cp)) return .prepend; + if (gbp.isRegionalIndicator(cp)) return .regional; + if (gbp.isSpacingmark(cp)) return .spacing; + if (gbp.isZwj(cp)) return .zwj; + + return .none; + } +}; pub const Node = struct { children: [256]?*Node = [_]?*Node{null} ** 256, - value: ?Color = null, + value: ?Prop = null, }; pub const Trie = struct { @@ -26,7 +57,7 @@ pub const Trie = struct { bytes[0..]; } - pub fn put(self: *Trie, cp: u24, v: Color) !void { + pub fn put(self: *Trie, cp: u24, v: Prop) !void { const s = asBytes(cp); var current: *Node = &self.root; @@ -49,7 +80,7 @@ pub const Trie = struct { } } - pub fn get(self: Trie, cp: u24) ?Color { + pub fn get(self: Trie, cp: u24) ?Prop { const s = asBytes(cp); var current = &self.root; @@ -73,9 +104,9 @@ test "Trie works" { const cp_2: u21 = '\u{10ff}'; const cp_3: u21 = '\u{10}'; - try trie.put(cp_1, .red); - try trie.put(cp_3, .blue); - try std.testing.expectEqual(@as(?Color, .red), trie.get(cp_1)); - try std.testing.expectEqual(@as(?Color, null), trie.get(cp_2)); - try std.testing.expectEqual(@as(?Color, .blue), trie.get(cp_3)); + try trie.put(cp_1, .control); + try trie.put(cp_3, .zwj); + try std.testing.expectEqual(@as(?Prop, .control), trie.get(cp_1)); + try std.testing.expectEqual(@as(?Prop, null), trie.get(cp_2)); + try std.testing.expectEqual(@as(?Prop, .zwj), trie.get(cp_3)); } -- cgit v1.2.3