From d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Mon, 12 Feb 2024 19:04:50 -0400 Subject: Flat array --- src/Grapheme.zig | 7 +--- src/gbp.zig | 67 --------------------------------- src/gbp_gen.zig | 71 +++++++++-------------------------- src/main.zig | 10 +---- src/trie.zig | 112 ------------------------------------------------------- 5 files changed, 21 insertions(+), 246 deletions(-) delete mode 100644 src/gbp.zig delete mode 100644 src/trie.zig (limited to 'src') diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 73f6d57..a8a7638 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig @@ -1,15 +1,13 @@ //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. const std = @import("std"); -const mem = std.mem; const unicode = std.unicode; const CodePoint = @import("ziglyph").CodePoint; const CodePointIterator = CodePoint.CodePointIterator; const emoji = @import("ziglyph").emoji; -// const gbp = @import("gbp"); -const gbp = @import("gbp.zig"); +const gbp = @import("gbp"); pub const Grapheme = @This(); @@ -34,8 +32,7 @@ pub const GraphemeIterator = struct { const Self = @This(); /// Assumes `src` is valid UTF-8. - pub fn init(allocator: mem.Allocator, str: []const u8) !Self { - try gbp.init(allocator); + pub fn init(str: []const u8) Self { var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; self.buf[1] = self.cp_iter.next(); diff --git a/src/gbp.zig b/src/gbp.zig deleted file mode 100644 index fa4ad54..0000000 --- a/src/gbp.zig +++ /dev/null @@ -1,67 +0,0 @@ -const std = @import("std"); -const mem = std.mem; - -const gbp = @import("ziglyph").grapheme_break; -const Trie = @import("trie.zig").Trie; -const Prop = @import("trie.zig").Prop; - -var trie: Trie = undefined; - -pub fn init(allocator: mem.Allocator) !void { - trie = .{ .allocator = allocator, .root = .{} }; - - for ('\u{0}'..'\u{10ffff}') |i| { - const cp: u21 = @intCast(i); - const prop = Prop.forCodePoint(cp); - if (prop == .none) continue; - try trie.put(cp, prop); - } - - const prop = Prop.forCodePoint('\u{10ffff}'); - if (prop == .none) return; - try trie.put('\u{10ffff}', prop); -} - -inline fn getProp(cp: u21) Prop { - return if (trie.get(cp)) |prop| prop else .none; -} - -pub inline fn isControl(cp: u21) bool { - return getProp(cp) == .control; -} - -pub inline fn isExtend(cp: u21) bool { - return getProp(cp) == .extend; -} - -pub inline fn isL(cp: u21) bool { - return getProp(cp) == .hangul_l; -} -pub inline fn isLv(cp: u21) bool { - return getProp(cp) == .hangul_lv; -} -pub inline fn isLvt(cp: u21) bool { - return getProp(cp) == .hangul_lvt; -} -pub inline fn isV(cp: u21) bool { - return getProp(cp) == .hangul_v; -} -pub inline fn isT(cp: u21) bool { - return getProp(cp) == .hangul_t; -} - -pub inline fn isPrepend(cp: u21) bool { - return getProp(cp) == .prepend; -} - -pub inline fn isRegionalIndicator(cp: u21) bool { - return getProp(cp) == .regional; -} - -pub inline fn isSpacingmark(cp: u21) bool { - return getProp(cp) == .spacing; -} - -pub inline fn isZwj(cp: u21) bool { - return getProp(cp) == .zwj; -} diff --git a/src/gbp_gen.zig b/src/gbp_gen.zig index 578543d..7673931 100644 --- a/src/gbp_gen.zig +++ b/src/gbp_gen.zig @@ -2,26 +2,6 @@ const std = @import("std"); const gbp = @import("ziglyph").grapheme_break; -const Map = struct { - store: [12]Prop = [_]Prop{.none} ** 12, - len: u8 = 0, - - fn getOrPut(self: *Map, prop: Prop) usize { - var index: ?usize = null; - for (0..self.store.len) |i| { - if (self.store[i] == prop) index = i; - } - - if (index) |idx| { - return idx; - } else { - self.store[self.len] = prop; - self.len += 1; - return self.len - 1; - } - } -}; - const Prop = enum { none, @@ -55,22 +35,20 @@ const Prop = enum { }; pub fn main() !void { - var stage_1: [4352]u21 = undefined; - var stage_2: [1_114_112]u4 = undefined; - var stage_3 = Map{}; - - var current_block_offset: u21 = 0; + var a = [_]?Prop{null} ** 1_114_112; - for (0..0x10ffff + 1) |i| { + // for ('\u{0}'..'\u{10ffff}') |i| { + for ('\u{0}'..'\u{10}') |i| { const cp: u21 = @intCast(i); - const stage_1_index = cp >> 8; - const stage_2_index = current_block_offset + (cp & 0xff); - const stage_3_index = stage_3.getOrPut(Prop.forCodePoint(cp)); - stage_1[stage_1_index] = current_block_offset; - stage_2[stage_2_index] = @intCast(stage_3_index); - if (cp & 0xff == 255) current_block_offset += 256; + const prop = Prop.forCodePoint(cp); + if (prop == .none) continue; + a[cp] = prop; } + const cp = '\u{10ffff}'; + const prop = Prop.forCodePoint(cp); + if (prop != .none) a[cp] = prop; + var args_iter = std.process.args(); _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); @@ -101,33 +79,20 @@ pub fn main() !void { try writer.writeAll(prop_code); - try writer.writeAll("const stage_1 = [_]u21{"); - for (stage_1, 0..) |v, i| { + try writer.writeAll("const array = [_]?Prop{"); + for (&a, 0..) |v, i| { if (i != 0) try writer.writeByte(','); - _ = try writer.print("{}", .{v}); - } - try writer.writeAll("};\n"); - - try writer.writeAll("const stage_2 = [_]u4{"); - for (stage_2, 0..) |v, i| { - if (i != 0) try writer.writeByte(','); - _ = try writer.print("{}", .{v}); - } - try writer.writeAll("};\n"); - - try writer.writeAll("const stage_3 = [_]Prop{"); - for (stage_3.store, 0..) |v, i| { - if (i != 0) try writer.writeByte(','); - _ = try writer.print(".{s}", .{@tagName(v)}); + if (v) |p| { + _ = try writer.print(".{s}", .{@tagName(p)}); + } else { + try writer.writeAll("null"); + } } try writer.writeAll("};\n"); const code = \\inline fn getProp(cp: u21) Prop { - \\ const stage_1_index = cp >> 8; - \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); - \\ const stage_3_index = stage_2[stage_2_index]; - \\ return stage_3[stage_3_index]; + \\ return if (array[cp]) |prop| prop else .none; \\} \\ \\pub inline fn isControl(cp: u21) bool { diff --git a/src/main.zig b/src/main.zig index 6cc9fe4..5de7458 100644 --- a/src/main.zig +++ b/src/main.zig @@ -5,12 +5,8 @@ const GraphemeIterator = @import("Grapheme.zig").GraphemeIterator; const input = @embedFile("lang_mix.txt"); pub fn main() !void { - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const allocator = arena.allocator(); - var result: usize = 0; - var iter = try GraphemeIterator.init(allocator, input); + var iter = GraphemeIterator.init(input); var timer = try std.time.Timer.start(); @@ -21,7 +17,3 @@ pub fn main() !void { std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); } - -test { - _ = @import("trie.zig"); -} diff --git a/src/trie.zig b/src/trie.zig deleted file mode 100644 index 8d2f258..0000000 --- a/src/trie.zig +++ /dev/null @@ -1,112 +0,0 @@ -const std = @import("std"); -const mem = std.mem; - -const gbp = @import("ziglyph").grapheme_break; - -pub const Prop = enum { - none, - control, - extend, - hangul_l, - hangul_lv, - hangul_lvt, - hangul_v, - hangul_t, - prepend, - regional, - spacing, - zwj, - - pub fn forCodePoint(cp: u21) Prop { - if (gbp.isControl(cp)) return .control; - if (gbp.isExtend(cp)) return .extend; - if (gbp.isL(cp)) return .hangul_l; - if (gbp.isLv(cp)) return .hangul_lv; - if (gbp.isLvt(cp)) return .hangul_lvt; - if (gbp.isT(cp)) return .hangul_t; - if (gbp.isV(cp)) return .hangul_v; - if (gbp.isPrepend(cp)) return .prepend; - if (gbp.isRegionalIndicator(cp)) return .regional; - if (gbp.isSpacingmark(cp)) return .spacing; - if (gbp.isZwj(cp)) return .zwj; - - return .none; - } -}; - -pub const Node = struct { - children: [256]?*Node = [_]?*Node{null} ** 256, - value: ?Prop = null, -}; - -pub const Trie = struct { - allocator: mem.Allocator, - root: Node, - node_count: usize = 0, - - fn asBytes(cp: u24) []const u8 { - const bytes: [3]u8 = @bitCast(cp); - - return if (bytes[0] < 128) - bytes[0..1] - else if (bytes[1] == 0) - bytes[0..1] - else if (bytes[2] == 0) - bytes[0..2] - else - bytes[0..]; - } - - pub fn put(self: *Trie, cp: u24, v: Prop) !void { - const s = asBytes(cp); - var current: *Node = &self.root; - - for (s, 0..) |b, i| { - if (current.children[b]) |n| { - if (i == s.len - 1) { - n.value = v; - return; - } - - current = n; - continue; - } - - self.node_count += 1; - const new_node = try self.allocator.create(Node); - new_node.* = .{ .value = if (i == s.len - 1) v else null }; - current.children[b] = new_node; - current = new_node; - } - } - - pub fn get(self: Trie, cp: u24) ?Prop { - const s = asBytes(cp); - var current = &self.root; - - return for (s, 0..) |b, i| { - if (current.children[b]) |n| { - if (i == s.len - 1) break if (n.value) |v| v else null; - current = n; - } - } else null; - } -}; - -test "Trie works" { - var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer arena.deinit(); - const allocator = arena.allocator(); - - var trie = Trie{ .allocator = allocator, .root = .{} }; - - const cp_1: u21 = '\u{10ffff}'; - const cp_2: u21 = '\u{10ff}'; - const cp_3: u21 = '\u{10}'; - - try trie.put(cp_1, .control); - try trie.put(cp_3, .zwj); - try std.testing.expectEqual(@as(?Prop, .control), trie.get(cp_1)); - try std.testing.expectEqual(@as(?Prop, null), trie.get(cp_2)); - try std.testing.expectEqual(@as(?Prop, .zwj), trie.get(cp_3)); -} -- cgit v1.2.3