diff options
| author | 2024-02-12 19:04:50 -0400 | |
|---|---|---|
| committer | 2024-02-12 19:04:50 -0400 | |
| commit | d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5 (patch) | |
| tree | 7b48d02e3505af142128c484fe76b7aee9091cf9 /src | |
| parent | Using Trie super slow (diff) | |
| download | zg-d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5.tar.gz zg-d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5.tar.xz zg-d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5.zip | |
Flat array
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 7 | ||||
| -rw-r--r-- | src/gbp.zig | 67 | ||||
| -rw-r--r-- | src/gbp_gen.zig | 71 | ||||
| -rw-r--r-- | src/main.zig | 10 | ||||
| -rw-r--r-- | src/trie.zig | 112 |
5 files changed, 21 insertions, 246 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 73f6d57..a8a7638 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -1,15 +1,13 @@ | |||
| 1 | //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 1 | //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 2 | 2 | ||
| 3 | const std = @import("std"); | 3 | const std = @import("std"); |
| 4 | const mem = std.mem; | ||
| 5 | const unicode = std.unicode; | 4 | const unicode = std.unicode; |
| 6 | 5 | ||
| 7 | const CodePoint = @import("ziglyph").CodePoint; | 6 | const CodePoint = @import("ziglyph").CodePoint; |
| 8 | const CodePointIterator = CodePoint.CodePointIterator; | 7 | const CodePointIterator = CodePoint.CodePointIterator; |
| 9 | const emoji = @import("ziglyph").emoji; | 8 | const emoji = @import("ziglyph").emoji; |
| 10 | 9 | ||
| 11 | // const gbp = @import("gbp"); | 10 | const gbp = @import("gbp"); |
| 12 | const gbp = @import("gbp.zig"); | ||
| 13 | 11 | ||
| 14 | pub const Grapheme = @This(); | 12 | pub const Grapheme = @This(); |
| 15 | 13 | ||
| @@ -34,8 +32,7 @@ pub const GraphemeIterator = struct { | |||
| 34 | const Self = @This(); | 32 | const Self = @This(); |
| 35 | 33 | ||
| 36 | /// Assumes `src` is valid UTF-8. | 34 | /// Assumes `src` is valid UTF-8. |
| 37 | pub fn init(allocator: mem.Allocator, str: []const u8) !Self { | 35 | pub fn init(str: []const u8) Self { |
| 38 | try gbp.init(allocator); | ||
| 39 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; | 36 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; |
| 40 | self.buf[1] = self.cp_iter.next(); | 37 | self.buf[1] = self.cp_iter.next(); |
| 41 | 38 | ||
diff --git a/src/gbp.zig b/src/gbp.zig deleted file mode 100644 index fa4ad54..0000000 --- a/src/gbp.zig +++ /dev/null | |||
| @@ -1,67 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const mem = std.mem; | ||
| 3 | |||
| 4 | const gbp = @import("ziglyph").grapheme_break; | ||
| 5 | const Trie = @import("trie.zig").Trie; | ||
| 6 | const Prop = @import("trie.zig").Prop; | ||
| 7 | |||
| 8 | var trie: Trie = undefined; | ||
| 9 | |||
| 10 | pub fn init(allocator: mem.Allocator) !void { | ||
| 11 | trie = .{ .allocator = allocator, .root = .{} }; | ||
| 12 | |||
| 13 | for ('\u{0}'..'\u{10ffff}') |i| { | ||
| 14 | const cp: u21 = @intCast(i); | ||
| 15 | const prop = Prop.forCodePoint(cp); | ||
| 16 | if (prop == .none) continue; | ||
| 17 | try trie.put(cp, prop); | ||
| 18 | } | ||
| 19 | |||
| 20 | const prop = Prop.forCodePoint('\u{10ffff}'); | ||
| 21 | if (prop == .none) return; | ||
| 22 | try trie.put('\u{10ffff}', prop); | ||
| 23 | } | ||
| 24 | |||
| 25 | inline fn getProp(cp: u21) Prop { | ||
| 26 | return if (trie.get(cp)) |prop| prop else .none; | ||
| 27 | } | ||
| 28 | |||
| 29 | pub inline fn isControl(cp: u21) bool { | ||
| 30 | return getProp(cp) == .control; | ||
| 31 | } | ||
| 32 | |||
| 33 | pub inline fn isExtend(cp: u21) bool { | ||
| 34 | return getProp(cp) == .extend; | ||
| 35 | } | ||
| 36 | |||
| 37 | pub inline fn isL(cp: u21) bool { | ||
| 38 | return getProp(cp) == .hangul_l; | ||
| 39 | } | ||
| 40 | pub inline fn isLv(cp: u21) bool { | ||
| 41 | return getProp(cp) == .hangul_lv; | ||
| 42 | } | ||
| 43 | pub inline fn isLvt(cp: u21) bool { | ||
| 44 | return getProp(cp) == .hangul_lvt; | ||
| 45 | } | ||
| 46 | pub inline fn isV(cp: u21) bool { | ||
| 47 | return getProp(cp) == .hangul_v; | ||
| 48 | } | ||
| 49 | pub inline fn isT(cp: u21) bool { | ||
| 50 | return getProp(cp) == .hangul_t; | ||
| 51 | } | ||
| 52 | |||
| 53 | pub inline fn isPrepend(cp: u21) bool { | ||
| 54 | return getProp(cp) == .prepend; | ||
| 55 | } | ||
| 56 | |||
| 57 | pub inline fn isRegionalIndicator(cp: u21) bool { | ||
| 58 | return getProp(cp) == .regional; | ||
| 59 | } | ||
| 60 | |||
| 61 | pub inline fn isSpacingmark(cp: u21) bool { | ||
| 62 | return getProp(cp) == .spacing; | ||
| 63 | } | ||
| 64 | |||
| 65 | pub inline fn isZwj(cp: u21) bool { | ||
| 66 | return getProp(cp) == .zwj; | ||
| 67 | } | ||
diff --git a/src/gbp_gen.zig b/src/gbp_gen.zig index 578543d..7673931 100644 --- a/src/gbp_gen.zig +++ b/src/gbp_gen.zig | |||
| @@ -2,26 +2,6 @@ const std = @import("std"); | |||
| 2 | 2 | ||
| 3 | const gbp = @import("ziglyph").grapheme_break; | 3 | const gbp = @import("ziglyph").grapheme_break; |
| 4 | 4 | ||
| 5 | const Map = struct { | ||
| 6 | store: [12]Prop = [_]Prop{.none} ** 12, | ||
| 7 | len: u8 = 0, | ||
| 8 | |||
| 9 | fn getOrPut(self: *Map, prop: Prop) usize { | ||
| 10 | var index: ?usize = null; | ||
| 11 | for (0..self.store.len) |i| { | ||
| 12 | if (self.store[i] == prop) index = i; | ||
| 13 | } | ||
| 14 | |||
| 15 | if (index) |idx| { | ||
| 16 | return idx; | ||
| 17 | } else { | ||
| 18 | self.store[self.len] = prop; | ||
| 19 | self.len += 1; | ||
| 20 | return self.len - 1; | ||
| 21 | } | ||
| 22 | } | ||
| 23 | }; | ||
| 24 | |||
| 25 | const Prop = enum { | 5 | const Prop = enum { |
| 26 | none, | 6 | none, |
| 27 | 7 | ||
| @@ -55,22 +35,20 @@ const Prop = enum { | |||
| 55 | }; | 35 | }; |
| 56 | 36 | ||
| 57 | pub fn main() !void { | 37 | pub fn main() !void { |
| 58 | var stage_1: [4352]u21 = undefined; | 38 | var a = [_]?Prop{null} ** 1_114_112; |
| 59 | var stage_2: [1_114_112]u4 = undefined; | ||
| 60 | var stage_3 = Map{}; | ||
| 61 | |||
| 62 | var current_block_offset: u21 = 0; | ||
| 63 | 39 | ||
| 64 | for (0..0x10ffff + 1) |i| { | 40 | // for ('\u{0}'..'\u{10ffff}') |i| { |
| 41 | for ('\u{0}'..'\u{10}') |i| { | ||
| 65 | const cp: u21 = @intCast(i); | 42 | const cp: u21 = @intCast(i); |
| 66 | const stage_1_index = cp >> 8; | 43 | const prop = Prop.forCodePoint(cp); |
| 67 | const stage_2_index = current_block_offset + (cp & 0xff); | 44 | if (prop == .none) continue; |
| 68 | const stage_3_index = stage_3.getOrPut(Prop.forCodePoint(cp)); | 45 | a[cp] = prop; |
| 69 | stage_1[stage_1_index] = current_block_offset; | ||
| 70 | stage_2[stage_2_index] = @intCast(stage_3_index); | ||
| 71 | if (cp & 0xff == 255) current_block_offset += 256; | ||
| 72 | } | 46 | } |
| 73 | 47 | ||
| 48 | const cp = '\u{10ffff}'; | ||
| 49 | const prop = Prop.forCodePoint(cp); | ||
| 50 | if (prop != .none) a[cp] = prop; | ||
| 51 | |||
| 74 | var args_iter = std.process.args(); | 52 | var args_iter = std.process.args(); |
| 75 | _ = args_iter.skip(); | 53 | _ = args_iter.skip(); |
| 76 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | 54 | const output_path = args_iter.next() orelse @panic("No output file arg!"); |
| @@ -101,33 +79,20 @@ pub fn main() !void { | |||
| 101 | 79 | ||
| 102 | try writer.writeAll(prop_code); | 80 | try writer.writeAll(prop_code); |
| 103 | 81 | ||
| 104 | try writer.writeAll("const stage_1 = [_]u21{"); | 82 | try writer.writeAll("const array = [_]?Prop{"); |
| 105 | for (stage_1, 0..) |v, i| { | 83 | for (&a, 0..) |v, i| { |
| 106 | if (i != 0) try writer.writeByte(','); | 84 | if (i != 0) try writer.writeByte(','); |
| 107 | _ = try writer.print("{}", .{v}); | 85 | if (v) |p| { |
| 108 | } | 86 | _ = try writer.print(".{s}", .{@tagName(p)}); |
| 109 | try writer.writeAll("};\n"); | 87 | } else { |
| 110 | 88 | try writer.writeAll("null"); | |
| 111 | try writer.writeAll("const stage_2 = [_]u4{"); | 89 | } |
| 112 | for (stage_2, 0..) |v, i| { | ||
| 113 | if (i != 0) try writer.writeByte(','); | ||
| 114 | _ = try writer.print("{}", .{v}); | ||
| 115 | } | ||
| 116 | try writer.writeAll("};\n"); | ||
| 117 | |||
| 118 | try writer.writeAll("const stage_3 = [_]Prop{"); | ||
| 119 | for (stage_3.store, 0..) |v, i| { | ||
| 120 | if (i != 0) try writer.writeByte(','); | ||
| 121 | _ = try writer.print(".{s}", .{@tagName(v)}); | ||
| 122 | } | 90 | } |
| 123 | try writer.writeAll("};\n"); | 91 | try writer.writeAll("};\n"); |
| 124 | 92 | ||
| 125 | const code = | 93 | const code = |
| 126 | \\inline fn getProp(cp: u21) Prop { | 94 | \\inline fn getProp(cp: u21) Prop { |
| 127 | \\ const stage_1_index = cp >> 8; | 95 | \\ return if (array[cp]) |prop| prop else .none; |
| 128 | \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); | ||
| 129 | \\ const stage_3_index = stage_2[stage_2_index]; | ||
| 130 | \\ return stage_3[stage_3_index]; | ||
| 131 | \\} | 96 | \\} |
| 132 | \\ | 97 | \\ |
| 133 | \\pub inline fn isControl(cp: u21) bool { | 98 | \\pub inline fn isControl(cp: u21) bool { |
diff --git a/src/main.zig b/src/main.zig index 6cc9fe4..5de7458 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -5,12 +5,8 @@ const GraphemeIterator = @import("Grapheme.zig").GraphemeIterator; | |||
| 5 | const input = @embedFile("lang_mix.txt"); | 5 | const input = @embedFile("lang_mix.txt"); |
| 6 | 6 | ||
| 7 | pub fn main() !void { | 7 | pub fn main() !void { |
| 8 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | ||
| 9 | defer arena.deinit(); | ||
| 10 | const allocator = arena.allocator(); | ||
| 11 | |||
| 12 | var result: usize = 0; | 8 | var result: usize = 0; |
| 13 | var iter = try GraphemeIterator.init(allocator, input); | 9 | var iter = GraphemeIterator.init(input); |
| 14 | 10 | ||
| 15 | var timer = try std.time.Timer.start(); | 11 | var timer = try std.time.Timer.start(); |
| 16 | 12 | ||
| @@ -21,7 +17,3 @@ pub fn main() !void { | |||
| 21 | 17 | ||
| 22 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); | 18 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); |
| 23 | } | 19 | } |
| 24 | |||
| 25 | test { | ||
| 26 | _ = @import("trie.zig"); | ||
| 27 | } | ||
diff --git a/src/trie.zig b/src/trie.zig deleted file mode 100644 index 8d2f258..0000000 --- a/src/trie.zig +++ /dev/null | |||
| @@ -1,112 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const mem = std.mem; | ||
| 3 | |||
| 4 | const gbp = @import("ziglyph").grapheme_break; | ||
| 5 | |||
| 6 | pub const Prop = enum { | ||
| 7 | none, | ||
| 8 | control, | ||
| 9 | extend, | ||
| 10 | hangul_l, | ||
| 11 | hangul_lv, | ||
| 12 | hangul_lvt, | ||
| 13 | hangul_v, | ||
| 14 | hangul_t, | ||
| 15 | prepend, | ||
| 16 | regional, | ||
| 17 | spacing, | ||
| 18 | zwj, | ||
| 19 | |||
| 20 | pub fn forCodePoint(cp: u21) Prop { | ||
| 21 | if (gbp.isControl(cp)) return .control; | ||
| 22 | if (gbp.isExtend(cp)) return .extend; | ||
| 23 | if (gbp.isL(cp)) return .hangul_l; | ||
| 24 | if (gbp.isLv(cp)) return .hangul_lv; | ||
| 25 | if (gbp.isLvt(cp)) return .hangul_lvt; | ||
| 26 | if (gbp.isT(cp)) return .hangul_t; | ||
| 27 | if (gbp.isV(cp)) return .hangul_v; | ||
| 28 | if (gbp.isPrepend(cp)) return .prepend; | ||
| 29 | if (gbp.isRegionalIndicator(cp)) return .regional; | ||
| 30 | if (gbp.isSpacingmark(cp)) return .spacing; | ||
| 31 | if (gbp.isZwj(cp)) return .zwj; | ||
| 32 | |||
| 33 | return .none; | ||
| 34 | } | ||
| 35 | }; | ||
| 36 | |||
| 37 | pub const Node = struct { | ||
| 38 | children: [256]?*Node = [_]?*Node{null} ** 256, | ||
| 39 | value: ?Prop = null, | ||
| 40 | }; | ||
| 41 | |||
| 42 | pub const Trie = struct { | ||
| 43 | allocator: mem.Allocator, | ||
| 44 | root: Node, | ||
| 45 | node_count: usize = 0, | ||
| 46 | |||
| 47 | fn asBytes(cp: u24) []const u8 { | ||
| 48 | const bytes: [3]u8 = @bitCast(cp); | ||
| 49 | |||
| 50 | return if (bytes[0] < 128) | ||
| 51 | bytes[0..1] | ||
| 52 | else if (bytes[1] == 0) | ||
| 53 | bytes[0..1] | ||
| 54 | else if (bytes[2] == 0) | ||
| 55 | bytes[0..2] | ||
| 56 | else | ||
| 57 | bytes[0..]; | ||
| 58 | } | ||
| 59 | |||
| 60 | pub fn put(self: *Trie, cp: u24, v: Prop) !void { | ||
| 61 | const s = asBytes(cp); | ||
| 62 | var current: *Node = &self.root; | ||
| 63 | |||
| 64 | for (s, 0..) |b, i| { | ||
| 65 | if (current.children[b]) |n| { | ||
| 66 | if (i == s.len - 1) { | ||
| 67 | n.value = v; | ||
| 68 | return; | ||
| 69 | } | ||
| 70 | |||
| 71 | current = n; | ||
| 72 | continue; | ||
| 73 | } | ||
| 74 | |||
| 75 | self.node_count += 1; | ||
| 76 | const new_node = try self.allocator.create(Node); | ||
| 77 | new_node.* = .{ .value = if (i == s.len - 1) v else null }; | ||
| 78 | current.children[b] = new_node; | ||
| 79 | current = new_node; | ||
| 80 | } | ||
| 81 | } | ||
| 82 | |||
| 83 | pub fn get(self: Trie, cp: u24) ?Prop { | ||
| 84 | const s = asBytes(cp); | ||
| 85 | var current = &self.root; | ||
| 86 | |||
| 87 | return for (s, 0..) |b, i| { | ||
| 88 | if (current.children[b]) |n| { | ||
| 89 | if (i == s.len - 1) break if (n.value) |v| v else null; | ||
| 90 | current = n; | ||
| 91 | } | ||
| 92 | } else null; | ||
| 93 | } | ||
| 94 | }; | ||
| 95 | |||
| 96 | test "Trie works" { | ||
| 97 | var arena = std.heap.ArenaAllocator.init(std.testing.allocator); | ||
| 98 | defer arena.deinit(); | ||
| 99 | const allocator = arena.allocator(); | ||
| 100 | |||
| 101 | var trie = Trie{ .allocator = allocator, .root = .{} }; | ||
| 102 | |||
| 103 | const cp_1: u21 = '\u{10ffff}'; | ||
| 104 | const cp_2: u21 = '\u{10ff}'; | ||
| 105 | const cp_3: u21 = '\u{10}'; | ||
| 106 | |||
| 107 | try trie.put(cp_1, .control); | ||
| 108 | try trie.put(cp_3, .zwj); | ||
| 109 | try std.testing.expectEqual(@as(?Prop, .control), trie.get(cp_1)); | ||
| 110 | try std.testing.expectEqual(@as(?Prop, null), trie.get(cp_2)); | ||
| 111 | try std.testing.expectEqual(@as(?Prop, .zwj), trie.get(cp_3)); | ||
| 112 | } | ||