summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-12 19:04:50 -0400
committerGravatar Jose Colon Rodriguez2024-02-12 19:04:50 -0400
commitd2a38e9c2952ec4b003a5ba58f70fc21fcf088c5 (patch)
tree7b48d02e3505af142128c484fe76b7aee9091cf9 /src
parentUsing Trie super slow (diff)
downloadzg-d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5.tar.gz
zg-d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5.tar.xz
zg-d2a38e9c2952ec4b003a5ba58f70fc21fcf088c5.zip
Flat array
Diffstat (limited to 'src')
-rw-r--r--src/Grapheme.zig7
-rw-r--r--src/gbp.zig67
-rw-r--r--src/gbp_gen.zig71
-rw-r--r--src/main.zig10
-rw-r--r--src/trie.zig112
5 files changed, 21 insertions, 246 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 73f6d57..a8a7638 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -1,15 +1,13 @@
1//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 1//! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
2 2
3const std = @import("std"); 3const std = @import("std");
4const mem = std.mem;
5const unicode = std.unicode; 4const unicode = std.unicode;
6 5
7const CodePoint = @import("ziglyph").CodePoint; 6const CodePoint = @import("ziglyph").CodePoint;
8const CodePointIterator = CodePoint.CodePointIterator; 7const CodePointIterator = CodePoint.CodePointIterator;
9const emoji = @import("ziglyph").emoji; 8const emoji = @import("ziglyph").emoji;
10 9
11// const gbp = @import("gbp"); 10const gbp = @import("gbp");
12const gbp = @import("gbp.zig");
13 11
14pub const Grapheme = @This(); 12pub const Grapheme = @This();
15 13
@@ -34,8 +32,7 @@ pub const GraphemeIterator = struct {
34 const Self = @This(); 32 const Self = @This();
35 33
36 /// Assumes `src` is valid UTF-8. 34 /// Assumes `src` is valid UTF-8.
37 pub fn init(allocator: mem.Allocator, str: []const u8) !Self { 35 pub fn init(str: []const u8) Self {
38 try gbp.init(allocator);
39 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; 36 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
40 self.buf[1] = self.cp_iter.next(); 37 self.buf[1] = self.cp_iter.next();
41 38
diff --git a/src/gbp.zig b/src/gbp.zig
deleted file mode 100644
index fa4ad54..0000000
--- a/src/gbp.zig
+++ /dev/null
@@ -1,67 +0,0 @@
1const std = @import("std");
2const mem = std.mem;
3
4const gbp = @import("ziglyph").grapheme_break;
5const Trie = @import("trie.zig").Trie;
6const Prop = @import("trie.zig").Prop;
7
8var trie: Trie = undefined;
9
10pub fn init(allocator: mem.Allocator) !void {
11 trie = .{ .allocator = allocator, .root = .{} };
12
13 for ('\u{0}'..'\u{10ffff}') |i| {
14 const cp: u21 = @intCast(i);
15 const prop = Prop.forCodePoint(cp);
16 if (prop == .none) continue;
17 try trie.put(cp, prop);
18 }
19
20 const prop = Prop.forCodePoint('\u{10ffff}');
21 if (prop == .none) return;
22 try trie.put('\u{10ffff}', prop);
23}
24
25inline fn getProp(cp: u21) Prop {
26 return if (trie.get(cp)) |prop| prop else .none;
27}
28
29pub inline fn isControl(cp: u21) bool {
30 return getProp(cp) == .control;
31}
32
33pub inline fn isExtend(cp: u21) bool {
34 return getProp(cp) == .extend;
35}
36
37pub inline fn isL(cp: u21) bool {
38 return getProp(cp) == .hangul_l;
39}
40pub inline fn isLv(cp: u21) bool {
41 return getProp(cp) == .hangul_lv;
42}
43pub inline fn isLvt(cp: u21) bool {
44 return getProp(cp) == .hangul_lvt;
45}
46pub inline fn isV(cp: u21) bool {
47 return getProp(cp) == .hangul_v;
48}
49pub inline fn isT(cp: u21) bool {
50 return getProp(cp) == .hangul_t;
51}
52
53pub inline fn isPrepend(cp: u21) bool {
54 return getProp(cp) == .prepend;
55}
56
57pub inline fn isRegionalIndicator(cp: u21) bool {
58 return getProp(cp) == .regional;
59}
60
61pub inline fn isSpacingmark(cp: u21) bool {
62 return getProp(cp) == .spacing;
63}
64
65pub inline fn isZwj(cp: u21) bool {
66 return getProp(cp) == .zwj;
67}
diff --git a/src/gbp_gen.zig b/src/gbp_gen.zig
index 578543d..7673931 100644
--- a/src/gbp_gen.zig
+++ b/src/gbp_gen.zig
@@ -2,26 +2,6 @@ const std = @import("std");
2 2
3const gbp = @import("ziglyph").grapheme_break; 3const gbp = @import("ziglyph").grapheme_break;
4 4
5const Map = struct {
6 store: [12]Prop = [_]Prop{.none} ** 12,
7 len: u8 = 0,
8
9 fn getOrPut(self: *Map, prop: Prop) usize {
10 var index: ?usize = null;
11 for (0..self.store.len) |i| {
12 if (self.store[i] == prop) index = i;
13 }
14
15 if (index) |idx| {
16 return idx;
17 } else {
18 self.store[self.len] = prop;
19 self.len += 1;
20 return self.len - 1;
21 }
22 }
23};
24
25const Prop = enum { 5const Prop = enum {
26 none, 6 none,
27 7
@@ -55,22 +35,20 @@ const Prop = enum {
55}; 35};
56 36
57pub fn main() !void { 37pub fn main() !void {
58 var stage_1: [4352]u21 = undefined; 38 var a = [_]?Prop{null} ** 1_114_112;
59 var stage_2: [1_114_112]u4 = undefined;
60 var stage_3 = Map{};
61
62 var current_block_offset: u21 = 0;
63 39
64 for (0..0x10ffff + 1) |i| { 40 // for ('\u{0}'..'\u{10ffff}') |i| {
41 for ('\u{0}'..'\u{10}') |i| {
65 const cp: u21 = @intCast(i); 42 const cp: u21 = @intCast(i);
66 const stage_1_index = cp >> 8; 43 const prop = Prop.forCodePoint(cp);
67 const stage_2_index = current_block_offset + (cp & 0xff); 44 if (prop == .none) continue;
68 const stage_3_index = stage_3.getOrPut(Prop.forCodePoint(cp)); 45 a[cp] = prop;
69 stage_1[stage_1_index] = current_block_offset;
70 stage_2[stage_2_index] = @intCast(stage_3_index);
71 if (cp & 0xff == 255) current_block_offset += 256;
72 } 46 }
73 47
48 const cp = '\u{10ffff}';
49 const prop = Prop.forCodePoint(cp);
50 if (prop != .none) a[cp] = prop;
51
74 var args_iter = std.process.args(); 52 var args_iter = std.process.args();
75 _ = args_iter.skip(); 53 _ = args_iter.skip();
76 const output_path = args_iter.next() orelse @panic("No output file arg!"); 54 const output_path = args_iter.next() orelse @panic("No output file arg!");
@@ -101,33 +79,20 @@ pub fn main() !void {
101 79
102 try writer.writeAll(prop_code); 80 try writer.writeAll(prop_code);
103 81
104 try writer.writeAll("const stage_1 = [_]u21{"); 82 try writer.writeAll("const array = [_]?Prop{");
105 for (stage_1, 0..) |v, i| { 83 for (&a, 0..) |v, i| {
106 if (i != 0) try writer.writeByte(','); 84 if (i != 0) try writer.writeByte(',');
107 _ = try writer.print("{}", .{v}); 85 if (v) |p| {
108 } 86 _ = try writer.print(".{s}", .{@tagName(p)});
109 try writer.writeAll("};\n"); 87 } else {
110 88 try writer.writeAll("null");
111 try writer.writeAll("const stage_2 = [_]u4{"); 89 }
112 for (stage_2, 0..) |v, i| {
113 if (i != 0) try writer.writeByte(',');
114 _ = try writer.print("{}", .{v});
115 }
116 try writer.writeAll("};\n");
117
118 try writer.writeAll("const stage_3 = [_]Prop{");
119 for (stage_3.store, 0..) |v, i| {
120 if (i != 0) try writer.writeByte(',');
121 _ = try writer.print(".{s}", .{@tagName(v)});
122 } 90 }
123 try writer.writeAll("};\n"); 91 try writer.writeAll("};\n");
124 92
125 const code = 93 const code =
126 \\inline fn getProp(cp: u21) Prop { 94 \\inline fn getProp(cp: u21) Prop {
127 \\ const stage_1_index = cp >> 8; 95 \\ return if (array[cp]) |prop| prop else .none;
128 \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff);
129 \\ const stage_3_index = stage_2[stage_2_index];
130 \\ return stage_3[stage_3_index];
131 \\} 96 \\}
132 \\ 97 \\
133 \\pub inline fn isControl(cp: u21) bool { 98 \\pub inline fn isControl(cp: u21) bool {
diff --git a/src/main.zig b/src/main.zig
index 6cc9fe4..5de7458 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -5,12 +5,8 @@ const GraphemeIterator = @import("Grapheme.zig").GraphemeIterator;
5const input = @embedFile("lang_mix.txt"); 5const input = @embedFile("lang_mix.txt");
6 6
7pub fn main() !void { 7pub fn main() !void {
8 var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
9 defer arena.deinit();
10 const allocator = arena.allocator();
11
12 var result: usize = 0; 8 var result: usize = 0;
13 var iter = try GraphemeIterator.init(allocator, input); 9 var iter = GraphemeIterator.init(input);
14 10
15 var timer = try std.time.Timer.start(); 11 var timer = try std.time.Timer.start();
16 12
@@ -21,7 +17,3 @@ pub fn main() !void {
21 17
22 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 18 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });
23} 19}
24
25test {
26 _ = @import("trie.zig");
27}
diff --git a/src/trie.zig b/src/trie.zig
deleted file mode 100644
index 8d2f258..0000000
--- a/src/trie.zig
+++ /dev/null
@@ -1,112 +0,0 @@
1const std = @import("std");
2const mem = std.mem;
3
4const gbp = @import("ziglyph").grapheme_break;
5
6pub const Prop = enum {
7 none,
8 control,
9 extend,
10 hangul_l,
11 hangul_lv,
12 hangul_lvt,
13 hangul_v,
14 hangul_t,
15 prepend,
16 regional,
17 spacing,
18 zwj,
19
20 pub fn forCodePoint(cp: u21) Prop {
21 if (gbp.isControl(cp)) return .control;
22 if (gbp.isExtend(cp)) return .extend;
23 if (gbp.isL(cp)) return .hangul_l;
24 if (gbp.isLv(cp)) return .hangul_lv;
25 if (gbp.isLvt(cp)) return .hangul_lvt;
26 if (gbp.isT(cp)) return .hangul_t;
27 if (gbp.isV(cp)) return .hangul_v;
28 if (gbp.isPrepend(cp)) return .prepend;
29 if (gbp.isRegionalIndicator(cp)) return .regional;
30 if (gbp.isSpacingmark(cp)) return .spacing;
31 if (gbp.isZwj(cp)) return .zwj;
32
33 return .none;
34 }
35};
36
37pub const Node = struct {
38 children: [256]?*Node = [_]?*Node{null} ** 256,
39 value: ?Prop = null,
40};
41
42pub const Trie = struct {
43 allocator: mem.Allocator,
44 root: Node,
45 node_count: usize = 0,
46
47 fn asBytes(cp: u24) []const u8 {
48 const bytes: [3]u8 = @bitCast(cp);
49
50 return if (bytes[0] < 128)
51 bytes[0..1]
52 else if (bytes[1] == 0)
53 bytes[0..1]
54 else if (bytes[2] == 0)
55 bytes[0..2]
56 else
57 bytes[0..];
58 }
59
60 pub fn put(self: *Trie, cp: u24, v: Prop) !void {
61 const s = asBytes(cp);
62 var current: *Node = &self.root;
63
64 for (s, 0..) |b, i| {
65 if (current.children[b]) |n| {
66 if (i == s.len - 1) {
67 n.value = v;
68 return;
69 }
70
71 current = n;
72 continue;
73 }
74
75 self.node_count += 1;
76 const new_node = try self.allocator.create(Node);
77 new_node.* = .{ .value = if (i == s.len - 1) v else null };
78 current.children[b] = new_node;
79 current = new_node;
80 }
81 }
82
83 pub fn get(self: Trie, cp: u24) ?Prop {
84 const s = asBytes(cp);
85 var current = &self.root;
86
87 return for (s, 0..) |b, i| {
88 if (current.children[b]) |n| {
89 if (i == s.len - 1) break if (n.value) |v| v else null;
90 current = n;
91 }
92 } else null;
93 }
94};
95
96test "Trie works" {
97 var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
98 defer arena.deinit();
99 const allocator = arena.allocator();
100
101 var trie = Trie{ .allocator = allocator, .root = .{} };
102
103 const cp_1: u21 = '\u{10ffff}';
104 const cp_2: u21 = '\u{10ff}';
105 const cp_3: u21 = '\u{10}';
106
107 try trie.put(cp_1, .control);
108 try trie.put(cp_3, .zwj);
109 try std.testing.expectEqual(@as(?Prop, .control), trie.get(cp_1));
110 try std.testing.expectEqual(@as(?Prop, null), trie.get(cp_2));
111 try std.testing.expectEqual(@as(?Prop, .zwj), trie.get(cp_3));
112}