diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 7 | ||||
| -rw-r--r-- | src/gbp.zig | 67 | ||||
| -rw-r--r-- | src/main.zig | 6 | ||||
| -rw-r--r-- | src/trie.zig | 49 |
4 files changed, 117 insertions, 12 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a8a7638..73f6d57 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -1,13 +1,15 @@ | |||
| 1 | //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 1 | //! `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 2 | 2 | ||
| 3 | const std = @import("std"); | 3 | const std = @import("std"); |
| 4 | const mem = std.mem; | ||
| 4 | const unicode = std.unicode; | 5 | const unicode = std.unicode; |
| 5 | 6 | ||
| 6 | const CodePoint = @import("ziglyph").CodePoint; | 7 | const CodePoint = @import("ziglyph").CodePoint; |
| 7 | const CodePointIterator = CodePoint.CodePointIterator; | 8 | const CodePointIterator = CodePoint.CodePointIterator; |
| 8 | const emoji = @import("ziglyph").emoji; | 9 | const emoji = @import("ziglyph").emoji; |
| 9 | 10 | ||
| 10 | const gbp = @import("gbp"); | 11 | // const gbp = @import("gbp"); |
| 12 | const gbp = @import("gbp.zig"); | ||
| 11 | 13 | ||
| 12 | pub const Grapheme = @This(); | 14 | pub const Grapheme = @This(); |
| 13 | 15 | ||
| @@ -32,7 +34,8 @@ pub const GraphemeIterator = struct { | |||
| 32 | const Self = @This(); | 34 | const Self = @This(); |
| 33 | 35 | ||
| 34 | /// Assumes `src` is valid UTF-8. | 36 | /// Assumes `src` is valid UTF-8. |
| 35 | pub fn init(str: []const u8) Self { | 37 | pub fn init(allocator: mem.Allocator, str: []const u8) !Self { |
| 38 | try gbp.init(allocator); | ||
| 36 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; | 39 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; |
| 37 | self.buf[1] = self.cp_iter.next(); | 40 | self.buf[1] = self.cp_iter.next(); |
| 38 | 41 | ||
diff --git a/src/gbp.zig b/src/gbp.zig new file mode 100644 index 0000000..fa4ad54 --- /dev/null +++ b/src/gbp.zig | |||
| @@ -0,0 +1,67 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const mem = std.mem; | ||
| 3 | |||
| 4 | const gbp = @import("ziglyph").grapheme_break; | ||
| 5 | const Trie = @import("trie.zig").Trie; | ||
| 6 | const Prop = @import("trie.zig").Prop; | ||
| 7 | |||
| 8 | var trie: Trie = undefined; | ||
| 9 | |||
/// Fills the module-level `trie` with the property of every code point from
/// U+0000 through U+10FFFF, skipping code points classified as `.none`.
///
/// `trie` is reassigned to a fresh value on every call; this function does not
/// release whatever it held before. Any error from `trie.put` is returned to
/// the caller.
pub fn init(allocator: mem.Allocator) !void {
    trie = .{ .allocator = allocator, .root = .{} };

    // Range bounds are half-open, so stop one past the last code point.
    const end = '\u{10ffff}' + 1;
    for (0..end) |raw| {
        const cp: u21 = @intCast(raw);
        const prop = Prop.forCodePoint(cp);
        if (prop != .none) try trie.put(cp, prop);
    }
}
| 24 | |||
/// Looks `cp` up in the module-level `trie`, yielding `.none` when the trie
/// has no entry for it.
inline fn getProp(cp: u21) Prop {
    return trie.get(cp) orelse .none;
}
| 28 | |||
/// Reports whether `getProp` classifies `cp` as `.control`.
pub inline fn isControl(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .control;
}
| 32 | |||
/// Reports whether `getProp` classifies `cp` as `.extend`.
pub inline fn isExtend(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .extend;
}
| 36 | |||
/// Reports whether `getProp` classifies `cp` as `.hangul_l`.
pub inline fn isL(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .hangul_l;
}

/// Reports whether `getProp` classifies `cp` as `.hangul_lv`.
pub inline fn isLv(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .hangul_lv;
}

/// Reports whether `getProp` classifies `cp` as `.hangul_lvt`.
pub inline fn isLvt(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .hangul_lvt;
}

/// Reports whether `getProp` classifies `cp` as `.hangul_v`.
pub inline fn isV(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .hangul_v;
}

/// Reports whether `getProp` classifies `cp` as `.hangul_t`.
pub inline fn isT(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .hangul_t;
}
| 52 | |||
/// Reports whether `getProp` classifies `cp` as `.prepend`.
pub inline fn isPrepend(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .prepend;
}
| 56 | |||
/// Reports whether `getProp` classifies `cp` as `.regional`.
pub inline fn isRegionalIndicator(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .regional;
}
| 60 | |||
/// Reports whether `getProp` classifies `cp` as `.spacing`.
pub inline fn isSpacingmark(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .spacing;
}
| 64 | |||
/// Reports whether `getProp` classifies `cp` as `.zwj`.
pub inline fn isZwj(cp: u21) bool {
    const prop = getProp(cp);
    return prop == .zwj;
}
diff --git a/src/main.zig b/src/main.zig index b517641..6cc9fe4 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -5,8 +5,12 @@ const GraphemeIterator = @import("Grapheme.zig").GraphemeIterator; | |||
| 5 | const input = @embedFile("lang_mix.txt"); | 5 | const input = @embedFile("lang_mix.txt"); |
| 6 | 6 | ||
| 7 | pub fn main() !void { | 7 | pub fn main() !void { |
| 8 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | ||
| 9 | defer arena.deinit(); | ||
| 10 | const allocator = arena.allocator(); | ||
| 11 | |||
| 8 | var result: usize = 0; | 12 | var result: usize = 0; |
| 9 | var iter = GraphemeIterator.init(input); | 13 | var iter = try GraphemeIterator.init(allocator, input); |
| 10 | 14 | ||
| 11 | var timer = try std.time.Timer.start(); | 15 | var timer = try std.time.Timer.start(); |
| 12 | 16 | ||
diff --git a/src/trie.zig b/src/trie.zig index ee77954..8d2f258 100644 --- a/src/trie.zig +++ b/src/trie.zig | |||
| @@ -1,11 +1,42 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const mem = std.mem; | 2 | const mem = std.mem; |
| 3 | 3 | ||
| 4 | pub const Color = enum { red, blue }; | 4 | const gbp = @import("ziglyph").grapheme_break; |
| 5 | |||
/// Classification assigned to a code point, derived from the `gbp`
/// predicates. `.none` means no predicate matched.
pub const Prop = enum {
    none,
    control,
    extend,
    hangul_l,
    hangul_lv,
    hangul_lvt,
    hangul_v,
    hangul_t,
    prepend,
    regional,
    spacing,
    zwj,

    /// Classifies `cp` by querying the `gbp` predicates in a fixed order and
    /// returning the tag of the first one that matches, or `.none` when no
    /// predicate matches.
    pub fn forCodePoint(cp: u21) Prop {
        return if (gbp.isControl(cp))
            .control
        else if (gbp.isExtend(cp))
            .extend
        else if (gbp.isL(cp))
            .hangul_l
        else if (gbp.isLv(cp))
            .hangul_lv
        else if (gbp.isLvt(cp))
            .hangul_lvt
        else if (gbp.isT(cp))
            .hangul_t
        else if (gbp.isV(cp))
            .hangul_v
        else if (gbp.isPrepend(cp))
            .prepend
        else if (gbp.isRegionalIndicator(cp))
            .regional
        else if (gbp.isSpacingmark(cp))
            .spacing
        else if (gbp.isZwj(cp))
            .zwj
        else
            .none;
    }
};
| 5 | 36 | ||
/// A single trie node: 256 optional child links plus an optional stored value.
pub const Node = struct {
    /// Child pointers; every slot defaults to `null`.
    children: [256]?*Node = [1]?*Node{null} ** 256,
    /// Property stored at this node; defaults to `null`.
    value: ?Prop = null,
};
| 10 | 41 | ||
| 11 | pub const Trie = struct { | 42 | pub const Trie = struct { |
| @@ -26,7 +57,7 @@ pub const Trie = struct { | |||
| 26 | bytes[0..]; | 57 | bytes[0..]; |
| 27 | } | 58 | } |
| 28 | 59 | ||
| 29 | pub fn put(self: *Trie, cp: u24, v: Color) !void { | 60 | pub fn put(self: *Trie, cp: u24, v: Prop) !void { |
| 30 | const s = asBytes(cp); | 61 | const s = asBytes(cp); |
| 31 | var current: *Node = &self.root; | 62 | var current: *Node = &self.root; |
| 32 | 63 | ||
| @@ -49,7 +80,7 @@ pub const Trie = struct { | |||
| 49 | } | 80 | } |
| 50 | } | 81 | } |
| 51 | 82 | ||
| 52 | pub fn get(self: Trie, cp: u24) ?Color { | 83 | pub fn get(self: Trie, cp: u24) ?Prop { |
| 53 | const s = asBytes(cp); | 84 | const s = asBytes(cp); |
| 54 | var current = &self.root; | 85 | var current = &self.root; |
| 55 | 86 | ||
| @@ -73,9 +104,9 @@ test "Trie works" { | |||
| 73 | const cp_2: u21 = '\u{10ff}'; | 104 | const cp_2: u21 = '\u{10ff}'; |
| 74 | const cp_3: u21 = '\u{10}'; | 105 | const cp_3: u21 = '\u{10}'; |
| 75 | 106 | ||
| 76 | try trie.put(cp_1, .red); | 107 | try trie.put(cp_1, .control); |
| 77 | try trie.put(cp_3, .blue); | 108 | try trie.put(cp_3, .zwj); |
| 78 | try std.testing.expectEqual(@as(?Color, .red), trie.get(cp_1)); | 109 | try std.testing.expectEqual(@as(?Prop, .control), trie.get(cp_1)); |
| 79 | try std.testing.expectEqual(@as(?Color, null), trie.get(cp_2)); | 110 | try std.testing.expectEqual(@as(?Prop, null), trie.get(cp_2)); |
| 80 | try std.testing.expectEqual(@as(?Color, .blue), trie.get(cp_3)); | 111 | try std.testing.expectEqual(@as(?Prop, .zwj), trie.get(cp_3)); |
| 81 | } | 112 | } |