summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/GenCatData.zig83
-rw-r--r--src/HangulData.zig8
-rw-r--r--src/main.zig33
3 files changed, 108 insertions, 16 deletions
diff --git a/src/GenCatData.zig b/src/GenCatData.zig
new file mode 100644
index 0000000..5496e4e
--- /dev/null
+++ b/src/GenCatData.zig
@@ -0,0 +1,83 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
/// Unicode General Category values, named by their two-letter abbreviations.
///
/// The declaration order is significant: `gc` converts a stored integer to a
/// field with `@enumFromInt`, so the first field is 0, the second 1, and so on.
/// NOTE(review): presumably this order must match the generator that produced
/// the embedded `gencat` data; confirm before reordering or inserting fields.
pub const Gc = enum {
    // Other
    /// Control
    Cc,
    /// Format
    Cf,
    /// Unassigned
    Cn,
    /// Private use
    Co,
    /// Surrogate
    Cs,

    // Letter
    /// Lowercase letter
    Ll,
    /// Modifier letter
    Lm,
    /// Other letter
    Lo,
    /// Titlecase letter
    Lt,
    /// Uppercase letter
    Lu,

    // Mark
    /// Spacing mark
    Mc,
    /// Enclosing mark
    Me,
    /// Nonspacing mark
    Mn,

    // Number
    /// Decimal number
    Nd,
    /// Letter number
    Nl,
    /// Other number
    No,

    // Punctuation
    /// Connector punctuation
    Pc,
    /// Dash punctuation
    Pd,
    /// Close punctuation
    Pe,
    /// Final punctuation
    Pf,
    /// Initial punctuation
    Pi,
    /// Other punctuation
    Po,
    /// Open punctuation
    Ps,

    // Symbol
    /// Currency symbol
    Sc,
    /// Modifier symbol
    Sk,
    /// Math symbol
    Sm,
    /// Other symbol
    So,

    // Separator
    /// Line separator
    Zl,
    /// Paragraph separator
    Zp,
    /// Space separator
    Zs,
};
39
/// Allocator that `init` uses to allocate the stage tables and that `deinit`
/// uses to free them.
allocator: mem.Allocator,
/// Stage 1: indexed by `cp >> 8`; each entry is a base offset into `s2`.
s1: []u16 = undefined,
/// Stage 2: indexed by an `s1` entry plus `cp & 0xff`; each entry is an
/// index into `s3`.
s2: []u5 = undefined,
/// Stage 3: each entry is the integer value of a `Gc` field.
s3: []u5 = undefined,

/// The struct type defined by this file.
const Self = @This();
46
/// Decompresses the embedded `gencat` data and loads the three lookup stages.
///
/// Every stage is allocated with `allocator`. On success the caller owns the
/// returned value and must release it with `deinit`. If decompression, a read,
/// or an allocation fails, the stages allocated so far are freed before the
/// error is returned, so nothing leaks on the error path.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("gencat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    // Integers are read in the byte order of the target CPU.
    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    // Stage 1: a u16 length followed by that many u16 entries.
    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    // Stage 2: a u16 length followed by that many bytes, each narrowed to u5.
    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u5, s2_len);
    errdefer allocator.free(self.s2);
    for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));

    // Stage 3: the length is read as a single byte, unlike stages 1 and 2.
    // NOTE(review): confirm the generator really writes a u8 length here.
    const s3_len: u16 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u5, s3_len);
    errdefer allocator.free(self.s3);
    for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian));

    return self;
}
73
/// Frees the three stage tables with the allocator stored in `self`.
pub fn deinit(self: *Self) void {
    const allocator = self.allocator;
    allocator.free(self.s1);
    allocator.free(self.s2);
    allocator.free(self.s3);
}
79
/// Lookup the General Category for `cp`.
///
/// The high bits of `cp` (`cp >> 8`) pick a base offset from `s1`, the low
/// byte is added to it to index `s2`, and the `s2` entry indexes `s3`, whose
/// value is converted to a `Gc` field.
pub inline fn gc(self: Self, cp: u21) Gc {
    const block_base = self.s1[cp >> 8];
    const low_byte = cp & 0xff;
    const category_index = self.s2[block_base + low_byte];
    const category: u5 = self.s3[category_index];
    return @enumFromInt(category);
}
diff --git a/src/HangulData.zig b/src/HangulData.zig
index 4d80c99..b97424c 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -15,7 +15,7 @@ pub const Syllable = enum {
15 15
16allocator: mem.Allocator, 16allocator: mem.Allocator,
17s1: []u16 = undefined, 17s1: []u16 = undefined,
18s2: []Syllable = undefined, 18s2: []u3 = undefined,
19 19
20const Self = @This(); 20const Self = @This();
21 21
@@ -35,8 +35,8 @@ pub fn init(allocator: mem.Allocator) !Self {
35 for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); 35 for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
36 36
37 const stage_2_len: u16 = try reader.readInt(u16, endian); 37 const stage_2_len: u16 = try reader.readInt(u16, endian);
38 self.s2 = try allocator.alloc(Syllable, stage_2_len); 38 self.s2 = try allocator.alloc(u3, stage_2_len);
39 for (0..stage_2_len) |i| self.s2[i] = @enumFromInt(try reader.readInt(u8, endian)); 39 for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));
40 40
41 return self; 41 return self;
42} 42}
@@ -48,5 +48,5 @@ pub fn deinit(self: *Self) void {
48 48
49/// Returns the Hangul syllable type for `cp`. 49/// Returns the Hangul syllable type for `cp`.
50pub inline fn syllable(self: Self, cp: u21) Syllable { 50pub inline fn syllable(self: Self, cp: u21) Syllable {
51 return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; 51 return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]);
52} 52}
diff --git a/src/main.zig b/src/main.zig
index 0f1aab5..c521c4f 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -11,14 +11,16 @@ const std = @import("std");
11// const strWidth = @import("display_width").strWidth; 11// const strWidth = @import("display_width").strWidth;
12 12
13// const CodePointIterator = @import("ziglyph").CodePointIterator; 13// const CodePointIterator = @import("ziglyph").CodePointIterator;
14// const CodePointIterator = @import("code_point").Iterator; 14const CodePointIterator = @import("code_point").Iterator;
15 15
16// const ascii = @import("ascii"); 16// const ascii = @import("ascii");
17// const ascii = std.ascii; 17// const ascii = std.ascii;
18 18
19// const Normalizer = @import("ziglyph").Normalizer; 19// const Normalizer = @import("ziglyph").Normalizer;
20const NormData = @import("Normalizer").NormData; 20// const NormData = @import("Normalizer").NormData;
21const Normalizer = @import("Normalizer"); 21// const Normalizer = @import("Normalizer");
22
23const GenCatData = @import("GenCatData");
22 24
23pub fn main() !void { 25pub fn main() !void {
24 var args_iter = std.process.args(); 26 var args_iter = std.process.args();
@@ -32,16 +34,19 @@ pub fn main() !void {
32 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); 34 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32));
33 defer allocator.free(input); 35 defer allocator.free(input);
34 36
35 var data = try NormData.init(allocator); 37 // var data = try NormData.init(allocator);
36 defer data.deinit(); 38 // defer data.deinit();
37 var n = Normalizer{ .norm_data = &data }; 39 // var n = Normalizer{ .norm_data = &data };
38 // var n = try Normalizer.init(allocator); 40 // var n = try Normalizer.init(allocator);
39 // defer n.deinit(); 41 // defer n.deinit();
40 42
43 var gencat_data = try GenCatData.init(allocator);
44 defer gencat_data.deinit();
45
41 // var iter = GraphemeIterator.init(input, &data); 46 // var iter = GraphemeIterator.init(input, &data);
42 // defer iter.deinit(); 47 // defer iter.deinit();
43 // var iter = CodePointIterator{ .bytes = input }; 48 var iter = CodePointIterator{ .bytes = input };
44 var iter = std.mem.splitScalar(u8, input, '\n'); 49 // var iter = std.mem.splitScalar(u8, input, '\n');
45 50
46 var result: usize = 0; 51 var result: usize = 0;
47 // var result: isize = 0; 52 // var result: isize = 0;
@@ -50,10 +55,14 @@ pub fn main() !void {
50 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 55 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
51 // while (iter.next()) |_| result += 1; 56 // while (iter.next()) |_| result += 1;
52 // while (iter.next()) |line| result += strWidth(line, &data); 57 // while (iter.next()) |line| result += strWidth(line, &data);
53 while (iter.next()) |line| { 58 // while (iter.next()) |line| {
54 const nfc = try n.nfc(allocator, line); 59 // const nfc = try n.nfc(allocator, line);
55 result += nfc.slice.len; 60 // result += nfc.slice.len;
56 // nfc.deinit(); 61 // // nfc.deinit();
62 // }
63 while (iter.next()) |cp| {
64 if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) });
65 result += 1;
57 } 66 }
58 67
59 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 68 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });