diff options
| author | 2024-02-27 11:16:41 -0400 | |
|---|---|---|
| committer | 2024-02-27 11:16:41 -0400 | |
| commit | f6418d582fc2294983bfa647c7148a263af13db5 (patch) | |
| tree | 8f05af609928553bd91c61c31580d78a0de68736 /src | |
| parent | Removed equality functions from Normalizer (diff) | |
| download | zg-f6418d582fc2294983bfa647c7148a263af13db5.tar.gz zg-f6418d582fc2294983bfa647c7148a263af13db5.tar.xz zg-f6418d582fc2294983bfa647c7148a263af13db5.zip | |
Using NormPropsData in NormData; No Ziglyph deps in Normalizer
Diffstat (limited to 'src')
| -rw-r--r-- | src/NormData.zig | 4 | ||||
| -rw-r--r-- | src/NormPropsData.zig | 53 | ||||
| -rw-r--r-- | src/Normalizer.zig | 9 | ||||
| -rw-r--r-- | src/main.zig | 10 |
4 files changed, 67 insertions, 9 deletions
diff --git a/src/NormData.zig b/src/NormData.zig index 8923382..7c2a09b 100644 --- a/src/NormData.zig +++ b/src/NormData.zig | |||
| @@ -5,11 +5,13 @@ const CanonData = @import("CanonData"); | |||
| 5 | const CccData = @import("CombiningData"); | 5 | const CccData = @import("CombiningData"); |
| 6 | const CompatData = @import("CompatData"); | 6 | const CompatData = @import("CompatData"); |
| 7 | const HangulData = @import("HangulData"); | 7 | const HangulData = @import("HangulData"); |
| 8 | const NormPropsData = @import("NormPropsData"); | ||
| 8 | 9 | ||
| 9 | canon_data: CanonData, | 10 | canon_data: CanonData, |
| 10 | ccc_data: CccData, | 11 | ccc_data: CccData, |
| 11 | compat_data: CompatData, | 12 | compat_data: CompatData, |
| 12 | hangul_data: HangulData, | 13 | hangul_data: HangulData, |
| 14 | normp_data: NormPropsData, | ||
| 13 | 15 | ||
| 14 | const Self = @This(); | 16 | const Self = @This(); |
| 15 | 17 | ||
| @@ -19,6 +21,7 @@ pub fn init(allocator: std.mem.Allocator) !Self { | |||
| 19 | .ccc_data = try CccData.init(allocator), | 21 | .ccc_data = try CccData.init(allocator), |
| 20 | .compat_data = try CompatData.init(allocator), | 22 | .compat_data = try CompatData.init(allocator), |
| 21 | .hangul_data = try HangulData.init(allocator), | 23 | .hangul_data = try HangulData.init(allocator), |
| 24 | .normp_data = try NormPropsData.init(allocator), | ||
| 22 | }; | 25 | }; |
| 23 | } | 26 | } |
| 24 | 27 | ||
| @@ -27,4 +30,5 @@ pub fn deinit(self: *Self) void { | |||
| 27 | self.ccc_data.deinit(); | 30 | self.ccc_data.deinit(); |
| 28 | self.compat_data.deinit(); | 31 | self.compat_data.deinit(); |
| 29 | self.hangul_data.deinit(); | 32 | self.hangul_data.deinit(); |
| 33 | self.normp_data.deinit(); | ||
| 30 | } | 34 | } |
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig new file mode 100644 index 0000000..3c49712 --- /dev/null +++ b/src/NormPropsData.zig | |||
| @@ -0,0 +1,53 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | const testing = std.testing; | ||
| 6 | |||
| 7 | allocator: mem.Allocator, | ||
| 8 | s1: []u16 = undefined, | ||
| 9 | s2: []u4 = undefined, | ||
| 10 | |||
| 11 | const Self = @This(); | ||
| 12 | |||
| 13 | pub fn init(allocator: mem.Allocator) !Self { | ||
| 14 | const decompressor = compress.deflate.decompressor; | ||
| 15 | const in_bytes = @embedFile("normp"); | ||
| 16 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 17 | var in_decomp = try decompressor(allocator, in_fbs.reader(), null); | ||
| 18 | defer in_decomp.deinit(); | ||
| 19 | var reader = in_decomp.reader(); | ||
| 20 | |||
| 21 | const endian = builtin.cpu.arch.endian(); | ||
| 22 | var self = Self{ .allocator = allocator }; | ||
| 23 | |||
| 24 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 25 | self.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 26 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | ||
| 27 | |||
| 28 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 29 | self.s2 = try allocator.alloc(u4, stage_2_len); | ||
| 30 | for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 31 | |||
| 32 | return self; | ||
| 33 | } | ||
| 34 | |||
| 35 | pub fn deinit(self: *Self) void { | ||
| 36 | self.allocator.free(self.s1); | ||
| 37 | self.allocator.free(self.s2); | ||
| 38 | } | ||
| 39 | |||
| 40 | /// Returns true if `cp` is already in NFD form. | ||
| 41 | pub inline fn isNfd(self: Self, cp: u21) bool { | ||
| 42 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; | ||
| 43 | } | ||
| 44 | |||
| 45 | /// Returns true if `cp` is already in NFKD form. | ||
| 46 | pub inline fn isNfkd(self: Self, cp: u21) bool { | ||
| 47 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; | ||
| 48 | } | ||
| 49 | |||
| 50 | /// Returns true if `cp` is not allowed in any normalized form. | ||
| 51 | pub inline fn isFcx(self: Self, cp: u21) bool { | ||
| 52 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | ||
| 53 | } | ||
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index d1d7cee..26177ac 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -6,8 +6,6 @@ const std = @import("std"); | |||
| 6 | const testing = std.testing; | 6 | const testing = std.testing; |
| 7 | 7 | ||
| 8 | const CodePointIterator = @import("code_point").Iterator; | 8 | const CodePointIterator = @import("code_point").Iterator; |
| 9 | const norm_props = @import("ziglyph").normalization_props; | ||
| 10 | |||
| 11 | pub const NormData = @import("NormData"); | 9 | pub const NormData = @import("NormData"); |
| 12 | 10 | ||
| 13 | norm_data: *NormData, | 11 | norm_data: *NormData, |
| @@ -109,7 +107,10 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 109 | var dc = Decomp{ .form = form }; | 107 | var dc = Decomp{ .form = form }; |
| 110 | 108 | ||
| 111 | // ASCII or NFD / NFKD quick checks. | 109 | // ASCII or NFD / NFKD quick checks. |
| 112 | if (cp <= 127 or (form == .nfd and norm_props.isNfd(cp)) or (form == .nfkd and norm_props.isNfkd(cp))) { | 110 | if (cp <= 127 or |
| 111 | (form == .nfd and self.norm_data.normp_data.isNfd(cp)) or | ||
| 112 | (form == .nfkd and self.norm_data.normp_data.isNfkd(cp))) | ||
| 113 | { | ||
| 113 | dc.cps[0] = cp; | 114 | dc.cps[0] = cp; |
| 114 | return dc; | 115 | return dc; |
| 115 | } | 116 | } |
| @@ -436,7 +437,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 436 | if (!processed_hangul) { | 437 | if (!processed_hangul) { |
| 437 | // L -> C not Hangul. | 438 | // L -> C not Hangul. |
| 438 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { | 439 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { |
| 439 | if (!norm_props.isFcx(P)) { | 440 | if (!self.norm_data.normp_data.isFcx(P)) { |
| 440 | d_list.items[sidx] = P; | 441 | d_list.items[sidx] = P; |
| 441 | d_list.items[i] = tombstone; // Mark for deletion. | 442 | d_list.items[i] = tombstone; // Mark for deletion. |
| 442 | deleted += 1; | 443 | deleted += 1; |
diff --git a/src/main.zig b/src/main.zig index 2c2cf8c..15dca16 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -25,9 +25,9 @@ pub fn main() !void { | |||
| 25 | _ = args_iter.skip(); | 25 | _ = args_iter.skip(); |
| 26 | const in_path = args_iter.next() orelse return error.MissingArg; | 26 | const in_path = args_iter.next() orelse return error.MissingArg; |
| 27 | 27 | ||
| 28 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; | 28 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); |
| 29 | defer _ = gpa.deinit(); | 29 | defer arena.deinit(); |
| 30 | const allocator = gpa.allocator(); | 30 | const allocator = arena.allocator(); |
| 31 | 31 | ||
| 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); | 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); |
| 33 | defer allocator.free(input); | 33 | defer allocator.free(input); |
| @@ -51,9 +51,9 @@ pub fn main() !void { | |||
| 51 | // while (iter.next()) |_| result += 1; | 51 | // while (iter.next()) |_| result += 1; |
| 52 | // while (iter.next()) |line| result += strWidth(line, &data); | 52 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 53 | while (iter.next()) |line| { | 53 | while (iter.next()) |line| { |
| 54 | var nfc = try n.nfc(allocator, line); | 54 | const nfc = try n.nfc(allocator, line); |
| 55 | result += nfc.slice.len; | 55 | result += nfc.slice.len; |
| 56 | nfc.deinit(); | 56 | // nfc.deinit(); |
| 57 | } | 57 | } |
| 58 | 58 | ||
| 59 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); | 59 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); |