diff options
| -rw-r--r-- | build.zig | 38 | ||||
| -rw-r--r-- | codegen/canon.zig | 68 | ||||
| -rw-r--r-- | src/Canonical.zig | 45 | ||||
| -rw-r--r-- | src/NormData.zig | 22 | ||||
| -rw-r--r-- | src/Normalizer.zig | 155 | ||||
| -rw-r--r-- | src/main.zig | 11 |
6 files changed, 242 insertions, 97 deletions
| @@ -34,6 +34,15 @@ pub fn build(b: *std.Build) void { | |||
| 34 | const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); | 34 | const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); |
| 35 | 35 | ||
| 36 | // Normalization properties | 36 | // Normalization properties |
| 37 | const canon_gen_exe = b.addExecutable(.{ | ||
| 38 | .name = "canon", | ||
| 39 | .root_source_file = .{ .path = "codegen/canon.zig" }, | ||
| 40 | .target = b.host, | ||
| 41 | .optimize = .Debug, | ||
| 42 | }); | ||
| 43 | const run_canon_gen_exe = b.addRunArtifact(canon_gen_exe); | ||
| 44 | const canon_gen_out = run_canon_gen_exe.addOutputFileArg("canon.bin.z"); | ||
| 45 | |||
| 37 | const ccc_gen_exe = b.addExecutable(.{ | 46 | const ccc_gen_exe = b.addExecutable(.{ |
| 38 | .name = "ccc", | 47 | .name = "ccc", |
| 39 | .root_source_file = .{ .path = "codegen/ccc.zig" }, | 48 | .root_source_file = .{ .path = "codegen/ccc.zig" }, |
| @@ -101,6 +110,21 @@ pub fn build(b: *std.Build) void { | |||
| 101 | }); | 110 | }); |
| 102 | ccc_data.addAnonymousImport("ccc", .{ .root_source_file = ccc_gen_out }); | 111 | ccc_data.addAnonymousImport("ccc", .{ .root_source_file = ccc_gen_out }); |
| 103 | 112 | ||
| 113 | const canon_data = b.createModule(.{ | ||
| 114 | .root_source_file = .{ .path = "src/Canonical.zig" }, | ||
| 115 | .target = target, | ||
| 116 | .optimize = optimize, | ||
| 117 | }); | ||
| 118 | canon_data.addAnonymousImport("canon", .{ .root_source_file = canon_gen_out }); | ||
| 119 | |||
| 120 | const norm_data = b.createModule(.{ | ||
| 121 | .root_source_file = .{ .path = "src/NormData.zig" }, | ||
| 122 | .target = target, | ||
| 123 | .optimize = optimize, | ||
| 124 | }); | ||
| 125 | norm_data.addImport("CanonicalData", canon_data); | ||
| 126 | norm_data.addImport("CombiningClassData", ccc_data); | ||
| 127 | |||
| 104 | const norm = b.addModule("Normalizer", .{ | 128 | const norm = b.addModule("Normalizer", .{ |
| 105 | .root_source_file = .{ .path = "src/Normalizer.zig" }, | 129 | .root_source_file = .{ .path = "src/Normalizer.zig" }, |
| 106 | .target = target, | 130 | .target = target, |
| @@ -108,7 +132,7 @@ pub fn build(b: *std.Build) void { | |||
| 108 | }); | 132 | }); |
| 109 | norm.addImport("code_point", code_point); | 133 | norm.addImport("code_point", code_point); |
| 110 | norm.addImport("ziglyph", ziglyph.module("ziglyph")); | 134 | norm.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 111 | norm.addImport("CombiningClassData", ccc_data); | 135 | norm.addImport("NormData", norm_data); |
| 112 | 136 | ||
| 113 | // Benchmark rig | 137 | // Benchmark rig |
| 114 | const exe = b.addExecutable(.{ | 138 | const exe = b.addExecutable(.{ |
| @@ -134,18 +158,18 @@ pub fn build(b: *std.Build) void { | |||
| 134 | 158 | ||
| 135 | // Tests | 159 | // Tests |
| 136 | const exe_unit_tests = b.addTest(.{ | 160 | const exe_unit_tests = b.addTest(.{ |
| 137 | .root_source_file = .{ .path = "src/DisplayWidth.zig" }, | 161 | .root_source_file = .{ .path = "src/Normalizer.zig" }, |
| 138 | .target = target, | 162 | .target = target, |
| 139 | .optimize = optimize, | 163 | .optimize = optimize, |
| 140 | }); | 164 | }); |
| 141 | exe_unit_tests.root_module.addImport("ascii", ascii); | 165 | // exe_unit_tests.root_module.addImport("ascii", ascii); |
| 142 | exe_unit_tests.root_module.addImport("code_point", code_point); | 166 | exe_unit_tests.root_module.addImport("code_point", code_point); |
| 143 | // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); | 167 | // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); |
| 144 | exe_unit_tests.root_module.addImport("grapheme", grapheme); | 168 | // exe_unit_tests.root_module.addImport("grapheme", grapheme); |
| 145 | // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); | 169 | exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 146 | // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); | 170 | // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); |
| 147 | exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); | 171 | // exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); |
| 148 | // exe_unit_tests.root_module.addImport("CombiningClassData", ccc_data); | 172 | exe_unit_tests.root_module.addImport("NormData", norm_data); |
| 149 | 173 | ||
| 150 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); | 174 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); |
| 151 | 175 | ||
diff --git a/codegen/canon.zig b/codegen/canon.zig new file mode 100644 index 0000000..9d72edd --- /dev/null +++ b/codegen/canon.zig | |||
| @@ -0,0 +1,68 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | |||
| 4 | pub fn main() !void { | ||
| 5 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | ||
| 6 | defer arena.deinit(); | ||
| 7 | const allocator = arena.allocator(); | ||
| 8 | |||
| 9 | // Process UnicodeData.txt | ||
| 10 | var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{}); | ||
| 11 | defer in_file.close(); | ||
| 12 | var in_buf = std.io.bufferedReader(in_file.reader()); | ||
| 13 | const in_reader = in_buf.reader(); | ||
| 14 | |||
| 15 | var args_iter = try std.process.argsWithAllocator(allocator); | ||
| 16 | defer args_iter.deinit(); | ||
| 17 | _ = args_iter.skip(); | ||
| 18 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | ||
| 19 | |||
| 20 | const compressor = std.compress.deflate.compressor; | ||
| 21 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | ||
| 22 | defer out_file.close(); | ||
| 23 | var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); | ||
| 24 | defer out_comp.deinit(); | ||
| 25 | const writer = out_comp.writer(); | ||
| 26 | |||
| 27 | const endian = builtin.cpu.arch.endian(); | ||
| 28 | var line_buf: [4096]u8 = undefined; | ||
| 29 | |||
| 30 | lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { | ||
| 31 | if (line.len == 0) continue; | ||
| 32 | |||
| 33 | var field_iter = std.mem.splitScalar(u8, line, ';'); | ||
| 34 | var cps: [3]u24 = undefined; | ||
| 35 | var len: u8 = 2; | ||
| 36 | |||
| 37 | var i: usize = 0; | ||
| 38 | while (field_iter.next()) |field| : (i += 1) { | ||
| 39 | switch (i) { | ||
| 40 | 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), | ||
| 41 | |||
| 42 | 5 => { | ||
| 43 | // Not canonical. | ||
| 44 | if (field.len == 0 or field[0] == '<') continue :lines; | ||
| 45 | if (std.mem.indexOfScalar(u8, field, ' ')) |space| { | ||
| 46 | // Canonical pair (two code points) | ||
| 47 | len = 3; | ||
| 48 | cps[1] = try std.fmt.parseInt(u24, field[0..space], 16); | ||
| 49 | cps[2] = try std.fmt.parseInt(u24, field[space + 1 ..], 16); | ||
| 50 | } else { | ||
| 51 | // Singleton | ||
| 52 | cps[1] = try std.fmt.parseInt(u24, field, 16); | ||
| 53 | } | ||
| 54 | }, | ||
| 55 | |||
| 56 | 2 => if (line[0] == '<') continue :lines, | ||
| 57 | |||
| 58 | else => {}, | ||
| 59 | } | ||
| 60 | } | ||
| 61 | |||
| 62 | try writer.writeInt(u8, @intCast(len), endian); | ||
| 63 | for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian); | ||
| 64 | } | ||
| 65 | |||
| 66 | try writer.writeInt(u16, 0, endian); | ||
| 67 | try out_comp.flush(); | ||
| 68 | } | ||
diff --git a/src/Canonical.zig b/src/Canonical.zig new file mode 100644 index 0000000..d54e828 --- /dev/null +++ b/src/Canonical.zig | |||
| @@ -0,0 +1,45 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
| 6 | allocator: mem.Allocator, | ||
| 7 | nfd: [][2]u21 = undefined, | ||
| 8 | |||
| 9 | const Self = @This(); | ||
| 10 | |||
| 11 | pub fn init(allocator: mem.Allocator) !Self { | ||
| 12 | const decompressor = compress.deflate.decompressor; | ||
| 13 | const in_bytes = @embedFile("canon"); | ||
| 14 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 15 | var in_decomp = try decompressor(allocator, in_fbs.reader(), null); | ||
| 16 | defer in_decomp.deinit(); | ||
| 17 | var reader = in_decomp.reader(); | ||
| 18 | |||
| 19 | const endian = builtin.cpu.arch.endian(); | ||
| 20 | var self = Self{ | ||
| 21 | .allocator = allocator, | ||
| 22 | .nfd = try allocator.alloc([2]u21, 0x110000), | ||
| 23 | }; | ||
| 24 | |||
| 25 | for (0..0x110000) |i| self.nfd[i] = .{ @intCast(i), 0 }; | ||
| 26 | |||
| 27 | while (true) { | ||
| 28 | const len: u8 = try reader.readInt(u8, endian); | ||
| 29 | if (len == 0) break; | ||
| 30 | const cp = try reader.readInt(u24, endian); | ||
| 31 | self.nfd[cp][0] = @intCast(try reader.readInt(u24, endian)); | ||
| 32 | if (len == 3) self.nfd[cp][1] = @intCast(try reader.readInt(u24, endian)); | ||
| 33 | } | ||
| 34 | |||
| 35 | return self; | ||
| 36 | } | ||
| 37 | |||
| 38 | pub fn deinit(self: *Self) void { | ||
| 39 | self.allocator.free(self.nfd); | ||
| 40 | } | ||
| 41 | |||
| 42 | /// Returns canonical decomposition for `cp`. | ||
| 43 | pub inline fn toNfd(self: Self, cp: u21) [2]u21 { | ||
| 44 | return self.nfd[cp]; | ||
| 45 | } | ||
diff --git a/src/NormData.zig b/src/NormData.zig new file mode 100644 index 0000000..c6fa8e8 --- /dev/null +++ b/src/NormData.zig | |||
| @@ -0,0 +1,22 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const mem = std.mem; | ||
| 3 | |||
| 4 | const CanonData = @import("CanonicalData"); | ||
| 5 | const CccData = @import("CombiningClassData"); | ||
| 6 | |||
| 7 | canon_data: CanonData, | ||
| 8 | ccc_data: CccData, | ||
| 9 | |||
| 10 | const Self = @This(); | ||
| 11 | |||
| 12 | pub fn init(allocator: std.mem.Allocator) !Self { | ||
| 13 | return Self{ | ||
| 14 | .canon_data = try CanonData.init(allocator), | ||
| 15 | .ccc_data = try CccData.init(allocator), | ||
| 16 | }; | ||
| 17 | } | ||
| 18 | |||
| 19 | pub fn deinit(self: *Self) void { | ||
| 20 | self.canon_data.deinit(); | ||
| 21 | self.ccc_data.deinit(); | ||
| 22 | } | ||
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 6a19f47..848cf20 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -3,26 +3,26 @@ | |||
| 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). | 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const testing = std.testing; | ||
| 6 | 7 | ||
| 7 | const CodePointIterator = @import("code_point").Iterator; | 8 | const CodePointIterator = @import("code_point").Iterator; |
| 8 | const case_fold_map = @import("ziglyph").case_folding; | 9 | const case_fold_map = @import("ziglyph").case_folding; |
| 9 | const hangul_map = @import("ziglyph").hangul; | 10 | const hangul_map = @import("ziglyph").hangul; |
| 10 | const norm_props = @import("ziglyph").normalization_props; | 11 | const norm_props = @import("ziglyph").normalization_props; |
| 11 | pub const Data = @import("CombiningClassData"); | ||
| 12 | 12 | ||
| 13 | ccc_data: *Data, | 13 | pub const NormData = @import("NormData"); |
| 14 | |||
| 14 | nfc_map: std.AutoHashMap([2]u21, u21), | 15 | nfc_map: std.AutoHashMap([2]u21, u21), |
| 15 | nfd_map: std.AutoHashMap(u21, [2]u21), | ||
| 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), | 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), |
| 17 | norm_data: *NormData, | ||
| 17 | 18 | ||
| 18 | const Self = @This(); | 19 | const Self = @This(); |
| 19 | 20 | ||
| 20 | pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | 21 | pub fn init(allocator: std.mem.Allocator, norm_data: *NormData) !Self { |
| 21 | var self = Self{ | 22 | var self = Self{ |
| 22 | .ccc_data = data, | ||
| 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), |
| 24 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), | ||
| 25 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | 24 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), |
| 25 | .norm_data = norm_data, | ||
| 26 | }; | 26 | }; |
| 27 | errdefer self.deinit(); | 27 | errdefer self.deinit(); |
| 28 | 28 | ||
| @@ -46,24 +46,6 @@ pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | |||
| 46 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); | 46 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); |
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | // Canonical decompositions | ||
| 50 | const decomp_file = @embedFile("autogen/canonical_decompositions.txt.deflate"); | ||
| 51 | var decomp_stream = std.io.fixedBufferStream(decomp_file); | ||
| 52 | var decomp_decomp = try decompressor(allocator, decomp_stream.reader(), null); | ||
| 53 | defer decomp_decomp.deinit(); | ||
| 54 | |||
| 55 | var decomp_buf = std.io.bufferedReader(decomp_decomp.reader()); | ||
| 56 | const decomp_reader = decomp_buf.reader(); | ||
| 57 | |||
| 58 | while (try decomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 59 | if (line.len == 0) continue; | ||
| 60 | var fields = std.mem.split(u8, line, ";"); | ||
| 61 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 62 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 63 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 64 | try self.nfd_map.put(cp_a, .{ cp_b, cp_c }); | ||
| 65 | } | ||
| 66 | |||
| 67 | // Compatibility decompositions | 49 | // Compatibility decompositions |
| 68 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); | 50 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); |
| 69 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); | 51 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); |
| @@ -92,14 +74,14 @@ pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | |||
| 92 | 74 | ||
| 93 | pub fn deinit(self: *Self) void { | 75 | pub fn deinit(self: *Self) void { |
| 94 | self.nfc_map.deinit(); | 76 | self.nfc_map.deinit(); |
| 95 | self.nfd_map.deinit(); | ||
| 96 | self.nfkd_map.deinit(); | 77 | self.nfkd_map.deinit(); |
| 97 | } | 78 | } |
| 98 | 79 | ||
| 99 | test "init / deinit" { | 80 | test "init / deinit" { |
| 100 | var data = try Data.init(std.testing.allocator); | 81 | const allocator = testing.allocator; |
| 101 | defer data.deinit(); | 82 | var norm_data = try NormData.init(allocator); |
| 102 | var n = try init(std.testing.allocator, &data); | 83 | defer norm_data.deinit(); |
| 84 | var n = try init(allocator, &norm_data); | ||
| 103 | defer n.deinit(); | 85 | defer n.deinit(); |
| 104 | } | 86 | } |
| 105 | 87 | ||
| @@ -169,17 +151,22 @@ const Decomp = struct { | |||
| 169 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { | 151 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { |
| 170 | std.debug.assert(form == .nfd or form == .nfkd); | 152 | std.debug.assert(form == .nfd or form == .nfkd); |
| 171 | 153 | ||
| 172 | var dc = Decomp{ .form = .same }; | 154 | var dc = Decomp{ .form = .nfd }; |
| 173 | dc.cps[0] = cp; | 155 | const canon_dc = self.norm_data.canon_data.toNfd(cp); |
| 156 | const len: usize = if (canon_dc[1] == 0) 1 else 2; | ||
| 157 | |||
| 158 | if (len == 1 and canon_dc[0] == cp) { | ||
| 159 | dc.form = .same; | ||
| 160 | dc.cps[0] = cp; | ||
| 161 | } else { | ||
| 162 | @memcpy(dc.cps[0..len], canon_dc[0..len]); | ||
| 163 | } | ||
| 174 | 164 | ||
| 175 | if (self.nfkd_map.get(cp)) |array| { | 165 | if (self.nfkd_map.get(cp)) |array| { |
| 176 | if (form != .nfd) { | 166 | if (form != .nfd) { |
| 177 | dc.form = .nfkd; | 167 | dc.form = .nfkd; |
| 178 | @memcpy(dc.cps[0..array.len], &array); | 168 | @memcpy(dc.cps[0..array.len], &array); |
| 179 | } | 169 | } |
| 180 | } else if (self.nfd_map.get(cp)) |array| { | ||
| 181 | dc.form = .nfd; | ||
| 182 | @memcpy(dc.cps[0..array.len], &array); | ||
| 183 | } | 170 | } |
| 184 | 171 | ||
| 185 | return dc; | 172 | return dc; |
| @@ -244,10 +231,10 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 244 | } | 231 | } |
| 245 | 232 | ||
| 246 | test "decompose" { | 233 | test "decompose" { |
| 247 | const allocator = std.testing.allocator; | 234 | const allocator = testing.allocator; |
| 248 | var data = try Data.init(allocator); | 235 | var norm_data = try NormData.init(allocator); |
| 249 | defer data.deinit(); | 236 | defer norm_data.deinit(); |
| 250 | var n = try init(allocator, &data); | 237 | var n = try init(allocator, &norm_data); |
| 251 | defer n.deinit(); | 238 | defer n.deinit(); |
| 252 | 239 | ||
| 253 | var dc = n.decompose('é', .nfd); | 240 | var dc = n.decompose('é', .nfd); |
| @@ -314,7 +301,7 @@ pub const Result = struct { | |||
| 314 | 301 | ||
| 315 | // Compares code points by Canonical Combining Class order. | 302 | // Compares code points by Canonical Combining Class order. |
| 316 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { | 303 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { |
| 317 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); | 304 | return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); |
| 318 | } | 305 | } |
| 319 | 306 | ||
| 320 | // Applies the Canonical Sorting Algorithm. | 307 | // Applies the Canonical Sorting Algorithm. |
| @@ -322,7 +309,7 @@ fn canonicalSort(self: Self, cps: []u21) void { | |||
| 322 | var i: usize = 0; | 309 | var i: usize = 0; |
| 323 | while (i < cps.len) : (i += 1) { | 310 | while (i < cps.len) : (i += 1) { |
| 324 | const start: usize = i; | 311 | const start: usize = i; |
| 325 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 312 | while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 326 | std.mem.sort(u21, cps[start..i], self, cccLess); | 313 | std.mem.sort(u21, cps[start..i], self, cccLess); |
| 327 | } | 314 | } |
| 328 | } | 315 | } |
| @@ -368,10 +355,10 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 368 | } | 355 | } |
| 369 | 356 | ||
| 370 | test "nfd ASCII / no-alloc" { | 357 | test "nfd ASCII / no-alloc" { |
| 371 | const allocator = std.testing.allocator; | 358 | const allocator = testing.allocator; |
| 372 | var data = try Data.init(allocator); | 359 | var norm_data = try NormData.init(allocator); |
| 373 | defer data.deinit(); | 360 | defer norm_data.deinit(); |
| 374 | var n = try init(allocator, &data); | 361 | var n = try init(allocator, &norm_data); |
| 375 | defer n.deinit(); | 362 | defer n.deinit(); |
| 376 | 363 | ||
| 377 | var result = try n.nfd(allocator, "Hello World!"); | 364 | var result = try n.nfd(allocator, "Hello World!"); |
| @@ -381,10 +368,10 @@ test "nfd ASCII / no-alloc" { | |||
| 381 | } | 368 | } |
| 382 | 369 | ||
| 383 | test "nfd !ASCII / alloc" { | 370 | test "nfd !ASCII / alloc" { |
| 384 | const allocator = std.testing.allocator; | 371 | const allocator = testing.allocator; |
| 385 | var data = try Data.init(allocator); | 372 | var norm_data = try NormData.init(allocator); |
| 386 | defer data.deinit(); | 373 | defer norm_data.deinit(); |
| 387 | var n = try init(allocator, &data); | 374 | var n = try init(allocator, &norm_data); |
| 388 | defer n.deinit(); | 375 | defer n.deinit(); |
| 389 | 376 | ||
| 390 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 377 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| @@ -394,10 +381,10 @@ test "nfd !ASCII / alloc" { | |||
| 394 | } | 381 | } |
| 395 | 382 | ||
| 396 | test "nfkd ASCII / no-alloc" { | 383 | test "nfkd ASCII / no-alloc" { |
| 397 | const allocator = std.testing.allocator; | 384 | const allocator = testing.allocator; |
| 398 | var data = try Data.init(allocator); | 385 | var norm_data = try NormData.init(allocator); |
| 399 | defer data.deinit(); | 386 | defer norm_data.deinit(); |
| 400 | var n = try init(allocator, &data); | 387 | var n = try init(allocator, &norm_data); |
| 401 | defer n.deinit(); | 388 | defer n.deinit(); |
| 402 | 389 | ||
| 403 | var result = try n.nfkd(allocator, "Hello World!"); | 390 | var result = try n.nfkd(allocator, "Hello World!"); |
| @@ -407,10 +394,10 @@ test "nfkd ASCII / no-alloc" { | |||
| 407 | } | 394 | } |
| 408 | 395 | ||
| 409 | test "nfkd !ASCII / alloc" { | 396 | test "nfkd !ASCII / alloc" { |
| 410 | const allocator = std.testing.allocator; | 397 | const allocator = testing.allocator; |
| 411 | var data = try Data.init(allocator); | 398 | var norm_data = try NormData.init(allocator); |
| 412 | defer data.deinit(); | 399 | defer norm_data.deinit(); |
| 413 | var n = try init(allocator, &data); | 400 | var n = try init(allocator, &norm_data); |
| 414 | defer n.deinit(); | 401 | defer n.deinit(); |
| 415 | 402 | ||
| 416 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 403 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| @@ -426,7 +413,7 @@ fn isHangul(cp: u21) bool { | |||
| 426 | } | 413 | } |
| 427 | 414 | ||
| 428 | fn isNonHangulStarter(self: Self, cp: u21) bool { | 415 | fn isNonHangulStarter(self: Self, cp: u21) bool { |
| 429 | return !isHangul(cp) and self.ccc_data.isStarter(cp); | 416 | return !isHangul(cp) and self.norm_data.ccc_data.isStarter(cp); |
| 430 | } | 417 | } |
| 431 | 418 | ||
| 432 | /// Normalizes `str` to NFC. | 419 | /// Normalizes `str` to NFC. |
| @@ -468,7 +455,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 468 | 455 | ||
| 469 | block_check: while (i < d_list.items.len) : (i += 1) { | 456 | block_check: while (i < d_list.items.len) : (i += 1) { |
| 470 | const C = d_list.items[i]; | 457 | const C = d_list.items[i]; |
| 471 | const cc_C = self.ccc_data.ccc(C); | 458 | const cc_C = self.norm_data.ccc_data.ccc(C); |
| 472 | var starter_index: ?usize = null; | 459 | var starter_index: ?usize = null; |
| 473 | var j: usize = i; | 460 | var j: usize = i; |
| 474 | 461 | ||
| @@ -476,10 +463,10 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 476 | j -= 1; | 463 | j -= 1; |
| 477 | 464 | ||
| 478 | // Check for starter. | 465 | // Check for starter. |
| 479 | if (self.ccc_data.isStarter(d_list.items[j])) { | 466 | if (self.norm_data.ccc_data.isStarter(d_list.items[j])) { |
| 480 | if (i - j > 1) { // If there's distance between the starting point and the current position. | 467 | if (i - j > 1) { // If there's distance between the starting point and the current position. |
| 481 | for (d_list.items[(j + 1)..i]) |B| { | 468 | for (d_list.items[(j + 1)..i]) |B| { |
| 482 | const cc_B = self.ccc_data.ccc(B); | 469 | const cc_B = self.norm_data.ccc_data.ccc(B); |
| 483 | // Check for blocking conditions. | 470 | // Check for blocking conditions. |
| 484 | if (isHangul(C)) { | 471 | if (isHangul(C)) { |
| 485 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; | 472 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; |
| @@ -563,10 +550,10 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 563 | } | 550 | } |
| 564 | 551 | ||
| 565 | test "nfc" { | 552 | test "nfc" { |
| 566 | const allocator = std.testing.allocator; | 553 | const allocator = testing.allocator; |
| 567 | var data = try Data.init(allocator); | 554 | var norm_data = try NormData.init(allocator); |
| 568 | defer data.deinit(); | 555 | defer norm_data.deinit(); |
| 569 | var n = try init(allocator, &data); | 556 | var n = try init(allocator, &norm_data); |
| 570 | defer n.deinit(); | 557 | defer n.deinit(); |
| 571 | 558 | ||
| 572 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 559 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| @@ -576,10 +563,10 @@ test "nfc" { | |||
| 576 | } | 563 | } |
| 577 | 564 | ||
| 578 | test "nfkc" { | 565 | test "nfkc" { |
| 579 | const allocator = std.testing.allocator; | 566 | const allocator = testing.allocator; |
| 580 | var data = try Data.init(allocator); | 567 | var norm_data = try NormData.init(allocator); |
| 581 | defer data.deinit(); | 568 | defer norm_data.deinit(); |
| 582 | var n = try init(allocator, &data); | 569 | var n = try init(allocator, &norm_data); |
| 583 | defer n.deinit(); | 570 | defer n.deinit(); |
| 584 | 571 | ||
| 585 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 572 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| @@ -637,10 +624,10 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u | |||
| 637 | } | 624 | } |
| 638 | 625 | ||
| 639 | test "eql" { | 626 | test "eql" { |
| 640 | const allocator = std.testing.allocator; | 627 | const allocator = testing.allocator; |
| 641 | var data = try Data.init(allocator); | 628 | var norm_data = try NormData.init(allocator); |
| 642 | defer data.deinit(); | 629 | defer norm_data.deinit(); |
| 643 | var n = try init(allocator, &data); | 630 | var n = try init(allocator, &norm_data); |
| 644 | defer n.deinit(); | 631 | defer n.deinit(); |
| 645 | 632 | ||
| 646 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 633 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| @@ -706,10 +693,10 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ | |||
| 706 | } | 693 | } |
| 707 | 694 | ||
| 708 | test "eqlCaseless" { | 695 | test "eqlCaseless" { |
| 709 | const allocator = std.testing.allocator; | 696 | const allocator = testing.allocator; |
| 710 | var data = try Data.init(allocator); | 697 | var norm_data = try NormData.init(allocator); |
| 711 | defer data.deinit(); | 698 | defer norm_data.deinit(); |
| 712 | var n = try init(allocator, &data); | 699 | var n = try init(allocator, &norm_data); |
| 713 | defer n.deinit(); | 700 | defer n.deinit(); |
| 714 | 701 | ||
| 715 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | 702 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -719,7 +706,7 @@ test "eqlCaseless" { | |||
| 719 | // FCD | 706 | // FCD |
| 720 | fn getLeadCcc(self: Self, cp: u21) u8 { | 707 | fn getLeadCcc(self: Self, cp: u21) u8 { |
| 721 | const dc = self.mapping(cp, .nfd); | 708 | const dc = self.mapping(cp, .nfd); |
| 722 | return self.ccc_data.ccc(dc.cps[0]); | 709 | return self.norm_data.ccc_data.ccc(dc.cps[0]); |
| 723 | } | 710 | } |
| 724 | 711 | ||
| 725 | fn getTrailCcc(self: Self, cp: u21) u8 { | 712 | fn getTrailCcc(self: Self, cp: u21) u8 { |
| @@ -727,7 +714,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 { | |||
| 727 | const len = for (dc.cps, 0..) |dcp, i| { | 714 | const len = for (dc.cps, 0..) |dcp, i| { |
| 728 | if (dcp == 0) break i; | 715 | if (dcp == 0) break i; |
| 729 | } else dc.cps.len; | 716 | } else dc.cps.len; |
| 730 | return self.ccc_data.ccc(dc.cps[len - 1]); | 717 | return self.norm_data.ccc_data.ccc(dc.cps[len - 1]); |
| 731 | } | 718 | } |
| 732 | 719 | ||
| 733 | /// Fast check to detect if a string is already in NFC or NFD form. | 720 | /// Fast check to detect if a string is already in NFC or NFD form. |
| @@ -743,10 +730,10 @@ pub fn isFcd(self: Self, str: []const u8) bool { | |||
| 743 | } | 730 | } |
| 744 | 731 | ||
| 745 | test "isFcd" { | 732 | test "isFcd" { |
| 746 | const allocator = std.testing.allocator; | 733 | const allocator = testing.allocator; |
| 747 | var data = try Data.init(allocator); | 734 | var norm_data = try NormData.init(allocator); |
| 748 | defer data.deinit(); | 735 | defer norm_data.deinit(); |
| 749 | var n = try init(allocator, &data); | 736 | var n = try init(allocator, &norm_data); |
| 750 | defer n.deinit(); | 737 | defer n.deinit(); |
| 751 | 738 | ||
| 752 | const is_nfc = "José \u{3D3}"; | 739 | const is_nfc = "José \u{3D3}"; |
| @@ -764,9 +751,9 @@ test "Unicode normalization tests" { | |||
| 764 | defer arena.deinit(); | 751 | defer arena.deinit(); |
| 765 | var allocator = arena.allocator(); | 752 | var allocator = arena.allocator(); |
| 766 | 753 | ||
| 767 | var data = try Data.init(allocator); | 754 | var norm_data = try NormData.init(allocator); |
| 768 | defer data.deinit(); | 755 | defer norm_data.deinit(); |
| 769 | var n = try init(allocator, &data); | 756 | var n = try init(allocator, &norm_data); |
| 770 | defer n.deinit(); | 757 | defer n.deinit(); |
| 771 | 758 | ||
| 772 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 759 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
diff --git a/src/main.zig b/src/main.zig index 57db05b..d1a0bb3 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -17,7 +17,7 @@ const std = @import("std"); | |||
| 17 | // const ascii = std.ascii; | 17 | // const ascii = std.ascii; |
| 18 | 18 | ||
| 19 | // const norm = @import("ziglyph").Normalizer; | 19 | // const norm = @import("ziglyph").Normalizer; |
| 20 | const Data = @import("Normalizer").Data; | 20 | const NormData = @import("Normalizer").NormData; |
| 21 | const norm = @import("Normalizer"); | 21 | const norm = @import("Normalizer"); |
| 22 | 22 | ||
| 23 | pub fn main() !void { | 23 | pub fn main() !void { |
| @@ -32,10 +32,9 @@ pub fn main() !void { | |||
| 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); | 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); |
| 33 | defer allocator.free(input); | 33 | defer allocator.free(input); |
| 34 | 34 | ||
| 35 | var data = try Data.init(allocator); | 35 | var norm_data = try NormData.init(allocator); |
| 36 | defer data.deinit(); | 36 | defer norm_data.deinit(); |
| 37 | 37 | var n = try norm.init(allocator, &norm_data); | |
| 38 | var n = try norm.init(allocator, &data); | ||
| 39 | defer n.deinit(); | 38 | defer n.deinit(); |
| 40 | // var n = try norm.init(allocator); | 39 | // var n = try norm.init(allocator); |
| 41 | // defer n.deinit(); | 40 | // defer n.deinit(); |
| @@ -53,7 +52,7 @@ pub fn main() !void { | |||
| 53 | // while (iter.next()) |_| result += 1; | 52 | // while (iter.next()) |_| result += 1; |
| 54 | // while (iter.next()) |line| result += strWidth(line, &data); | 53 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 55 | while (iter.next()) |line| { | 54 | while (iter.next()) |line| { |
| 56 | var nfc = try n.nfc(allocator, line); | 55 | var nfc = try n.nfd(allocator, line); |
| 57 | result += nfc.slice.len; | 56 | result += nfc.slice.len; |
| 58 | nfc.deinit(); | 57 | nfc.deinit(); |
| 59 | } | 58 | } |