summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
committerGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
commit9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch)
tree723760b45ef8ef604b235d10c3c60edfadd0bb70
parentRemoved dupe tombstone check in Normalizer (diff)
downloadzg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip
Normalizer.eqlIgnoreCase compatibility caseless matching
-rw-r--r--build.zig20
-rw-r--r--codegen/fold.zig76
-rw-r--r--src/FoldData.zig48
-rw-r--r--src/NormData.zig4
-rw-r--r--src/Normalizer.zig103
-rw-r--r--src/main.zig17
6 files changed, 258 insertions, 10 deletions
diff --git a/build.zig b/build.zig
index 09edceb..3d4d1c4 100644
--- a/build.zig
+++ b/build.zig
@@ -88,6 +88,15 @@ pub fn build(b: *std.Build) void {
88 const run_gencat_gen_exe = b.addRunArtifact(gencat_gen_exe); 88 const run_gencat_gen_exe = b.addRunArtifact(gencat_gen_exe);
89 const gencat_gen_out = run_gencat_gen_exe.addOutputFileArg("gencat.bin.z"); 89 const gencat_gen_out = run_gencat_gen_exe.addOutputFileArg("gencat.bin.z");
90 90
91 const fold_gen_exe = b.addExecutable(.{
92 .name = "fold",
93 .root_source_file = .{ .path = "codegen/fold.zig" },
94 .target = b.host,
95 .optimize = .Debug,
96 });
97 const run_fold_gen_exe = b.addRunArtifact(fold_gen_exe);
98 const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.bin.z");
99
91 // Modules we provide 100 // Modules we provide
92 // Code points 101 // Code points
93 const code_point = b.addModule("code_point", .{ 102 const code_point = b.addModule("code_point", .{
@@ -174,6 +183,14 @@ pub fn build(b: *std.Build) void {
174 }); 183 });
175 normp_data.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); 184 normp_data.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out });
176 185
186 // Case folding
187 const fold_data = b.createModule(.{
188 .root_source_file = .{ .path = "src/FoldData.zig" },
189 .target = target,
190 .optimize = optimize,
191 });
192 fold_data.addAnonymousImport("fold", .{ .root_source_file = fold_gen_out });
193
177 const norm_data = b.createModule(.{ 194 const norm_data = b.createModule(.{
178 .root_source_file = .{ .path = "src/NormData.zig" }, 195 .root_source_file = .{ .path = "src/NormData.zig" },
179 .target = target, 196 .target = target,
@@ -184,6 +201,7 @@ pub fn build(b: *std.Build) void {
184 norm_data.addImport("CompatData", compat_data); 201 norm_data.addImport("CompatData", compat_data);
185 norm_data.addImport("HangulData", hangul_data); 202 norm_data.addImport("HangulData", hangul_data);
186 norm_data.addImport("NormPropsData", normp_data); 203 norm_data.addImport("NormPropsData", normp_data);
204 norm_data.addImport("FoldData", fold_data);
187 205
188 const norm = b.addModule("Normalizer", .{ 206 const norm = b.addModule("Normalizer", .{
189 .root_source_file = .{ .path = "src/Normalizer.zig" }, 207 .root_source_file = .{ .path = "src/Normalizer.zig" },
@@ -195,7 +213,7 @@ pub fn build(b: *std.Build) void {
195 norm.addImport("NormData", norm_data); 213 norm.addImport("NormData", norm_data);
196 214
197 // General Category 215 // General Category
198 const gencat_data = b.createModule(.{ 216 const gencat_data = b.addModule("GenCatData", .{
199 .root_source_file = .{ .path = "src/GenCatData.zig" }, 217 .root_source_file = .{ .path = "src/GenCatData.zig" },
200 .target = target, 218 .target = target,
201 .optimize = optimize, 219 .optimize = optimize,
diff --git a/codegen/fold.zig b/codegen/fold.zig
new file mode 100644
index 0000000..7977e61
--- /dev/null
+++ b/codegen/fold.zig
@@ -0,0 +1,76 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const fmt = std.fmt;
4const mem = std.mem;
5
/// Generates the compressed case folding table embedded by `src/FoldData.zig`.
///
/// Reads `data/unicode/CaseFolding.txt` (relative to the current working
/// directory), keeps only the common (`C`) and full (`F`) mappings, and writes
/// them deflate-compressed to the path given as the first command line
/// argument.
///
/// Each record is a `u8` count followed by that many `u24` code points: the
/// source code point first, then the code points it folds to. A zero count
/// terminates the stream.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Process CaseFolding.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    // NOTE(review): this is the byte order of the machine running the
    // generator, while the reader uses the byte order of the build it is
    // compiled into; confirm this is acceptable when cross-compiling.
    const endian = builtin.cpu.arch.endian();
    var line_buf: [4096]u8 = undefined;

    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |raw_line| {
        // Tolerate CRLF line endings so blank lines are still recognized.
        const line = mem.trimRight(u8, raw_line, "\r");
        if (line.len == 0 or line[0] == '#') continue;

        const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        var field_iter = mem.tokenizeSequence(u8, no_comment, "; ");
        // cps[0] is the source code point, cps[1..len] is its mapping.
        var cps: [4]u24 = undefined;
        var len: usize = 0;
        var is_full = false;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    cps[0] = try fmt.parseInt(u24, field, 16);
                    len = 1;
                },

                1 => {
                    // Only common and full folds are kept; simple (S) and
                    // Turkic (T) entries are skipped.
                    is_full = mem.eql(u8, field, "F");
                    if (!is_full and !mem.eql(u8, field, "C")) continue :lines;
                },

                2 => {
                    if (is_full) {
                        // Full case fold: space separated code points.
                        var cp_iter = mem.tokenizeScalar(u8, field, ' ');
                        while (cp_iter.next()) |cp_str| {
                            if (len == cps.len) return error.CaseFoldTooLong;
                            cps[len] = try fmt.parseInt(u24, cp_str, 16);
                            len += 1;
                        }
                    } else {
                        // Common case fold: a single code point.
                        cps[1] = try fmt.parseInt(u24, field, 16);
                        len = 2;
                    }
                },

                else => {},
            }
        }

        // A kept entry must have a source and at least one mapped code point;
        // never emit uninitialized data.
        if (len < 2) return error.InvalidCaseFoldingLine;

        try writer.writeInt(u8, @intCast(len), endian);
        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
    }

    // Zero count marks the end of the table; the reader parses it as a `u8`.
    try writer.writeInt(u8, 0, endian);
    try out_comp.flush();
}
diff --git a/src/FoldData.zig b/src/FoldData.zig
new file mode 100644
index 0000000..139c677
--- /dev/null
+++ b/src/FoldData.zig
@@ -0,0 +1,48 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
6allocator: mem.Allocator,
7fold: [][]u21 = undefined,
8
9const Self = @This();
10
/// Builds the case folding table by decompressing the embedded `fold` data.
///
/// The table has one slot per code point (0x110000 slots). Slots for code
/// points that have no record in the data stay empty slices. Each record is
/// a `u8` count, the source code point, then `count - 1` mapped code points;
/// a zero count ends the stream.
///
/// On success the caller must release the table with `deinit`. On failure
/// everything allocated so far is freed before the error is returned.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("fold");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{
        .allocator = allocator,
        .fold = try allocator.alloc([]u21, 0x110000),
    };

    // Every slot must hold a valid (empty) slice before the cleanup below can
    // run, because `deinit` frees each slot.
    @memset(self.fold, &.{});
    errdefer self.deinit();

    while (true) {
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;
        const cp = try reader.readInt(u24, endian);
        self.fold[cp] = try allocator.alloc(u21, len - 1);
        for (0..len - 1) |i| {
            self.fold[cp][i] = @intCast(try reader.readInt(u24, endian));
        }
    }

    return self;
}
39
/// Frees every per-code-point mapping and then the table itself.
pub fn deinit(self: *Self) void {
    const gpa = self.allocator;
    for (self.fold) |mapping| {
        gpa.free(mapping);
    }
    gpa.free(self.fold);
}
44
/// Looks up the case folding of `cp` in the table.
/// The result is an empty slice when the table holds no mapping for `cp`.
pub inline fn caseFold(self: Self, cp: u21) []const u21 {
    const table = self.fold;
    return table[cp];
}
diff --git a/src/NormData.zig b/src/NormData.zig
index 7c2a09b..3c2f614 100644
--- a/src/NormData.zig
+++ b/src/NormData.zig
@@ -4,6 +4,7 @@ const mem = std.mem;
4const CanonData = @import("CanonData"); 4const CanonData = @import("CanonData");
5const CccData = @import("CombiningData"); 5const CccData = @import("CombiningData");
6const CompatData = @import("CompatData"); 6const CompatData = @import("CompatData");
7const FoldData = @import("FoldData");
7const HangulData = @import("HangulData"); 8const HangulData = @import("HangulData");
8const NormPropsData = @import("NormPropsData"); 9const NormPropsData = @import("NormPropsData");
9 10
@@ -12,6 +13,7 @@ ccc_data: CccData,
12compat_data: CompatData, 13compat_data: CompatData,
13hangul_data: HangulData, 14hangul_data: HangulData,
14normp_data: NormPropsData, 15normp_data: NormPropsData,
16fold_data: FoldData,
15 17
16const Self = @This(); 18const Self = @This();
17 19
@@ -20,6 +22,7 @@ pub fn init(allocator: std.mem.Allocator) !Self {
20 .canon_data = try CanonData.init(allocator), 22 .canon_data = try CanonData.init(allocator),
21 .ccc_data = try CccData.init(allocator), 23 .ccc_data = try CccData.init(allocator),
22 .compat_data = try CompatData.init(allocator), 24 .compat_data = try CompatData.init(allocator),
25 .fold_data = try FoldData.init(allocator),
23 .hangul_data = try HangulData.init(allocator), 26 .hangul_data = try HangulData.init(allocator),
24 .normp_data = try NormPropsData.init(allocator), 27 .normp_data = try NormPropsData.init(allocator),
25 }; 28 };
@@ -30,5 +33,6 @@ pub fn deinit(self: *Self) void {
30 self.ccc_data.deinit(); 33 self.ccc_data.deinit();
31 self.compat_data.deinit(); 34 self.compat_data.deinit();
32 self.hangul_data.deinit(); 35 self.hangul_data.deinit();
36 self.fold_data.deinit();
33 self.normp_data.deinit(); 37 self.normp_data.deinit();
34} 38}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index abe35e5..c68b2ec 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -343,7 +343,102 @@ test "nfkd !ASCII / alloc" {
343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
344} 344}
345 345
/// Case folds `cps` code point by code point.
/// Code points without a fold mapping are copied through unchanged.
/// Caller owns the returned slice and must free it with `allocator`.
fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    defer folded.deinit();

    for (cps) |cp| {
        const mapping = self.norm_data.fold_data.caseFold(cp);

        if (mapping.len != 0) {
            try folded.appendSlice(mapping);
            continue;
        }

        // No mapping: the code point folds to itself.
        try folded.append(cp);
    }

    return try folded.toOwnedSlice();
}
366
/// Decomposes each code point of `cps` with the `.nfkd` form and then applies
/// `canonicalSort` to the combined result.
/// Caller owns the returned slice and must free it with `allocator`.
fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var decomposed = std.ArrayList(u21).init(allocator);
    defer decomposed.deinit();

    // Scratch space handed to `decompose` for a single code point.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const result = self.decompose(cp, .nfkd, &scratch);

        if (result.form != .same) {
            try decomposed.appendSlice(result.cps);
        } else {
            try decomposed.append(cp);
        }
    }

    self.canonicalSort(decomposed.items);

    return try decomposed.toOwnedSlice();
}
391
/// Applies the pipeline NFKD(caseFold(NFKD(caseFold(NFD(str))))) to `str` and
/// returns the resulting code points.
/// Caller owns the returned slice and must free it with `allocator`.
fn compatCaselessCodePoints(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u21 {
    const nfd = try self.nfxdCodePoints(allocator, str, .nfd);
    defer allocator.free(nfd);
    const cf_nfd = try self.caseFold(allocator, nfd);
    defer allocator.free(cf_nfd);
    const nfkd_cf_nfd = try self.nfkdCodePoints(allocator, cf_nfd);
    defer allocator.free(nfkd_cf_nfd);
    const cf_nfkd_cf_nfd = try self.caseFold(allocator, nfkd_cf_nfd);
    defer allocator.free(cf_nfkd_cf_nfd);

    return try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd);
}

/// Tests `a` and `b` for equality ignoring case, using compatibility caseless
/// matching: both inputs are transformed with
/// NFKD(caseFold(NFKD(caseFold(NFD(x))))) and the results compared code point
/// by code point.
///
/// When both inputs are ASCII only, the comparison is delegated to
/// `std.ascii.eqlIgnoreCase` and nothing is allocated. Otherwise `allocator`
/// is used for temporary buffers, all of which are freed before returning.
pub fn eqlIgnoreCase(
    self: Self,
    allocator: mem.Allocator,
    a: []const u8,
    b: []const u8,
) !bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const key_a = try self.compatCaselessCodePoints(allocator, a);
    defer allocator.free(key_a);
    const key_b = try self.compatCaselessCodePoints(allocator, b);
    defer allocator.free(key_b);

    return mem.eql(u21, key_a, key_b);
}
424
test "eqlIgnoreCase" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // ASCII-only inputs.
    try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!"));

    // Precomposed text against two decomposed spellings of the same text.
    const precomposed = "Héllo World! \u{3d3}";
    const decomposed_variants = [_][]const u8{
        "He\u{301}llo World! \u{3a5}\u{301}",
        "He\u{301}llo World! \u{3d2}\u{301}",
    };

    for (decomposed_variants) |variant| {
        try testing.expect(try n.eqlIgnoreCase(allocator, precomposed, variant));
    }
}
440
441// Composition (NFC, NFKC)
347 442
348fn isHangul(self: Self, cp: u21) bool { 443fn isHangul(self: Self, cp: u21) bool {
349 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; 444 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
@@ -504,11 +599,11 @@ test "nfkc" {
504 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); 599 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
505} 600}
506 601
507/// Tests for equality of `a` and `b` after normalizing to NFD. 602/// Tests for equality of `a` and `b` after normalizing to NFC.
508pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { 603pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
509 var norm_result_a = try self.nfd(allocator, a); 604 var norm_result_a = try self.nfc(allocator, a);
510 defer norm_result_a.deinit(); 605 defer norm_result_a.deinit();
511 var norm_result_b = try self.nfd(allocator, b); 606 var norm_result_b = try self.nfc(allocator, b);
512 defer norm_result_b.deinit(); 607 defer norm_result_b.deinit();
513 608
514 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); 609 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
diff --git a/src/main.zig b/src/main.zig
index 59a0fbc..a5afa66 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -49,21 +49,28 @@ pub fn main() !void {
49 var iter = std.mem.splitScalar(u8, input, '\n'); 49 var iter = std.mem.splitScalar(u8, input, '\n');
50 50
51 var result: usize = 0; 51 var result: usize = 0;
52 var prev_line: []const u8 = "";
52 // var result: isize = 0; 53 // var result: isize = 0;
53 var timer = try std.time.Timer.start(); 54 var timer = try std.time.Timer.start();
54 55
55 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 56 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
56 // while (iter.next()) |_| result += 1; 57 // while (iter.next()) |_| result += 1;
57 // while (iter.next()) |line| result += strWidth(line, &data); 58 // while (iter.next()) |line| result += strWidth(line, &data);
58 while (iter.next()) |line| { 59 // while (iter.next()) |line| {
59 const nfc = try n.nfkc(allocator, line); 60 // const nfc = try n.nfkc(allocator, line);
60 result += nfc.slice.len; 61 // result += nfc.slice.len;
61 // nfc.deinit(); 62 // // nfc.deinit();
62 } 63 // }
63 // while (iter.next()) |cp| { 64 // while (iter.next()) |cp| {
64 // if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) }); 65 // if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) });
65 // result += 1; 66 // result += 1;
66 // } 67 // }
68 while (iter.next()) |line| {
69 if (try n.eqlIgnoreCase(allocator, prev_line, line)) {
70 result += line.len;
71 }
72 prev_line = line;
73 }
67 74
68 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 75 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });
69} 76}