summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-01 20:31:52 -0400
committerGravatar Jose Colon Rodriguez2024-03-01 20:31:52 -0400
commit1ecfd06469ed4c2503034796faf4e7dca4196238 (patch)
treefc95244332b24780306228e12cb22ffd27979d5f /src
parentChanges when case folded check; 20ms faster (diff)
downloadzg-1ecfd06469ed4c2503034796faf4e7dca4196238.tar.gz
zg-1ecfd06469ed4c2503034796faf4e7dca4196238.tar.xz
zg-1ecfd06469ed4c2503034796faf4e7dca4196238.zip
Moved case fold stuff to src/Caser.zig
Diffstat (limited to 'src')
-rw-r--r--src/Caser.zig109
-rw-r--r--src/NormData.zig3
-rw-r--r--src/Normalizer.zig98
-rw-r--r--src/main.zig21
4 files changed, 125 insertions, 106 deletions
diff --git a/src/Caser.zig b/src/Caser.zig
new file mode 100644
index 0000000..d02370a
--- /dev/null
+++ b/src/Caser.zig
@@ -0,0 +1,109 @@
1const std = @import("std");
2const mem = std.mem;
3const testing = std.testing;
4
5const ascii = @import("ascii");
6pub const FoldData = @import("FoldData");
7const Normalizer = @import("Normalizer");
8
9fold_data: *const FoldData,
10
11const Self = @This();
12
/// Builds a case-folded copy of `cps`.
///
/// Each code point is looked up with `fold_data.caseFold`. A non-empty
/// result is appended in place of the code point (it may be more than one
/// code point long); an empty result means the code point is copied through
/// unchanged.
///
/// The returned slice is allocated with `allocator` and owned by the caller.
/// Fails only if an allocation fails.
fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    // After a successful `toOwnedSlice` the list is empty, so this only
    // releases memory on the error path.
    defer folded.deinit();

    for (cps) |cp| {
        const mapping = self.fold_data.caseFold(cp);

        if (mapping.len != 0) {
            try folded.appendSlice(mapping);
            continue;
        }

        // Empty lookup result: keep the original code point.
        try folded.append(cp);
    }

    const owned = try folded.toOwnedSlice();
    return owned;
}
33
/// Reports whether any code point in `cps` is flagged by
/// `fold_data.changesWhenCaseFolded`. Returns `false` for an empty slice.
fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
    for (cps) |cp| {
        // One hit is enough; stop scanning.
        if (self.fold_data.changesWhenCaseFolded(cp)) return true;
    }

    return false;
}
39
/// Produces the code point sequence that `compatCaselessMatch` compares:
/// NFKD(caseFold(NFKD(caseFold(NFD(str))))).
///
/// All intermediate buffers are freed before returning, on both the success
/// and the error path. The returned slice is allocated with `allocator` and
/// owned by the caller.
fn compatCaselessKey(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalizer,
    str: []const u8,
) ![]const u21 {
    const nfd = try normalizer.nfxdCodePoints(allocator, str, .nfd);
    defer allocator.free(nfd);

    // The first fold is skipped (no allocation) when no code point would
    // change; `null` then means "reuse `nfd` as-is".
    const folded_nfd: ?[]const u21 = if (self.changesWhenCaseFolded(nfd))
        try self.caseFold(allocator, nfd)
    else
        null;
    defer if (folded_nfd) |owned| allocator.free(owned);

    const nfkd_once = try normalizer.nfkdCodePoints(allocator, folded_nfd orelse nfd);
    defer allocator.free(nfkd_once);

    const folded_nfkd = try self.caseFold(allocator, nfkd_once);
    defer allocator.free(folded_nfkd);

    return try normalizer.nfkdCodePoints(allocator, folded_nfkd);
}

/// Returns `true` when `a` and `b` are equal after canonical decomposition,
/// case folding and compatibility decomposition, i.e. when
/// NFKD(caseFold(NFKD(caseFold(NFD(a))))) equals the same transform of `b`.
/// NOTE(review): this matches the shape of the Unicode "compatibility
/// caseless match" definition (Unicode Standard, section 3.13) — confirm
/// against the spec if exact conformance matters.
///
/// When both inputs are ASCII-only the comparison is delegated to
/// `std.ascii.eqlIgnoreCase` and nothing is allocated.
///
/// `allocator` is used only for temporary buffers, all of which are released
/// before returning. Errors from the normalizer or from allocation are
/// propagated.
pub fn compatCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalizer,
    a: []const u8,
    b: []const u8,
) !bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const key_a = try self.compatCaselessKey(allocator, normalizer, a);
    defer allocator.free(key_a);

    const key_b = try self.compatCaselessKey(allocator, normalizer, b);
    defer allocator.free(key_b);

    return mem.eql(u21, key_a, key_b);
}
89
// Checks the ASCII fast path plus two non-ASCII pairs that differ in
// composition and in the Greek code points used, all of which are expected
// to match.
test "compatCaselessMatch" {
    const allocator = testing.allocator;

    var norm_data = try Normalizer.NormData.init(allocator);
    defer norm_data.deinit();
    const normalizer = Normalizer{ .norm_data = &norm_data };

    var fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();
    const caser = Self{ .fold_data = &fold_data };

    const Case = struct {
        lhs: []const u8,
        rhs: []const u8,
    };

    const precomposed = "Héllo World! \u{3d3}";
    const cases = [_]Case{
        .{ .lhs = "ascii only!", .rhs = "ASCII Only!" },
        .{ .lhs = precomposed, .rhs = "He\u{301}llo World! \u{3a5}\u{301}" },
        .{ .lhs = precomposed, .rhs = "He\u{301}llo World! \u{3d2}\u{301}" },
    };

    for (cases) |case| {
        try testing.expect(try caser.compatCaselessMatch(allocator, &normalizer, case.lhs, case.rhs));
    }
}
diff --git a/src/NormData.zig b/src/NormData.zig
index 3c2f614..8a7fa49 100644
--- a/src/NormData.zig
+++ b/src/NormData.zig
@@ -13,7 +13,6 @@ ccc_data: CccData,
13compat_data: CompatData, 13compat_data: CompatData,
14hangul_data: HangulData, 14hangul_data: HangulData,
15normp_data: NormPropsData, 15normp_data: NormPropsData,
16fold_data: FoldData,
17 16
18const Self = @This(); 17const Self = @This();
19 18
@@ -22,7 +21,6 @@ pub fn init(allocator: std.mem.Allocator) !Self {
22 .canon_data = try CanonData.init(allocator), 21 .canon_data = try CanonData.init(allocator),
23 .ccc_data = try CccData.init(allocator), 22 .ccc_data = try CccData.init(allocator),
24 .compat_data = try CompatData.init(allocator), 23 .compat_data = try CompatData.init(allocator),
25 .fold_data = try FoldData.init(allocator),
26 .hangul_data = try HangulData.init(allocator), 24 .hangul_data = try HangulData.init(allocator),
27 .normp_data = try NormPropsData.init(allocator), 25 .normp_data = try NormPropsData.init(allocator),
28 }; 26 };
@@ -33,6 +31,5 @@ pub fn deinit(self: *Self) void {
33 self.ccc_data.deinit(); 31 self.ccc_data.deinit();
34 self.compat_data.deinit(); 32 self.compat_data.deinit();
35 self.hangul_data.deinit(); 33 self.hangul_data.deinit();
36 self.fold_data.deinit();
37 self.normp_data.deinit(); 34 self.normp_data.deinit();
38} 35}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 5a26dfa..3ff157c 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -18,7 +18,7 @@ const ascii = @import("ascii");
18const CodePointIterator = @import("code_point").Iterator; 18const CodePointIterator = @import("code_point").Iterator;
19pub const NormData = @import("NormData"); 19pub const NormData = @import("NormData");
20 20
21norm_data: *NormData, 21norm_data: *const NormData,
22 22
23const Self = @This(); 23const Self = @This();
24 24
@@ -255,7 +255,7 @@ pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
255 return self.nfxd(allocator, str, .nfkd); 255 return self.nfxd(allocator, str, .nfkd);
256} 256}
257 257
258fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 { 258pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 {
259 var dcp_list = std.ArrayList(u21).init(allocator); 259 var dcp_list = std.ArrayList(u21).init(allocator);
260 defer dcp_list.deinit(); 260 defer dcp_list.deinit();
261 261
@@ -343,28 +343,7 @@ test "nfkd !ASCII / alloc" {
343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
344} 344}
345 345
346fn caseFold( 346pub fn nfkdCodePoints(
347 self: Self,
348 allocator: mem.Allocator,
349 cps: []const u21,
350) ![]const u21 {
351 var cfcps = std.ArrayList(u21).init(allocator);
352 defer cfcps.deinit();
353
354 for (cps) |cp| {
355 const cf = self.norm_data.fold_data.caseFold(cp);
356
357 if (cf.len == 0) {
358 try cfcps.append(cp);
359 } else {
360 try cfcps.appendSlice(cf);
361 }
362 }
363
364 return try cfcps.toOwnedSlice();
365}
366
367fn nfkdCodePoints(
368 self: Self, 347 self: Self,
369 allocator: mem.Allocator, 348 allocator: mem.Allocator,
370 cps: []const u21, 349 cps: []const u21,
@@ -389,77 +368,6 @@ fn nfkdCodePoints(
389 return try dcp_list.toOwnedSlice(); 368 return try dcp_list.toOwnedSlice();
390} 369}
391 370
392fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
393 return for (cps) |cp| {
394 if (self.norm_data.fold_data.changesWhenCaseFolded(cp)) break true;
395 } else false;
396}
397
398pub fn eqlIgnoreCase(
399 self: Self,
400 allocator: mem.Allocator,
401 a: []const u8,
402 b: []const u8,
403) !bool {
404 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
405
406 // Process a
407 const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd);
408 defer allocator.free(nfd_a);
409
410 var need_frr_cf_nfd_a = false;
411 var cf_nfd_a: []const u21 = nfd_a;
412 if (self.changesWhenCaseFolded(nfd_a)) {
413 cf_nfd_a = try self.caseFold(allocator, nfd_a);
414 need_frr_cf_nfd_a = true;
415 }
416 defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a);
417
418 const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a);
419 defer allocator.free(nfkd_cf_nfd_a);
420 const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a);
421 defer allocator.free(cf_nfkd_cf_nfd_a);
422 const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
423 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
424
425 // Process b
426 const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd);
427 defer allocator.free(nfd_b);
428
429 var need_frr_cf_nfd_b = false;
430 var cf_nfd_b: []const u21 = nfd_b;
431 if (self.changesWhenCaseFolded(nfd_b)) {
432 cf_nfd_b = try self.caseFold(allocator, nfd_b);
433 need_frr_cf_nfd_b = true;
434 }
435 defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b);
436
437 const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b);
438 defer allocator.free(nfkd_cf_nfd_b);
439 const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b);
440 defer allocator.free(cf_nfkd_cf_nfd_b);
441 const nfkd_cf_nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
442 defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
443
444 return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
445}
446
447test "eqlIgnoreCase" {
448 const allocator = testing.allocator;
449 var data = try NormData.init(allocator);
450 defer data.deinit();
451 var n = Self{ .norm_data = &data };
452
453 try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!"));
454
455 const a = "Héllo World! \u{3d3}";
456 const b = "He\u{301}llo World! \u{3a5}\u{301}";
457 try testing.expect(try n.eqlIgnoreCase(allocator, a, b));
458
459 const c = "He\u{301}llo World! \u{3d2}\u{301}";
460 try testing.expect(try n.eqlIgnoreCase(allocator, a, c));
461}
462
463// Composition (NFC, NFKC) 371// Composition (NFC, NFKC)
464 372
465fn isHangul(self: Self, cp: u21) bool { 373fn isHangul(self: Self, cp: u21) bool {
diff --git a/src/main.zig b/src/main.zig
index a5afa66..46e7c9d 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -11,15 +11,16 @@ const std = @import("std");
11// const strWidth = @import("display_width").strWidth; 11// const strWidth = @import("display_width").strWidth;
12 12
13// const CodePointIterator = @import("ziglyph").CodePointIterator; 13// const CodePointIterator = @import("ziglyph").CodePointIterator;
14const CodePointIterator = @import("code_point").Iterator; 14// const CodePointIterator = @import("code_point").Iterator;
15 15
16// const ascii = @import("ascii"); 16// const ascii = @import("ascii");
17// const ascii = std.ascii; 17// const ascii = std.ascii;
18 18
19// const Normalizer = @import("ziglyph").Normalizer; 19// const Normalizer = @import("ziglyph").Normalizer;
20const NormData = @import("Normalizer").NormData;
21const Normalizer = @import("Normalizer"); 20const Normalizer = @import("Normalizer");
22 21
22const Caser = @import("Caser");
23
23// const GenCatData = @import("GenCatData"); 24// const GenCatData = @import("GenCatData");
24 25
25pub fn main() !void { 26pub fn main() !void {
@@ -34,15 +35,19 @@ pub fn main() !void {
34 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); 35 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32));
35 defer allocator.free(input); 36 defer allocator.free(input);
36 37
37 var data = try NormData.init(allocator); 38 var norm_data = try Normalizer.NormData.init(allocator);
38 defer data.deinit(); 39 defer norm_data.deinit();
39 var n = Normalizer{ .norm_data = &data }; 40 var norm = Normalizer{ .norm_data = &norm_data };
40 // var n = try Normalizer.init(allocator); 41 // var norm = try Normalizer.init(allocator);
41 // defer n.deinit(); 42 // defer norm.deinit();
42 43
43 // var gencat_data = try GenCatData.init(allocator); 44 // var gencat_data = try GenCatData.init(allocator);
44 // defer gencat_data.deinit(); 45 // defer gencat_data.deinit();
45 46
47 var fold_data = try Caser.FoldData.init(allocator);
48 defer fold_data.deinit();
49 var caser = Caser{ .fold_data = &fold_data };
50
46 // var iter = GraphemeIterator.init(input, &data); 51 // var iter = GraphemeIterator.init(input, &data);
47 // defer iter.deinit(); 52 // defer iter.deinit();
48 // var iter = CodePointIterator{ .bytes = input }; 53 // var iter = CodePointIterator{ .bytes = input };
@@ -66,7 +71,7 @@ pub fn main() !void {
66 // result += 1; 71 // result += 1;
67 // } 72 // }
68 while (iter.next()) |line| { 73 while (iter.next()) |line| {
69 if (try n.eqlIgnoreCase(allocator, prev_line, line)) { 74 if (try caser.compatCaselessMatch(allocator, &norm, prev_line, line)) {
70 result += line.len; 75 result += line.len;
71 } 76 }
72 prev_line = line; 77 prev_line = line;