diff options
| author | 2024-03-01 20:49:49 -0400 | |
|---|---|---|
| committer | 2024-03-01 20:49:49 -0400 | |
| commit | 68b01d794dcf145fb11603a238c647b7ca998f84 (patch) | |
| tree | 5758ec9f6622105d4df7e2990c0e4708530d44ff | |
| parent | Moved case fold stuff to src/Caser.zig (diff) | |
| download | zg-68b01d794dcf145fb11603a238c647b7ca998f84.tar.gz zg-68b01d794dcf145fb11603a238c647b7ca998f84.tar.xz zg-68b01d794dcf145fb11603a238c647b7ca998f84.zip | |
Added canonical caseless match to Caser
| -rw-r--r-- | src/Caser.zig | 85 | ||||
| -rw-r--r-- | src/Normalizer.zig | 25 | ||||
| -rw-r--r-- | src/main.zig | 2 |
3 files changed, 105 insertions, 7 deletions
diff --git a/src/Caser.zig b/src/Caser.zig index d02370a..43a3a5b 100644 --- a/src/Caser.zig +++ b/src/Caser.zig | |||
| @@ -50,13 +50,13 @@ pub fn compatCaselessMatch( | |||
| 50 | const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); | 50 | const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd); |
| 51 | defer allocator.free(nfd_a); | 51 | defer allocator.free(nfd_a); |
| 52 | 52 | ||
| 53 | var need_frr_cf_nfd_a = false; | 53 | var need_free_cf_nfd_a = false; |
| 54 | var cf_nfd_a: []const u21 = nfd_a; | 54 | var cf_nfd_a: []const u21 = nfd_a; |
| 55 | if (self.changesWhenCaseFolded(nfd_a)) { | 55 | if (self.changesWhenCaseFolded(nfd_a)) { |
| 56 | cf_nfd_a = try self.caseFold(allocator, nfd_a); | 56 | cf_nfd_a = try self.caseFold(allocator, nfd_a); |
| 57 | need_frr_cf_nfd_a = true; | 57 | need_free_cf_nfd_a = true; |
| 58 | } | 58 | } |
| 59 | defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a); | 59 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); |
| 60 | 60 | ||
| 61 | const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a); | 61 | const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a); |
| 62 | defer allocator.free(nfkd_cf_nfd_a); | 62 | defer allocator.free(nfkd_cf_nfd_a); |
| @@ -69,13 +69,13 @@ pub fn compatCaselessMatch( | |||
| 69 | const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); | 69 | const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd); |
| 70 | defer allocator.free(nfd_b); | 70 | defer allocator.free(nfd_b); |
| 71 | 71 | ||
| 72 | var need_frr_cf_nfd_b = false; | 72 | var need_free_cf_nfd_b = false; |
| 73 | var cf_nfd_b: []const u21 = nfd_b; | 73 | var cf_nfd_b: []const u21 = nfd_b; |
| 74 | if (self.changesWhenCaseFolded(nfd_b)) { | 74 | if (self.changesWhenCaseFolded(nfd_b)) { |
| 75 | cf_nfd_b = try self.caseFold(allocator, nfd_b); | 75 | cf_nfd_b = try self.caseFold(allocator, nfd_b); |
| 76 | need_frr_cf_nfd_b = true; | 76 | need_free_cf_nfd_b = true; |
| 77 | } | 77 | } |
| 78 | defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b); | 78 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); |
| 79 | 79 | ||
| 80 | const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b); | 80 | const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b); |
| 81 | defer allocator.free(nfkd_cf_nfd_b); | 81 | defer allocator.free(nfkd_cf_nfd_b); |
| @@ -107,3 +107,76 @@ test "compatCaselessMatch" { | |||
| 107 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | 107 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; |
| 108 | try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); | 108 | try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); |
| 109 | } | 109 | } |
| 110 | |||
/// Reports whether `a` and `b` are a canonical caseless match, i.e. whether
/// `NFD(toCasefold(NFD(a)))` equals `NFD(toCasefold(NFD(b)))`.
///
/// When both inputs are ASCII-only they are compared directly, ignoring case,
/// and `allocator` is not used. Otherwise temporary code point buffers are
/// allocated with `allocator` and released before returning. Errors from
/// normalization or case folding are propagated to the caller.
pub fn canonCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalizer,
    a: []const u8,
    b: []const u8,
) !bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const form_a = try self.canonCaselessForm(allocator, normalizer, a);
    defer allocator.free(form_a);

    const form_b = try self.canonCaselessForm(allocator, normalizer, b);
    defer allocator.free(form_b);

    return mem.eql(u21, form_a, form_b);
}

/// Builds the code point sequence `NFD(toCasefold(NFD(str)))` used by
/// `canonCaselessMatch`. Caller owns the returned slice and must free it
/// with `allocator`.
fn canonCaselessForm(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalizer,
    str: []const u8,
) ![]const u21 {
    const nfd = try normalizer.nfxdCodePoints(allocator, str, .nfd);

    // Nothing folds, so the NFD code points are already the final form;
    // ownership moves to the caller and no second pass is needed.
    if (!self.changesWhenCaseFolded(nfd)) return nfd;
    defer allocator.free(nfd);

    const folded = try self.caseFold(allocator, nfd);
    defer allocator.free(folded);

    // Folding replaced code points, so the sequence must be decomposed and
    // canonically ordered again before it can be compared.
    return try normalizer.nfdCodePoints(allocator, folded);
}
| 162 | |||
// Exercises canonCaselessMatch on the ASCII-only path, on a pair that must
// not match, and on a precomposed/decomposed pair that must match.
test "canonCaselessMatch" {
    const gpa = testing.allocator;

    var fold_data = try FoldData.init(gpa);
    defer fold_data.deinit();
    const caser = Self{ .fold_data = &fold_data };

    var norm_data = try Normalizer.NormData.init(gpa);
    defer norm_data.deinit();
    const normalizer = Normalizer{ .norm_data = &norm_data };

    // Both inputs are ASCII-only.
    const ascii_match = try caser.canonCaselessMatch(gpa, &normalizer, "ascii only!", "ASCII Only!");
    try testing.expect(ascii_match);

    const precomposed = "Héllo World! \u{3d3}";

    // U+03A5 is a different letter from U+03D2, so this pair must not match.
    const other_letter = "He\u{301}llo World! \u{3a5}\u{301}";
    const mismatch = try caser.canonCaselessMatch(gpa, &normalizer, precomposed, other_letter);
    try testing.expect(!mismatch);

    // Same text as `precomposed`, spelled with combining marks.
    const decomposed = "He\u{301}llo World! \u{3d2}\u{301}";
    const canon_match = try caser.canonCaselessMatch(gpa, &normalizer, precomposed, decomposed);
    try testing.expect(canon_match);
}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 3ff157c..b5a54d1 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -343,6 +343,31 @@ test "nfkd !ASCII / alloc" { | |||
| 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); |
| 344 | } | 344 | } |
| 345 | 345 | ||
/// Decomposes every code point in `cps` using the `.nfd` form, then applies
/// `canonicalSort` to the combined result.
///
/// Caller owns the returned slice and must free it with `allocator`.
pub fn nfdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var decomposed = std.ArrayList(u21).init(allocator);
    // On success the buffer is handed to the caller; only clean up on error.
    errdefer decomposed.deinit();

    // Scratch space handed to `decompose`.
    // NOTE(review): 18 is presumably the longest possible decomposition — confirm.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const result = self.decompose(cp, .nfd, &scratch);

        // A code point with no decomposition is copied through unchanged.
        if (result.form == .same) {
            try decomposed.append(cp);
            continue;
        }

        try decomposed.appendSlice(result.cps);
    }

    self.canonicalSort(decomposed.items);

    return try decomposed.toOwnedSlice();
}
| 370 | |||
| 346 | pub fn nfkdCodePoints( | 371 | pub fn nfkdCodePoints( |
| 347 | self: Self, | 372 | self: Self, |
| 348 | allocator: mem.Allocator, | 373 | allocator: mem.Allocator, |
diff --git a/src/main.zig b/src/main.zig index 46e7c9d..f9ce28e 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -71,7 +71,7 @@ pub fn main() !void { | |||
| 71 | // result += 1; | 71 | // result += 1; |
| 72 | // } | 72 | // } |
| 73 | while (iter.next()) |line| { | 73 | while (iter.next()) |line| { |
| 74 | if (try caser.compatCaselessMatch(allocator, &norm, prev_line, line)) { | 74 | if (try caser.canonCaselessMatch(allocator, &norm, prev_line, line)) { |
| 75 | result += line.len; | 75 | result += line.len; |
| 76 | } | 76 | } |
| 77 | prev_line = line; | 77 | prev_line = line; |