diff options
| author | 2024-03-01 18:51:43 -0400 | |
|---|---|---|
| committer | 2024-03-01 18:51:43 -0400 | |
| commit | 9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch) | |
| tree | 723760b45ef8ef604b235d10c3c60edfadd0bb70 /src/Normalizer.zig | |
| parent | Removed dupe tombstone check in Normalizer (diff) | |
| download | zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip | |
Normalizer.eqlIgnoreCase compatibility caseless matching
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 103 |
1 file changed, 99 insertions, 4 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index abe35e5..c68b2ec 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -343,7 +343,102 @@ test "nfkd !ASCII / alloc" { | |||
| 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); |
| 344 | } | 344 | } |
| 345 | 345 | ||
| 346 | // Composition utilities. | 346 | fn caseFold( |
| 347 | self: Self, | ||
| 348 | allocator: mem.Allocator, | ||
| 349 | cps: []const u21, | ||
| 350 | ) ![]const u21 { | ||
| 351 | var cfcps = std.ArrayList(u21).init(allocator); | ||
| 352 | defer cfcps.deinit(); | ||
| 353 | |||
| 354 | for (cps) |cp| { | ||
| 355 | const cf = self.norm_data.fold_data.caseFold(cp); | ||
| 356 | |||
| 357 | if (cf.len == 0) { | ||
| 358 | try cfcps.append(cp); | ||
| 359 | } else { | ||
| 360 | try cfcps.appendSlice(cf); | ||
| 361 | } | ||
| 362 | } | ||
| 363 | |||
| 364 | return try cfcps.toOwnedSlice(); | ||
| 365 | } | ||
| 366 | |||
| 367 | fn nfkdCodePoints( | ||
| 368 | self: Self, | ||
| 369 | allocator: mem.Allocator, | ||
| 370 | cps: []const u21, | ||
| 371 | ) ![]u21 { | ||
| 372 | var dcp_list = std.ArrayList(u21).init(allocator); | ||
| 373 | defer dcp_list.deinit(); | ||
| 374 | |||
| 375 | var dc_buf: [18]u21 = undefined; | ||
| 376 | |||
| 377 | for (cps) |cp| { | ||
| 378 | const dc = self.decompose(cp, .nfkd, &dc_buf); | ||
| 379 | |||
| 380 | if (dc.form == .same) { | ||
| 381 | try dcp_list.append(cp); | ||
| 382 | } else { | ||
| 383 | try dcp_list.appendSlice(dc.cps); | ||
| 384 | } | ||
| 385 | } | ||
| 386 | |||
| 387 | self.canonicalSort(dcp_list.items); | ||
| 388 | |||
| 389 | return try dcp_list.toOwnedSlice(); | ||
| 390 | } | ||
| 391 | |||
| 392 | pub fn eqlIgnoreCase( | ||
| 393 | self: Self, | ||
| 394 | allocator: mem.Allocator, | ||
| 395 | a: []const u8, | ||
| 396 | b: []const u8, | ||
| 397 | ) !bool { | ||
| 398 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | ||
| 399 | |||
| 400 | const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd); | ||
| 401 | defer allocator.free(nfd_a); | ||
| 402 | const cf_nfd_a = try self.caseFold(allocator, nfd_a); | ||
| 403 | defer allocator.free(cf_nfd_a); | ||
| 404 | const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a); | ||
| 405 | defer allocator.free(nfkd_cf_nfd_a); | ||
| 406 | const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); | ||
| 407 | defer allocator.free(cf_nfkd_cf_nfd_a); | ||
| 408 | const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); | ||
| 409 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); | ||
| 410 | |||
| 411 | const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd); | ||
| 412 | defer allocator.free(nfd_b); | ||
| 413 | const cf_nfd_b = try self.caseFold(allocator, nfd_b); | ||
| 414 | defer allocator.free(cf_nfd_b); | ||
| 415 | const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b); | ||
| 416 | defer allocator.free(nfkd_cf_nfd_b); | ||
| 417 | const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); | ||
| 418 | defer allocator.free(cf_nfkd_cf_nfd_b); | ||
| 419 | const nfkd_cf_nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); | ||
| 420 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); | ||
| 421 | |||
| 422 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); | ||
| 423 | } | ||
| 424 | |||
| 425 | test "eqlIgnoreCase" { | ||
| 426 | const allocator = testing.allocator; | ||
| 427 | var data = try NormData.init(allocator); | ||
| 428 | defer data.deinit(); | ||
| 429 | var n = Self{ .norm_data = &data }; | ||
| 430 | |||
| 431 | try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!")); | ||
| 432 | |||
| 433 | const a = "Héllo World! \u{3d3}"; | ||
| 434 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | ||
| 435 | try testing.expect(try n.eqlIgnoreCase(allocator, a, b)); | ||
| 436 | |||
| 437 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | ||
| 438 | try testing.expect(try n.eqlIgnoreCase(allocator, a, c)); | ||
| 439 | } | ||
| 440 | |||
| 441 | // Composition (NFC, NFKC) | ||
| 347 | 442 | ||
| 348 | fn isHangul(self: Self, cp: u21) bool { | 443 | fn isHangul(self: Self, cp: u21) bool { |
| 349 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; | 444 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; |
| @@ -504,11 +599,11 @@ test "nfkc" { | |||
| 504 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | 599 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); |
| 505 | } | 600 | } |
| 506 | 601 | ||
| 507 | /// Tests for equality of `a` and `b` after normalizing to NFD. | 602 | /// Tests for equality of `a` and `b` after normalizing to NFC. |
| 508 | pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { | 603 | pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { |
| 509 | var norm_result_a = try self.nfd(allocator, a); | 604 | var norm_result_a = try self.nfc(allocator, a); |
| 510 | defer norm_result_a.deinit(); | 605 | defer norm_result_a.deinit(); |
| 511 | var norm_result_b = try self.nfd(allocator, b); | 606 | var norm_result_b = try self.nfc(allocator, b); |
| 512 | defer norm_result_b.deinit(); | 607 | defer norm_result_b.deinit(); |
| 513 | 608 | ||
| 514 | return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | 609 | return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); |