summaryrefslogtreecommitdiff
path: root/src/Normalizer.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
committerGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
commit9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch)
tree723760b45ef8ef604b235d10c3c60edfadd0bb70 /src/Normalizer.zig
parentRemoved dupe tombstone check in Normalizer (diff)
downloadzg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip
Normalizer.eqlIgnoreCase compatibility caseless matching
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r--src/Normalizer.zig103
1 files changed, 99 insertions, 4 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index abe35e5..c68b2ec 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -343,7 +343,102 @@ test "nfkd !ASCII / alloc" {
343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
344} 344}
345 345
/// Builds a new slice in which every code point of `cps` that has a
/// non-empty case folding (per `fold_data.caseFold`) is replaced by that
/// folding; code points with an empty folding are copied through as-is.
/// Caller owns the returned slice.
fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    defer folded.deinit();

    for (cps) |cp| {
        const mapping = self.norm_data.fold_data.caseFold(cp);

        // A non-empty mapping replaces the code point entirely.
        if (mapping.len != 0) {
            try folded.appendSlice(mapping);
            continue;
        }

        try folded.append(cp);
    }

    return try folded.toOwnedSlice();
}
366
/// Decomposes each code point of `cps` with the `.nfkd` form, then runs
/// `canonicalSort` over the combined result. Code points whose
/// decomposition reports `.same` are copied unchanged.
/// Caller owns the returned slice.
fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var decomposed = std.ArrayList(u21).init(allocator);
    defer decomposed.deinit();

    // Scratch space handed to `decompose` for each code point.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfkd, &scratch);

        if (dc.form != .same) {
            try decomposed.appendSlice(dc.cps);
        } else {
            try decomposed.append(cp);
        }
    }

    // Sort once over the whole sequence, after all code points are expanded.
    self.canonicalSort(decomposed.items);

    return try decomposed.toOwnedSlice();
}
391
/// Folds `str` for compatibility caseless matching by applying, in order:
/// NFD decomposition, case folding, NFKD, case folding, and NFKD again.
/// Every intermediate buffer is freed before returning.
/// Caller owns the returned slice and must free it with `allocator`.
fn compatCaselessFold(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u21 {
    const nfd = try self.nfxdCodePoints(allocator, str, .nfd);
    defer allocator.free(nfd);
    const cf_nfd = try self.caseFold(allocator, nfd);
    defer allocator.free(cf_nfd);
    const nfkd_cf_nfd = try self.nfkdCodePoints(allocator, cf_nfd);
    defer allocator.free(nfkd_cf_nfd);
    const cf_nfkd_cf_nfd = try self.caseFold(allocator, nfkd_cf_nfd);
    defer allocator.free(cf_nfkd_cf_nfd);

    return try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd);
}

/// Reports whether `a` and `b` are equal under compatibility caseless
/// matching.
///
/// When both inputs are ASCII-only the comparison is delegated to
/// `std.ascii.eqlIgnoreCase` and returns before any folding is done.
/// Otherwise each string is folded (NFD, case fold, NFKD, case fold, NFKD)
/// and the resulting code point sequences are compared for exact equality.
/// All temporary buffers are released before returning; any error from
/// the folding steps is propagated to the caller.
pub fn eqlIgnoreCase(
    self: Self,
    allocator: mem.Allocator,
    a: []const u8,
    b: []const u8,
) !bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const folded_a = try self.compatCaselessFold(allocator, a);
    defer allocator.free(folded_a);

    const folded_b = try self.compatCaselessFold(allocator, b);
    defer allocator.free(folded_b);

    return mem.eql(u21, folded_a, folded_b);
}
424
test "eqlIgnoreCase" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    // `n` is only passed by value, so it does not need to be mutable.
    const n = Self{ .norm_data = &data };

    // ASCII-only inputs: equal ignoring case, and a mismatch must be rejected.
    try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!"));
    try testing.expect(!try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Other!"));

    // Precomposed vs. decomposed spellings of the same text.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(try n.eqlIgnoreCase(allocator, a, b));

    // Same as `b` but with U+03D2 in place of U+03A5 before the combining mark.
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try n.eqlIgnoreCase(allocator, a, c));
}
440
441// Composition (NFC, NFKC)
347 442
348fn isHangul(self: Self, cp: u21) bool { 443fn isHangul(self: Self, cp: u21) bool {
349 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; 444 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
@@ -504,11 +599,11 @@ test "nfkc" {
504 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); 599 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
505} 600}
506 601
507/// Tests for equality of `a` and `b` after normalizing to NFD. 602/// Tests for equality of `a` and `b` after normalizing to NFC.
508pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { 603pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
509 var norm_result_a = try self.nfd(allocator, a); 604 var norm_result_a = try self.nfc(allocator, a);
510 defer norm_result_a.deinit(); 605 defer norm_result_a.deinit();
511 var norm_result_b = try self.nfd(allocator, b); 606 var norm_result_b = try self.nfc(allocator, b);
512 defer norm_result_b.deinit(); 607 defer norm_result_b.deinit();
513 608
514 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); 609 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);