diff options
| author | 2024-03-01 18:51:43 -0400 | |
|---|---|---|
| committer | 2024-03-01 18:51:43 -0400 | |
| commit | 9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch) | |
| tree | 723760b45ef8ef604b235d10c3c60edfadd0bb70 /src | |
| parent | Removed dupe tombstone check in Normalizer (diff) | |
| download | zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip | |
Normalizer.eqlIgnoreCase compatibility caseless matching
Diffstat (limited to 'src')
| -rw-r--r-- | src/FoldData.zig | 48 | ||||
| -rw-r--r-- | src/NormData.zig | 4 | ||||
| -rw-r--r-- | src/Normalizer.zig | 103 | ||||
| -rw-r--r-- | src/main.zig | 17 |
4 files changed, 163 insertions, 9 deletions
diff --git a/src/FoldData.zig b/src/FoldData.zig new file mode 100644 index 0000000..139c677 --- /dev/null +++ b/src/FoldData.zig | |||
| @@ -0,0 +1,48 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
| 6 | allocator: mem.Allocator, | ||
| 7 | fold: [][]u21 = undefined, | ||
| 8 | |||
| 9 | const Self = @This(); | ||
| 10 | |||
/// Builds the case-folding table from the embedded, deflate-compressed
/// `fold` data.
///
/// The decompressed stream is read as a sequence of records in native
/// endianness: a `u8` length (a length of 0 terminates the stream), a `u24`
/// code point, then `length - 1` `u24` values that form the mapping stored
/// for that code point.
///
/// The returned value owns its memory; release it with `deinit`. On any
/// error, everything allocated so far is freed before returning.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("fold");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    // One slot per possible index 0..0x10FFFF.
    const fold = try allocator.alloc([]u21, 0x110000);
    // Start every slot as an empty slice so cleanup can free all slots
    // uniformly (freeing a zero-length slice is a no-op).
    @memset(fold, &.{});
    // Without this, a failed read or allocation below would leak the table
    // and every mapping loaded so far.
    errdefer {
        for (fold) |slice| allocator.free(slice);
        allocator.free(fold);
    }

    while (true) {
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;
        const cp = try reader.readInt(u24, endian);
        const mapping = try allocator.alloc(u21, len - 1);
        // Store the slice before filling it so the errdefer above also
        // releases it if one of the reads below fails.
        fold[cp] = mapping;
        for (mapping) |*slot| {
            slot.* = @intCast(try reader.readInt(u24, endian));
        }
    }

    return Self{
        .allocator = allocator,
        .fold = fold,
    };
}
| 39 | |||
/// Releases every per-code-point mapping slice, then the table itself,
/// using the allocator stored at `init` time.
pub fn deinit(self: *Self) void {
    const allocator = self.allocator;
    for (self.fold) |mapping| {
        allocator.free(mapping);
    }
    allocator.free(self.fold);
}
| 44 | |||
/// Returns the case fold mapping stored for `cp`.
/// The slice is empty when no mapping was loaded for `cp`; it borrows from
/// the table and must not be freed by the caller.
/// `cp` indexes a table of 0x110000 entries, so it must be <= 0x10FFFF.
pub inline fn caseFold(self: Self, cp: u21) []const u21 {
    const mapping: []const u21 = self.fold[cp];
    return mapping;
}
diff --git a/src/NormData.zig b/src/NormData.zig index 7c2a09b..3c2f614 100644 --- a/src/NormData.zig +++ b/src/NormData.zig | |||
| @@ -4,6 +4,7 @@ const mem = std.mem; | |||
| 4 | const CanonData = @import("CanonData"); | 4 | const CanonData = @import("CanonData"); |
| 5 | const CccData = @import("CombiningData"); | 5 | const CccData = @import("CombiningData"); |
| 6 | const CompatData = @import("CompatData"); | 6 | const CompatData = @import("CompatData"); |
| 7 | const FoldData = @import("FoldData"); | ||
| 7 | const HangulData = @import("HangulData"); | 8 | const HangulData = @import("HangulData"); |
| 8 | const NormPropsData = @import("NormPropsData"); | 9 | const NormPropsData = @import("NormPropsData"); |
| 9 | 10 | ||
| @@ -12,6 +13,7 @@ ccc_data: CccData, | |||
| 12 | compat_data: CompatData, | 13 | compat_data: CompatData, |
| 13 | hangul_data: HangulData, | 14 | hangul_data: HangulData, |
| 14 | normp_data: NormPropsData, | 15 | normp_data: NormPropsData, |
| 16 | fold_data: FoldData, | ||
| 15 | 17 | ||
| 16 | const Self = @This(); | 18 | const Self = @This(); |
| 17 | 19 | ||
| @@ -20,6 +22,7 @@ pub fn init(allocator: std.mem.Allocator) !Self { | |||
| 20 | .canon_data = try CanonData.init(allocator), | 22 | .canon_data = try CanonData.init(allocator), |
| 21 | .ccc_data = try CccData.init(allocator), | 23 | .ccc_data = try CccData.init(allocator), |
| 22 | .compat_data = try CompatData.init(allocator), | 24 | .compat_data = try CompatData.init(allocator), |
| 25 | .fold_data = try FoldData.init(allocator), | ||
| 23 | .hangul_data = try HangulData.init(allocator), | 26 | .hangul_data = try HangulData.init(allocator), |
| 24 | .normp_data = try NormPropsData.init(allocator), | 27 | .normp_data = try NormPropsData.init(allocator), |
| 25 | }; | 28 | }; |
| @@ -30,5 +33,6 @@ pub fn deinit(self: *Self) void { | |||
| 30 | self.ccc_data.deinit(); | 33 | self.ccc_data.deinit(); |
| 31 | self.compat_data.deinit(); | 34 | self.compat_data.deinit(); |
| 32 | self.hangul_data.deinit(); | 35 | self.hangul_data.deinit(); |
| 36 | self.fold_data.deinit(); | ||
| 33 | self.normp_data.deinit(); | 37 | self.normp_data.deinit(); |
| 34 | } | 38 | } |
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index abe35e5..c68b2ec 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -343,7 +343,102 @@ test "nfkd !ASCII / alloc" { | |||
| 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); |
| 344 | } | 344 | } |
| 345 | 345 | ||
/// Applies the fold table to every code point in `cps`.
/// A code point with a non-empty entry in `fold_data` is replaced by that
/// entry; any other code point is copied through unchanged.
/// Caller owns the returned slice and frees it with `allocator`.
fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    defer folded.deinit();

    for (cps) |cp| {
        const mapping = self.norm_data.fold_data.caseFold(cp);

        if (mapping.len != 0) {
            try folded.appendSlice(mapping);
            continue;
        }

        // No mapping stored: the code point folds to itself.
        try folded.append(cp);
    }

    return try folded.toOwnedSlice();
}
| 366 | |||
/// Decomposes each code point of `cps` with `decompose` in `.nfkd` mode,
/// then reorders the combined result with `canonicalSort`.
/// Caller owns the returned slice and frees it with `allocator`.
fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var decomposed = std.ArrayList(u21).init(allocator);
    defer decomposed.deinit();

    // Scratch space handed to `decompose` for a single code point's output.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfkd, &scratch);

        if (dc.form != .same) {
            try decomposed.appendSlice(dc.cps);
            continue;
        }

        // `.same` means there is no decomposition; keep the code point.
        try decomposed.append(cp);
    }

    self.canonicalSort(decomposed.items);

    return try decomposed.toOwnedSlice();
}
| 391 | |||
/// Transforms `str` for compatibility caseless comparison by computing
/// NFKD(caseFold(NFKD(caseFold(NFD(str))))) over its code points.
/// Caller owns the returned slice and frees it with `allocator`; all
/// intermediate buffers are freed before returning.
fn compatCaselessForm(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u21 {
    const nfd = try self.nfxdCodePoints(allocator, str, .nfd);
    defer allocator.free(nfd);
    const cf_nfd = try self.caseFold(allocator, nfd);
    defer allocator.free(cf_nfd);
    const nfkd_cf_nfd = try self.nfkdCodePoints(allocator, cf_nfd);
    defer allocator.free(nfkd_cf_nfd);
    const cf_nfkd_cf_nfd = try self.caseFold(allocator, nfkd_cf_nfd);
    defer allocator.free(cf_nfkd_cf_nfd);

    return try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd);
}

/// Tests `a` and `b` for equality ignoring case, using compatibility
/// caseless matching: both strings are run through
/// NFKD(caseFold(NFKD(caseFold(NFD(x))))) and the resulting code point
/// sequences are compared.
///
/// When both inputs are ASCII-only, the comparison is delegated to
/// `std.ascii.eqlIgnoreCase` and nothing is allocated. Otherwise
/// `allocator` is used for temporary buffers, all of which are freed before
/// returning; allocation failures are propagated.
pub fn eqlIgnoreCase(
    self: Self,
    allocator: mem.Allocator,
    a: []const u8,
    b: []const u8,
) !bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const form_a = try self.compatCaselessForm(allocator, a);
    defer allocator.free(form_a);
    const form_b = try self.compatCaselessForm(allocator, b);
    defer allocator.free(form_b);

    return mem.eql(u21, form_a, form_b);
}
| 424 | |||
test "eqlIgnoreCase" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // ASCII-only inputs that differ only in letter case.
    const ascii_match = try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!");
    try testing.expect(ascii_match);

    // Precomposed text versus the same text written with combining marks.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    const decomposed_match = try n.eqlIgnoreCase(allocator, a, b);
    try testing.expect(decomposed_match);

    // Same as `b`, but with U+03D2 in place of U+03A5 before the combining
    // acute accent.
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    const compat_match = try n.eqlIgnoreCase(allocator, a, c);
    try testing.expect(compat_match);
}
| 440 | |||
| 441 | // Composition (NFC, NFKC) | ||
| 347 | 442 | ||
| 348 | fn isHangul(self: Self, cp: u21) bool { | 443 | fn isHangul(self: Self, cp: u21) bool { |
| 349 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; | 444 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; |
| @@ -504,11 +599,11 @@ test "nfkc" { | |||
| 504 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | 599 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); |
| 505 | } | 600 | } |
| 506 | 601 | ||
| 507 | /// Tests for equality of `a` and `b` after normalizing to NFD. | 602 | /// Tests for equality of `a` and `b` after normalizing to NFC. |
| 508 | pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { | 603 | pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { |
| 509 | var norm_result_a = try self.nfd(allocator, a); | 604 | var norm_result_a = try self.nfc(allocator, a); |
| 510 | defer norm_result_a.deinit(); | 605 | defer norm_result_a.deinit(); |
| 511 | var norm_result_b = try self.nfd(allocator, b); | 606 | var norm_result_b = try self.nfc(allocator, b); |
| 512 | defer norm_result_b.deinit(); | 607 | defer norm_result_b.deinit(); |
| 513 | 608 | ||
| 514 | return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | 609 | return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); |
diff --git a/src/main.zig b/src/main.zig index 59a0fbc..a5afa66 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -49,21 +49,28 @@ pub fn main() !void { | |||
| 49 | var iter = std.mem.splitScalar(u8, input, '\n'); | 49 | var iter = std.mem.splitScalar(u8, input, '\n'); |
| 50 | 50 | ||
| 51 | var result: usize = 0; | 51 | var result: usize = 0; |
| 52 | var prev_line: []const u8 = ""; | ||
| 52 | // var result: isize = 0; | 53 | // var result: isize = 0; |
| 53 | var timer = try std.time.Timer.start(); | 54 | var timer = try std.time.Timer.start(); |
| 54 | 55 | ||
| 55 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); | 56 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); |
| 56 | // while (iter.next()) |_| result += 1; | 57 | // while (iter.next()) |_| result += 1; |
| 57 | // while (iter.next()) |line| result += strWidth(line, &data); | 58 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 58 | while (iter.next()) |line| { | 59 | // while (iter.next()) |line| { |
| 59 | const nfc = try n.nfkc(allocator, line); | 60 | // const nfc = try n.nfkc(allocator, line); |
| 60 | result += nfc.slice.len; | 61 | // result += nfc.slice.len; |
| 61 | // nfc.deinit(); | 62 | // // nfc.deinit(); |
| 62 | } | 63 | // } |
| 63 | // while (iter.next()) |cp| { | 64 | // while (iter.next()) |cp| { |
| 64 | // if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) }); | 65 | // if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) }); |
| 65 | // result += 1; | 66 | // result += 1; |
| 66 | // } | 67 | // } |
| 68 | while (iter.next()) |line| { | ||
| 69 | if (try n.eqlIgnoreCase(allocator, prev_line, line)) { | ||
| 70 | result += line.len; | ||
| 71 | } | ||
| 72 | prev_line = line; | ||
| 73 | } | ||
| 67 | 74 | ||
| 68 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); | 75 | std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); |
| 69 | } | 76 | } |