diff options
| author | 2024-03-01 20:31:52 -0400 | |
|---|---|---|
| committer | 2024-03-01 20:31:52 -0400 | |
| commit | 1ecfd06469ed4c2503034796faf4e7dca4196238 (patch) | |
| tree | fc95244332b24780306228e12cb22ffd27979d5f /src | |
| parent | Changes when case folded check; 20ms faster (diff) | |
| download | zg-1ecfd06469ed4c2503034796faf4e7dca4196238.tar.gz zg-1ecfd06469ed4c2503034796faf4e7dca4196238.tar.xz zg-1ecfd06469ed4c2503034796faf4e7dca4196238.zip | |
Moved case fold stuff to src/Caser.zig
Diffstat (limited to 'src')
| -rw-r--r-- | src/Caser.zig | 109 | ||||
| -rw-r--r-- | src/NormData.zig | 3 | ||||
| -rw-r--r-- | src/Normalizer.zig | 98 | ||||
| -rw-r--r-- | src/main.zig | 21 |
4 files changed, 125 insertions, 106 deletions
diff --git a/src/Caser.zig b/src/Caser.zig new file mode 100644 index 0000000..d02370a --- /dev/null +++ b/src/Caser.zig | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const mem = std.mem; | ||
| 3 | const testing = std.testing; | ||
| 4 | |||
| 5 | const ascii = @import("ascii"); | ||
| 6 | pub const FoldData = @import("FoldData"); | ||
| 7 | const Normalizer = @import("Normalizer"); | ||
| 8 | |||
| 9 | fold_data: *const FoldData, | ||
| 10 | |||
| 11 | const Self = @This(); | ||
| 12 | |||
/// Produces a newly allocated copy of `cps` in which every code point that
/// has a non-empty mapping in `fold_data` is replaced by that mapping; code
/// points whose mapping is empty are copied through unchanged.
///
/// The caller owns the returned slice and must free it with `allocator`.
/// Any allocation failure is propagated to the caller.
fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    // Harmless after a successful `toOwnedSlice`; releases the buffer when
    // an append fails part-way through.
    defer folded.deinit();

    for (cps) |cp| {
        const mapping = self.fold_data.caseFold(cp);

        if (mapping.len != 0) {
            // A mapping may expand to more than one code point.
            try folded.appendSlice(mapping);
            continue;
        }

        // Empty mapping: keep the original code point.
        try folded.append(cp);
    }

    return try folded.toOwnedSlice();
}
| 33 | |||
/// Reports whether at least one code point in `cps` is flagged by
/// `fold_data.changesWhenCaseFolded`. Returns `false` for an empty slice.
fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
    for (cps) |cp| {
        // Stop at the first hit; the remaining code points cannot change the answer.
        if (self.fold_data.changesWhenCaseFolded(cp)) return true;
    }
    return false;
}
| 39 | |||
/// Runs one string through the folding pipeline used by `compatCaselessMatch`:
/// NFD, case fold, NFKD, case fold, NFKD — i.e.
/// NFKD(caseFold(NFKD(caseFold(NFD(str))))).
///
/// The caller owns the returned slice and must free it with `allocator`.
/// All intermediate buffers are released before returning, on both the
/// success and the error path.
fn compatCaselessFold(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalizer,
    str: []const u8,
) ![]const u21 {
    const nfd = try normalizer.nfxdCodePoints(allocator, str, .nfd);
    defer allocator.free(nfd);

    // The first fold is skipped (no allocation) when no code point would
    // change; `null` records that there is nothing extra to free.
    const first_fold: ?[]const u21 = if (self.changesWhenCaseFolded(nfd))
        try self.caseFold(allocator, nfd)
    else
        null;
    defer if (first_fold) |owned| allocator.free(owned);

    const nfkd_input: []const u21 = first_fold orelse nfd;
    const nfkd = try normalizer.nfkdCodePoints(allocator, nfkd_input);
    defer allocator.free(nfkd);

    const second_fold = try self.caseFold(allocator, nfkd);
    defer allocator.free(second_fold);

    return try normalizer.nfkdCodePoints(allocator, second_fold);
}

/// Returns `true` when `a` and `b` are equal after both have been put through
/// NFKD(caseFold(NFKD(caseFold(NFD(x))))), the sequence the Unicode Standard
/// uses to define a compatibility caseless match.
///
/// When both inputs are ASCII-only the comparison is delegated to
/// `std.ascii.eqlIgnoreCase` and nothing is allocated. Otherwise `allocator`
/// is used for temporary buffers, all of which are freed before returning.
///
/// Errors: propagates any error from `normalizer` or from allocation.
pub fn compatCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalizer,
    a: []const u8,
    b: []const u8,
) !bool {
    // Fast path: pure ASCII needs neither normalization nor fold data.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const folded_a = try self.compatCaselessFold(allocator, normalizer, a);
    defer allocator.free(folded_a);

    const folded_b = try self.compatCaselessFold(allocator, normalizer, b);
    defer allocator.free(folded_b);

    return mem.eql(u21, folded_a, folded_b);
}
| 89 | |||
// Checks `compatCaselessMatch` on an ASCII-only pair (fast path) and on two
// non-ASCII pairs that are expected to match after normalization and folding.
test "compatCaselessMatch" {
    const allocator = testing.allocator;

    var norm_data = try Normalizer.NormData.init(allocator);
    defer norm_data.deinit();
    const normalizer: Normalizer = .{ .norm_data = &norm_data };

    var fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();
    const caser: Self = .{ .fold_data = &fold_data };

    const reference = "Héllo World! \u{3d3}";

    // Each row is a pair of strings that must compare equal.
    const matching_pairs = [_][2][]const u8{
        .{ "ascii only!", "ASCII Only!" },
        .{ reference, "He\u{301}llo World! \u{3a5}\u{301}" },
        .{ reference, "He\u{301}llo World! \u{3d2}\u{301}" },
    };

    for (matching_pairs) |pair| {
        try testing.expect(try caser.compatCaselessMatch(allocator, &normalizer, pair[0], pair[1]));
    }
}
diff --git a/src/NormData.zig b/src/NormData.zig index 3c2f614..8a7fa49 100644 --- a/src/NormData.zig +++ b/src/NormData.zig | |||
| @@ -13,7 +13,6 @@ ccc_data: CccData, | |||
| 13 | compat_data: CompatData, | 13 | compat_data: CompatData, |
| 14 | hangul_data: HangulData, | 14 | hangul_data: HangulData, |
| 15 | normp_data: NormPropsData, | 15 | normp_data: NormPropsData, |
| 16 | fold_data: FoldData, | ||
| 17 | 16 | ||
| 18 | const Self = @This(); | 17 | const Self = @This(); |
| 19 | 18 | ||
| @@ -22,7 +21,6 @@ pub fn init(allocator: std.mem.Allocator) !Self { | |||
| 22 | .canon_data = try CanonData.init(allocator), | 21 | .canon_data = try CanonData.init(allocator), |
| 23 | .ccc_data = try CccData.init(allocator), | 22 | .ccc_data = try CccData.init(allocator), |
| 24 | .compat_data = try CompatData.init(allocator), | 23 | .compat_data = try CompatData.init(allocator), |
| 25 | .fold_data = try FoldData.init(allocator), | ||
| 26 | .hangul_data = try HangulData.init(allocator), | 24 | .hangul_data = try HangulData.init(allocator), |
| 27 | .normp_data = try NormPropsData.init(allocator), | 25 | .normp_data = try NormPropsData.init(allocator), |
| 28 | }; | 26 | }; |
| @@ -33,6 +31,5 @@ pub fn deinit(self: *Self) void { | |||
| 33 | self.ccc_data.deinit(); | 31 | self.ccc_data.deinit(); |
| 34 | self.compat_data.deinit(); | 32 | self.compat_data.deinit(); |
| 35 | self.hangul_data.deinit(); | 33 | self.hangul_data.deinit(); |
| 36 | self.fold_data.deinit(); | ||
| 37 | self.normp_data.deinit(); | 34 | self.normp_data.deinit(); |
| 38 | } | 35 | } |
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 5a26dfa..3ff157c 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -18,7 +18,7 @@ const ascii = @import("ascii"); | |||
| 18 | const CodePointIterator = @import("code_point").Iterator; | 18 | const CodePointIterator = @import("code_point").Iterator; |
| 19 | pub const NormData = @import("NormData"); | 19 | pub const NormData = @import("NormData"); |
| 20 | 20 | ||
| 21 | norm_data: *NormData, | 21 | norm_data: *const NormData, |
| 22 | 22 | ||
| 23 | const Self = @This(); | 23 | const Self = @This(); |
| 24 | 24 | ||
| @@ -255,7 +255,7 @@ pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result { | |||
| 255 | return self.nfxd(allocator, str, .nfkd); | 255 | return self.nfxd(allocator, str, .nfkd); |
| 256 | } | 256 | } |
| 257 | 257 | ||
| 258 | fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 { | 258 | pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 { |
| 259 | var dcp_list = std.ArrayList(u21).init(allocator); | 259 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 260 | defer dcp_list.deinit(); | 260 | defer dcp_list.deinit(); |
| 261 | 261 | ||
| @@ -343,28 +343,7 @@ test "nfkd !ASCII / alloc" { | |||
| 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | 343 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); |
| 344 | } | 344 | } |
| 345 | 345 | ||
| 346 | fn caseFold( | 346 | pub fn nfkdCodePoints( |
| 347 | self: Self, | ||
| 348 | allocator: mem.Allocator, | ||
| 349 | cps: []const u21, | ||
| 350 | ) ![]const u21 { | ||
| 351 | var cfcps = std.ArrayList(u21).init(allocator); | ||
| 352 | defer cfcps.deinit(); | ||
| 353 | |||
| 354 | for (cps) |cp| { | ||
| 355 | const cf = self.norm_data.fold_data.caseFold(cp); | ||
| 356 | |||
| 357 | if (cf.len == 0) { | ||
| 358 | try cfcps.append(cp); | ||
| 359 | } else { | ||
| 360 | try cfcps.appendSlice(cf); | ||
| 361 | } | ||
| 362 | } | ||
| 363 | |||
| 364 | return try cfcps.toOwnedSlice(); | ||
| 365 | } | ||
| 366 | |||
| 367 | fn nfkdCodePoints( | ||
| 368 | self: Self, | 347 | self: Self, |
| 369 | allocator: mem.Allocator, | 348 | allocator: mem.Allocator, |
| 370 | cps: []const u21, | 349 | cps: []const u21, |
| @@ -389,77 +368,6 @@ fn nfkdCodePoints( | |||
| 389 | return try dcp_list.toOwnedSlice(); | 368 | return try dcp_list.toOwnedSlice(); |
| 390 | } | 369 | } |
| 391 | 370 | ||
| 392 | fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { | ||
| 393 | return for (cps) |cp| { | ||
| 394 | if (self.norm_data.fold_data.changesWhenCaseFolded(cp)) break true; | ||
| 395 | } else false; | ||
| 396 | } | ||
| 397 | |||
| 398 | pub fn eqlIgnoreCase( | ||
| 399 | self: Self, | ||
| 400 | allocator: mem.Allocator, | ||
| 401 | a: []const u8, | ||
| 402 | b: []const u8, | ||
| 403 | ) !bool { | ||
| 404 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | ||
| 405 | |||
| 406 | // Process a | ||
| 407 | const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd); | ||
| 408 | defer allocator.free(nfd_a); | ||
| 409 | |||
| 410 | var need_frr_cf_nfd_a = false; | ||
| 411 | var cf_nfd_a: []const u21 = nfd_a; | ||
| 412 | if (self.changesWhenCaseFolded(nfd_a)) { | ||
| 413 | cf_nfd_a = try self.caseFold(allocator, nfd_a); | ||
| 414 | need_frr_cf_nfd_a = true; | ||
| 415 | } | ||
| 416 | defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a); | ||
| 417 | |||
| 418 | const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a); | ||
| 419 | defer allocator.free(nfkd_cf_nfd_a); | ||
| 420 | const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); | ||
| 421 | defer allocator.free(cf_nfkd_cf_nfd_a); | ||
| 422 | const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); | ||
| 423 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); | ||
| 424 | |||
| 425 | // Process b | ||
| 426 | const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd); | ||
| 427 | defer allocator.free(nfd_b); | ||
| 428 | |||
| 429 | var need_frr_cf_nfd_b = false; | ||
| 430 | var cf_nfd_b: []const u21 = nfd_b; | ||
| 431 | if (self.changesWhenCaseFolded(nfd_b)) { | ||
| 432 | cf_nfd_b = try self.caseFold(allocator, nfd_b); | ||
| 433 | need_frr_cf_nfd_b = true; | ||
| 434 | } | ||
| 435 | defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b); | ||
| 436 | |||
| 437 | const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b); | ||
| 438 | defer allocator.free(nfkd_cf_nfd_b); | ||
| 439 | const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); | ||
| 440 | defer allocator.free(cf_nfkd_cf_nfd_b); | ||
| 441 | const nfkd_cf_nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); | ||
| 442 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); | ||
| 443 | |||
| 444 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); | ||
| 445 | } | ||
| 446 | |||
| 447 | test "eqlIgnoreCase" { | ||
| 448 | const allocator = testing.allocator; | ||
| 449 | var data = try NormData.init(allocator); | ||
| 450 | defer data.deinit(); | ||
| 451 | var n = Self{ .norm_data = &data }; | ||
| 452 | |||
| 453 | try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!")); | ||
| 454 | |||
| 455 | const a = "Héllo World! \u{3d3}"; | ||
| 456 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | ||
| 457 | try testing.expect(try n.eqlIgnoreCase(allocator, a, b)); | ||
| 458 | |||
| 459 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | ||
| 460 | try testing.expect(try n.eqlIgnoreCase(allocator, a, c)); | ||
| 461 | } | ||
| 462 | |||
| 463 | // Composition (NFC, NFKC) | 371 | // Composition (NFC, NFKC) |
| 464 | 372 | ||
| 465 | fn isHangul(self: Self, cp: u21) bool { | 373 | fn isHangul(self: Self, cp: u21) bool { |
diff --git a/src/main.zig b/src/main.zig index a5afa66..46e7c9d 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -11,15 +11,16 @@ const std = @import("std"); | |||
| 11 | // const strWidth = @import("display_width").strWidth; | 11 | // const strWidth = @import("display_width").strWidth; |
| 12 | 12 | ||
| 13 | // const CodePointIterator = @import("ziglyph").CodePointIterator; | 13 | // const CodePointIterator = @import("ziglyph").CodePointIterator; |
| 14 | const CodePointIterator = @import("code_point").Iterator; | 14 | // const CodePointIterator = @import("code_point").Iterator; |
| 15 | 15 | ||
| 16 | // const ascii = @import("ascii"); | 16 | // const ascii = @import("ascii"); |
| 17 | // const ascii = std.ascii; | 17 | // const ascii = std.ascii; |
| 18 | 18 | ||
| 19 | // const Normalizer = @import("ziglyph").Normalizer; | 19 | // const Normalizer = @import("ziglyph").Normalizer; |
| 20 | const NormData = @import("Normalizer").NormData; | ||
| 21 | const Normalizer = @import("Normalizer"); | 20 | const Normalizer = @import("Normalizer"); |
| 22 | 21 | ||
| 22 | const Caser = @import("Caser"); | ||
| 23 | |||
| 23 | // const GenCatData = @import("GenCatData"); | 24 | // const GenCatData = @import("GenCatData"); |
| 24 | 25 | ||
| 25 | pub fn main() !void { | 26 | pub fn main() !void { |
| @@ -34,15 +35,19 @@ pub fn main() !void { | |||
| 34 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); | 35 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); |
| 35 | defer allocator.free(input); | 36 | defer allocator.free(input); |
| 36 | 37 | ||
| 37 | var data = try NormData.init(allocator); | 38 | var norm_data = try Normalizer.NormData.init(allocator); |
| 38 | defer data.deinit(); | 39 | defer norm_data.deinit(); |
| 39 | var n = Normalizer{ .norm_data = &data }; | 40 | var norm = Normalizer{ .norm_data = &norm_data }; |
| 40 | // var n = try Normalizer.init(allocator); | 41 | // var norm = try Normalizer.init(allocator); |
| 41 | // defer n.deinit(); | 42 | // defer norm.deinit(); |
| 42 | 43 | ||
| 43 | // var gencat_data = try GenCatData.init(allocator); | 44 | // var gencat_data = try GenCatData.init(allocator); |
| 44 | // defer gencat_data.deinit(); | 45 | // defer gencat_data.deinit(); |
| 45 | 46 | ||
| 47 | var fold_data = try Caser.FoldData.init(allocator); | ||
| 48 | defer fold_data.deinit(); | ||
| 49 | var caser = Caser{ .fold_data = &fold_data }; | ||
| 50 | |||
| 46 | // var iter = GraphemeIterator.init(input, &data); | 51 | // var iter = GraphemeIterator.init(input, &data); |
| 47 | // defer iter.deinit(); | 52 | // defer iter.deinit(); |
| 48 | // var iter = CodePointIterator{ .bytes = input }; | 53 | // var iter = CodePointIterator{ .bytes = input }; |
| @@ -66,7 +71,7 @@ pub fn main() !void { | |||
| 66 | // result += 1; | 71 | // result += 1; |
| 67 | // } | 72 | // } |
| 68 | while (iter.next()) |line| { | 73 | while (iter.next()) |line| { |
| 69 | if (try n.eqlIgnoreCase(allocator, prev_line, line)) { | 74 | if (try caser.compatCaselessMatch(allocator, &norm, prev_line, line)) { |
| 70 | result += line.len; | 75 | result += line.len; |
| 71 | } | 76 | } |
| 72 | prev_line = line; | 77 | prev_line = line; |