diff options
Diffstat (limited to 'src/Normalize.zig')
| -rw-r--r-- | src/Normalize.zig | 119 |
1 files changed, 38 insertions, 81 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig index 4a1bae8..3191a8c 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -3,64 +3,22 @@ | |||
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | canon_data: CanonData = undefined, | 5 | canon_data: CanonData = undefined, |
| 6 | ccc_data: CccData = undefined, | ||
| 7 | compat_data: CompatData = undefined, | ||
| 8 | hangul_data: HangulData = undefined, | ||
| 9 | normp_data: NormPropsData = undefined, | ||
| 10 | 6 | ||
| 11 | const Normalize = @This(); | 7 | const Normalize = @This(); |
| 12 | 8 | ||
| 13 | pub fn init(allocator: Allocator) Allocator.Error!Normalize { | 9 | pub fn init(allocator: Allocator) !Normalize { |
| 14 | var norm: Normalize = undefined; | 10 | var norm: Normalize = undefined; |
| 15 | try norm.setup(allocator); | 11 | try norm.setup(allocator); |
| 16 | return norm; | 12 | return norm; |
| 17 | } | 13 | } |
| 18 | 14 | ||
| 19 | pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void { | 15 | pub fn setup(self: *Normalize, allocator: Allocator) !void { |
| 20 | self.canon_data = CanonData.init(allocator) catch |err| { | 16 | self.canon_data = try CanonData.init(allocator); |
| 21 | switch (err) { | ||
| 22 | error.OutOfMemory => |e| return e, | ||
| 23 | else => unreachable, | ||
| 24 | } | ||
| 25 | }; | ||
| 26 | errdefer self.canon_data.deinit(allocator); | ||
| 27 | self.ccc_data = CccData.init(allocator) catch |err| { | ||
| 28 | switch (err) { | ||
| 29 | error.OutOfMemory => |e| return e, | ||
| 30 | else => unreachable, | ||
| 31 | } | ||
| 32 | }; | ||
| 33 | errdefer self.ccc_data.deinit(allocator); | ||
| 34 | self.compat_data = CompatData.init(allocator) catch |err| { | ||
| 35 | switch (err) { | ||
| 36 | error.OutOfMemory => |e| return e, | ||
| 37 | else => unreachable, | ||
| 38 | } | ||
| 39 | }; | ||
| 40 | errdefer self.compat_data.deinit(allocator); | ||
| 41 | self.hangul_data = HangulData.init(allocator) catch |err| { | ||
| 42 | switch (err) { | ||
| 43 | error.OutOfMemory => |e| return e, | ||
| 44 | else => unreachable, | ||
| 45 | } | ||
| 46 | }; | ||
| 47 | errdefer self.hangul_data.deinit(allocator); | ||
| 48 | self.normp_data = NormPropsData.init(allocator) catch |err| { | ||
| 49 | switch (err) { | ||
| 50 | error.OutOfMemory => |e| return e, | ||
| 51 | else => unreachable, | ||
| 52 | } | ||
| 53 | }; | ||
| 54 | } | 17 | } |
| 55 | 18 | ||
| 56 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { | 19 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { |
| 57 | // Reasonably safe (?) | 20 | const mut_norm = @constCast(norm); |
| 58 | var mut_norm = @constCast(norm); | ||
| 59 | mut_norm.canon_data.deinit(allocator); | 21 | mut_norm.canon_data.deinit(allocator); |
| 60 | mut_norm.ccc_data.deinit(allocator); | ||
| 61 | mut_norm.compat_data.deinit(allocator); | ||
| 62 | mut_norm.hangul_data.deinit(allocator); | ||
| 63 | mut_norm.normp_data.deinit(allocator); | ||
| 64 | } | 22 | } |
| 65 | 23 | ||
| 66 | const SBase: u21 = 0xAC00; | 24 | const SBase: u21 = 0xAC00; |
| @@ -73,8 +31,8 @@ const TCount: u21 = 28; | |||
| 73 | const NCount: u21 = 588; // VCount * TCount | 31 | const NCount: u21 = 588; // VCount * TCount |
| 74 | const SCount: u21 = 11172; // LCount * NCount | 32 | const SCount: u21 = 11172; // LCount * NCount |
| 75 | 33 | ||
| 76 | fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { | 34 | fn decomposeHangul(cp: u21, buf: []u21) ?Decomp { |
| 77 | const kind = self.hangul_data.syllable(cp); | 35 | const kind = HangulData.syllable(cp); |
| 78 | if (kind != .LV and kind != .LVT) return null; | 36 | if (kind != .LV and kind != .LVT) return null; |
| 79 | 37 | ||
| 80 | const SIndex: u21 = cp - SBase; | 38 | const SIndex: u21 = cp - SBase; |
| @@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { | |||
| 143 | }, | 101 | }, |
| 144 | 102 | ||
| 145 | .nfkd => { | 103 | .nfkd => { |
| 146 | dc.cps = self.compat_data.toNfkd(cp); | 104 | dc.cps = CompatData.toNfkd(cp); |
| 147 | if (dc.cps.len != 0) { | 105 | if (dc.cps.len != 0) { |
| 148 | dc.form = .nfkd; | 106 | dc.form = .nfkd; |
| 149 | } else { | 107 | } else { |
| @@ -170,13 +128,13 @@ fn decompose( | |||
| 170 | 128 | ||
| 171 | // NFD / NFKD quick checks. | 129 | // NFD / NFKD quick checks. |
| 172 | switch (form) { | 130 | switch (form) { |
| 173 | .nfd => if (self.normp_data.isNfd(cp)) return .{}, | 131 | .nfd => if (NormPropsData.isNfd(cp)) return .{}, |
| 174 | .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, | 132 | .nfkd => if (NormPropsData.isNfkd(cp)) return .{}, |
| 175 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), | 133 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), |
| 176 | } | 134 | } |
| 177 | 135 | ||
| 178 | // Hangul precomposed syllable full decomposition. | 136 | // Hangul precomposed syllable full decomposition. |
| 179 | if (self.decomposeHangul(cp, buf)) |dc| return dc; | 137 | if (decomposeHangul(cp, buf)) |dc| return dc; |
| 180 | 138 | ||
| 181 | // Full decomposition. | 139 | // Full decomposition. |
| 182 | var dc = Decomp{ .form = form }; | 140 | var dc = Decomp{ .form = form }; |
| @@ -218,9 +176,8 @@ fn decompose( | |||
| 218 | 176 | ||
| 219 | test "decompose" { | 177 | test "decompose" { |
| 220 | const allocator = testing.allocator; | 178 | const allocator = testing.allocator; |
| 221 | const n = try Normalize.init(allocator); | 179 | var n = try Normalize.init(allocator); |
| 222 | defer n.deinit(allocator); | 180 | defer n.deinit(allocator); |
| 223 | |||
| 224 | var buf: [18]u21 = undefined; | 181 | var buf: [18]u21 = undefined; |
| 225 | 182 | ||
| 226 | var dc = n.decompose('é', .nfd, &buf); | 183 | var dc = n.decompose('é', .nfd, &buf); |
| @@ -280,17 +237,17 @@ pub const Result = struct { | |||
| 280 | }; | 237 | }; |
| 281 | 238 | ||
| 282 | // Compares code points by Canonical Combining Class order. | 239 | // Compares code points by Canonical Combining Class order. |
| 283 | fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { | 240 | fn cccLess(_: void, lhs: u21, rhs: u21) bool { |
| 284 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); | 241 | return CombiningData.ccc(lhs) < CombiningData.ccc(rhs); |
| 285 | } | 242 | } |
| 286 | 243 | ||
| 287 | // Applies the Canonical Sorting Algorithm. | 244 | // Applies the Canonical Sorting Algorithm. |
| 288 | fn canonicalSort(self: Normalize, cps: []u21) void { | 245 | fn canonicalSort(cps: []u21) void { |
| 289 | var i: usize = 0; | 246 | var i: usize = 0; |
| 290 | while (i < cps.len) : (i += 1) { | 247 | while (i < cps.len) : (i += 1) { |
| 291 | const start: usize = i; | 248 | const start: usize = i; |
| 292 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 249 | while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {} |
| 293 | mem.sort(u21, cps[start..i], self, cccLess); | 250 | mem.sort(u21, cps[start..i], {}, cccLess); |
| 294 | } | 251 | } |
| 295 | } | 252 | } |
| 296 | 253 | ||
| @@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo | |||
| 320 | } | 277 | } |
| 321 | } | 278 | } |
| 322 | 279 | ||
| 323 | self.canonicalSort(dcp_list.items); | 280 | canonicalSort(dcp_list.items); |
| 324 | 281 | ||
| 325 | return try dcp_list.toOwnedSlice(); | 282 | return try dcp_list.toOwnedSlice(); |
| 326 | } | 283 | } |
| @@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 346 | 303 | ||
| 347 | test "nfd ASCII / no-alloc" { | 304 | test "nfd ASCII / no-alloc" { |
| 348 | const allocator = testing.allocator; | 305 | const allocator = testing.allocator; |
| 349 | const n = try Normalize.init(allocator); | 306 | var n = try Normalize.init(allocator); |
| 350 | defer n.deinit(allocator); | 307 | defer n.deinit(allocator); |
| 351 | 308 | ||
| 352 | const result = try n.nfd(allocator, "Hello World!"); | 309 | const result = try n.nfd(allocator, "Hello World!"); |
| @@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" { | |||
| 357 | 314 | ||
| 358 | test "nfd !ASCII / alloc" { | 315 | test "nfd !ASCII / alloc" { |
| 359 | const allocator = testing.allocator; | 316 | const allocator = testing.allocator; |
| 360 | const n = try Normalize.init(allocator); | 317 | var n = try Normalize.init(allocator); |
| 361 | defer n.deinit(allocator); | 318 | defer n.deinit(allocator); |
| 362 | 319 | ||
| 363 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 320 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| @@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" { | |||
| 368 | 325 | ||
| 369 | test "nfkd ASCII / no-alloc" { | 326 | test "nfkd ASCII / no-alloc" { |
| 370 | const allocator = testing.allocator; | 327 | const allocator = testing.allocator; |
| 371 | const n = try Normalize.init(allocator); | 328 | var n = try Normalize.init(allocator); |
| 372 | defer n.deinit(allocator); | 329 | defer n.deinit(allocator); |
| 373 | 330 | ||
| 374 | const result = try n.nfkd(allocator, "Hello World!"); | 331 | const result = try n.nfkd(allocator, "Hello World!"); |
| @@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" { | |||
| 379 | 336 | ||
| 380 | test "nfkd !ASCII / alloc" { | 337 | test "nfkd !ASCII / alloc" { |
| 381 | const allocator = testing.allocator; | 338 | const allocator = testing.allocator; |
| 382 | const n = try Normalize.init(allocator); | 339 | var n = try Normalize.init(allocator); |
| 383 | defer n.deinit(allocator); | 340 | defer n.deinit(allocator); |
| 384 | 341 | ||
| 385 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 342 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| @@ -408,7 +365,7 @@ pub fn nfdCodePoints( | |||
| 408 | } | 365 | } |
| 409 | } | 366 | } |
| 410 | 367 | ||
| 411 | self.canonicalSort(dcp_list.items); | 368 | canonicalSort(dcp_list.items); |
| 412 | 369 | ||
| 413 | return try dcp_list.toOwnedSlice(); | 370 | return try dcp_list.toOwnedSlice(); |
| 414 | } | 371 | } |
| @@ -433,15 +390,15 @@ pub fn nfkdCodePoints( | |||
| 433 | } | 390 | } |
| 434 | } | 391 | } |
| 435 | 392 | ||
| 436 | self.canonicalSort(dcp_list.items); | 393 | canonicalSort(dcp_list.items); |
| 437 | 394 | ||
| 438 | return try dcp_list.toOwnedSlice(); | 395 | return try dcp_list.toOwnedSlice(); |
| 439 | } | 396 | } |
| 440 | 397 | ||
| 441 | // Composition (NFC, NFKC) | 398 | // Composition (NFC, NFKC) |
| 442 | 399 | ||
| 443 | fn isHangul(self: Normalize, cp: u21) bool { | 400 | fn isHangul(cp: u21) bool { |
| 444 | return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; | 401 | return cp >= 0x1100 and HangulData.syllable(cp) != .none; |
| 445 | } | 402 | } |
| 446 | 403 | ||
| 447 | /// Normalizes `str` to NFC. | 404 | /// Normalizes `str` to NFC. |
| @@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 479 | block_check: while (i < dcps.len) : (i += 1) { | 436 | block_check: while (i < dcps.len) : (i += 1) { |
| 480 | const C = dcps[i]; | 437 | const C = dcps[i]; |
| 481 | if (C == tombstone) continue :block_check; | 438 | if (C == tombstone) continue :block_check; |
| 482 | const cc_C = self.ccc_data.ccc(C); | 439 | const cc_C = CombiningData.ccc(C); |
| 483 | var starter_index: ?usize = null; | 440 | var starter_index: ?usize = null; |
| 484 | var j: usize = i; | 441 | var j: usize = i; |
| 485 | 442 | ||
| @@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 489 | if (dcps[j] == tombstone) continue; | 446 | if (dcps[j] == tombstone) continue; |
| 490 | 447 | ||
| 491 | // Check for starter. | 448 | // Check for starter. |
| 492 | if (self.ccc_data.isStarter(dcps[j])) { | 449 | if (CombiningData.isStarter(dcps[j])) { |
| 493 | // Check for blocking conditions. | 450 | // Check for blocking conditions. |
| 494 | for (dcps[(j + 1)..i]) |B| { | 451 | for (dcps[(j + 1)..i]) |B| { |
| 495 | if (B == tombstone) continue; | 452 | if (B == tombstone) continue; |
| 496 | const cc_B = self.ccc_data.ccc(B); | 453 | const cc_B = CombiningData.ccc(B); |
| 497 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; | 454 | if (cc_B != 0 and isHangul(C)) continue :block_check; |
| 498 | if (cc_B >= cc_C) continue :block_check; | 455 | if (cc_B >= cc_C) continue :block_check; |
| 499 | } | 456 | } |
| 500 | 457 | ||
| @@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 515 | 472 | ||
| 516 | // If L and C are Hangul syllables, we can compose | 473 | // If L and C are Hangul syllables, we can compose |
| 517 | // them algorithmically if possible. | 474 | // them algorithmically if possible. |
| 518 | if (self.isHangul(L) and self.isHangul(C)) { | 475 | if (isHangul(L) and isHangul(C)) { |
| 519 | // Get Hangul syllable types. | 476 | // Get Hangul syllable types. |
| 520 | const l_stype = self.hangul_data.syllable(L); | 477 | const l_stype = HangulData.syllable(L); |
| 521 | const c_stype = self.hangul_data.syllable(C); | 478 | const c_stype = HangulData.syllable(C); |
| 522 | 479 | ||
| 523 | if (l_stype == .LV and c_stype == .T) { | 480 | if (l_stype == .LV and c_stype == .T) { |
| 524 | // LV, T canonical composition. | 481 | // LV, T canonical composition. |
| @@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 547 | // Composition Exclusions (FCX) list, | 504 | // Composition Exclusions (FCX) list, |
| 548 | // preventing it from appearing in any | 505 | // preventing it from appearing in any |
| 549 | // composed form (NFC, NFKC). | 506 | // composed form (NFC, NFKC). |
| 550 | if (!self.normp_data.isFcx(P)) { | 507 | if (!NormPropsData.isFcx(P)) { |
| 551 | dcps[sidx] = P; | 508 | dcps[sidx] = P; |
| 552 | dcps[i] = tombstone; // Mark for deletion. | 509 | dcps[i] = tombstone; // Mark for deletion. |
| 553 | deleted += 1; | 510 | deleted += 1; |
| @@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 577 | 534 | ||
| 578 | test "nfc" { | 535 | test "nfc" { |
| 579 | const allocator = testing.allocator; | 536 | const allocator = testing.allocator; |
| 580 | const n = try Normalize.init(allocator); | 537 | var n = try Normalize.init(allocator); |
| 581 | defer n.deinit(allocator); | 538 | defer n.deinit(allocator); |
| 582 | 539 | ||
| 583 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 540 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| @@ -588,7 +545,7 @@ test "nfc" { | |||
| 588 | 545 | ||
| 589 | test "nfkc" { | 546 | test "nfkc" { |
| 590 | const allocator = testing.allocator; | 547 | const allocator = testing.allocator; |
| 591 | const n = try Normalize.init(allocator); | 548 | var n = try Normalize.init(allocator); |
| 592 | defer n.deinit(allocator); | 549 | defer n.deinit(allocator); |
| 593 | 550 | ||
| 594 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 551 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| @@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) | |||
| 609 | 566 | ||
| 610 | test "eql" { | 567 | test "eql" { |
| 611 | const allocator = testing.allocator; | 568 | const allocator = testing.allocator; |
| 612 | const n = try Normalize.init(allocator); | 569 | var n = try Normalize.init(allocator); |
| 613 | defer n.deinit(allocator); | 570 | defer n.deinit(allocator); |
| 614 | 571 | ||
| 615 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 572 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| @@ -666,13 +623,13 @@ const mem = std.mem; | |||
| 666 | const simd = std.simd; | 623 | const simd = std.simd; |
| 667 | const testing = std.testing; | 624 | const testing = std.testing; |
| 668 | const unicode = std.unicode; | 625 | const unicode = std.unicode; |
| 669 | const Allocator = std.mem.Allocator; | 626 | const Allocator = mem.Allocator; |
| 670 | 627 | ||
| 671 | const ascii = @import("ascii"); | 628 | const ascii = @import("ascii"); |
| 672 | const CodePointIterator = @import("code_point").Iterator; | 629 | const CodePointIterator = @import("code_point").Iterator; |
| 673 | 630 | ||
| 674 | const CanonData = @import("CanonData"); | 631 | const CanonData = @import("CanonData"); |
| 675 | const CccData = @import("CombiningData"); | 632 | const CombiningData = @import("CombiningData"); |
| 676 | const CompatData = @import("CompatData"); | 633 | const CompatData = @import("CompatData"); |
| 677 | const HangulData = @import("HangulData"); | 634 | const HangulData = @import("HangulData"); |
| 678 | const NormPropsData = @import("NormPropsData"); | 635 | const NormPropsData = @import("NormPropsData"); |