Diffstat (limited to 'src/Normalize.zig')
-rw-r--r--  src/Normalize.zig  119
1 file changed, 38 insertions(+), 81 deletions(-)
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 4a1bae8..3191a8c 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -3,64 +3,22 @@
 //! NFKC, NFD, and NFKD normalization forms.
 
 canon_data: CanonData = undefined,
-ccc_data: CccData = undefined,
-compat_data: CompatData = undefined,
-hangul_data: HangulData = undefined,
-normp_data: NormPropsData = undefined,
 
 const Normalize = @This();
 
-pub fn init(allocator: Allocator) Allocator.Error!Normalize {
+pub fn init(allocator: Allocator) !Normalize {
     var norm: Normalize = undefined;
     try norm.setup(allocator);
     return norm;
 }
 
-pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void {
-    self.canon_data = CanonData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.canon_data.deinit(allocator);
-    self.ccc_data = CccData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.ccc_data.deinit(allocator);
-    self.compat_data = CompatData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.compat_data.deinit(allocator);
-    self.hangul_data = HangulData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
-    errdefer self.hangul_data.deinit(allocator);
-    self.normp_data = NormPropsData.init(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
-    };
+pub fn setup(self: *Normalize, allocator: Allocator) !void {
+    self.canon_data = try CanonData.init(allocator);
 }
 
 pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
-    // Reasonably safe (?)
-    var mut_norm = @constCast(norm);
+    const mut_norm = @constCast(norm);
     mut_norm.canon_data.deinit(allocator);
-    mut_norm.ccc_data.deinit(allocator);
-    mut_norm.compat_data.deinit(allocator);
-    mut_norm.hangul_data.deinit(allocator);
-    mut_norm.normp_data.deinit(allocator);
 }
 
 const SBase: u21 = 0xAC00;
@@ -73,8 +31,8 @@ const TCount: u21 = 28;
 const NCount: u21 = 588; // VCount * TCount
 const SCount: u21 = 11172; // LCount * NCount
 
-fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp {
-    const kind = self.hangul_data.syllable(cp);
+fn decomposeHangul(cp: u21, buf: []u21) ?Decomp {
+    const kind = HangulData.syllable(cp);
     if (kind != .LV and kind != .LVT) return null;
 
     const SIndex: u21 = cp - SBase;
@@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
         },
 
         .nfkd => {
-            dc.cps = self.compat_data.toNfkd(cp);
+            dc.cps = CompatData.toNfkd(cp);
             if (dc.cps.len != 0) {
                 dc.form = .nfkd;
             } else {
@@ -170,13 +128,13 @@ fn decompose(
 
     // NFD / NFKD quick checks.
     switch (form) {
-        .nfd => if (self.normp_data.isNfd(cp)) return .{},
-        .nfkd => if (self.normp_data.isNfkd(cp)) return .{},
+        .nfd => if (NormPropsData.isNfd(cp)) return .{},
+        .nfkd => if (NormPropsData.isNfkd(cp)) return .{},
         else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
     }
 
     // Hangul precomposed syllable full decomposition.
-    if (self.decomposeHangul(cp, buf)) |dc| return dc;
+    if (decomposeHangul(cp, buf)) |dc| return dc;
 
     // Full decomposition.
     var dc = Decomp{ .form = form };
@@ -218,9 +176,8 @@ fn decompose(
 
 test "decompose" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
-
     var buf: [18]u21 = undefined;
 
     var dc = n.decompose('é', .nfd, &buf);
@@ -280,17 +237,17 @@ pub const Result = struct {
 };
 
 // Compares code points by Canonical Combining Class order.
-fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool {
-    return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
+fn cccLess(_: void, lhs: u21, rhs: u21) bool {
+    return CombiningData.ccc(lhs) < CombiningData.ccc(rhs);
 }
 
 // Applies the Canonical Sorting Algorithm.
-fn canonicalSort(self: Normalize, cps: []u21) void {
+fn canonicalSort(cps: []u21) void {
     var i: usize = 0;
     while (i < cps.len) : (i += 1) {
         const start: usize = i;
-        while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
-        mem.sort(u21, cps[start..i], self, cccLess);
+        while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {}
+        mem.sort(u21, cps[start..i], {}, cccLess);
     }
 }
 
@@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
         }
     }
 
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
 
     return try dcp_list.toOwnedSlice();
 }
@@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
 test "nfd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfd(allocator, "Hello World!");
@@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" {
 
 test "nfd !ASCII / alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" {
 
 test "nfkd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfkd(allocator, "Hello World!");
@@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" {
 
 test "nfkd !ASCII / alloc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -408,7 +365,7 @@ pub fn nfdCodePoints(
         }
     }
 
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
 
     return try dcp_list.toOwnedSlice();
 }
@@ -433,15 +390,15 @@ pub fn nfkdCodePoints(
         }
     }
 
-    self.canonicalSort(dcp_list.items);
+    canonicalSort(dcp_list.items);
 
     return try dcp_list.toOwnedSlice();
 }
 
 // Composition (NFC, NFKC)
 
-fn isHangul(self: Normalize, cp: u21) bool {
-    return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none;
+fn isHangul(cp: u21) bool {
+    return cp >= 0x1100 and HangulData.syllable(cp) != .none;
 }
 
 /// Normalizes `str` to NFC.
@@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
         block_check: while (i < dcps.len) : (i += 1) {
             const C = dcps[i];
             if (C == tombstone) continue :block_check;
-            const cc_C = self.ccc_data.ccc(C);
+            const cc_C = CombiningData.ccc(C);
             var starter_index: ?usize = null;
             var j: usize = i;
 
@@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                 if (dcps[j] == tombstone) continue;
 
                 // Check for starter.
-                if (self.ccc_data.isStarter(dcps[j])) {
+                if (CombiningData.isStarter(dcps[j])) {
                     // Check for blocking conditions.
                     for (dcps[(j + 1)..i]) |B| {
                         if (B == tombstone) continue;
-                        const cc_B = self.ccc_data.ccc(B);
-                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
+                        const cc_B = CombiningData.ccc(B);
+                        if (cc_B != 0 and isHangul(C)) continue :block_check;
                         if (cc_B >= cc_C) continue :block_check;
                     }
 
@@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
             // If L and C are Hangul syllables, we can compose
             // them algorithmically if possible.
-            if (self.isHangul(L) and self.isHangul(C)) {
+            if (isHangul(L) and isHangul(C)) {
                 // Get Hangul syllable types.
-                const l_stype = self.hangul_data.syllable(L);
-                const c_stype = self.hangul_data.syllable(C);
+                const l_stype = HangulData.syllable(L);
+                const c_stype = HangulData.syllable(C);
 
                 if (l_stype == .LV and c_stype == .T) {
                     // LV, T canonical composition.
@@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
                 // Composition Exclusions (FCX) list,
                 // preventing it from appearing in any
                 // composed form (NFC, NFKC).
-                if (!self.normp_data.isFcx(P)) {
+                if (!NormPropsData.isFcx(P)) {
                     dcps[sidx] = P;
                     dcps[i] = tombstone; // Mark for deletion.
                     deleted += 1;
@@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
 test "nfc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -588,7 +545,7 @@ test "nfc" {
 
 test "nfkc" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
 
 test "eql" {
     const allocator = testing.allocator;
-    const n = try Normalize.init(allocator);
+    var n = try Normalize.init(allocator);
     defer n.deinit(allocator);
 
     try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -666,13 +623,13 @@ const mem = std.mem;
 const simd = std.simd;
 const testing = std.testing;
 const unicode = std.unicode;
-const Allocator = std.mem.Allocator;
+const Allocator = mem.Allocator;
 
 const ascii = @import("ascii");
 const CodePointIterator = @import("code_point").Iterator;
 
 const CanonData = @import("CanonData");
-const CccData = @import("CombiningData");
+const CombiningData = @import("CombiningData");
 const CompatData = @import("CompatData");
 const HangulData = @import("HangulData");
 const NormPropsData = @import("NormPropsData");
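
Usage note (not part of the diff): for callers the public API is unchanged by this refactor; only canon_data still requires an allocator, while the other data tables are now reached directly through their modules. Below is a minimal sketch mirroring the tests in the diff above. The module name in @import and the Result.slice / Result.deinit(allocator) accessors are assumptions about parts of the file not shown in these hunks.

// Hypothetical usage sketch, for illustration only.
const std = @import("std");
const Normalize = @import("Normalize"); // module name as assumed to be wired up in build.zig

test "usage sketch" {
    const allocator = std.testing.allocator;

    // Same setup/teardown as the updated tests above.
    var n = try Normalize.init(allocator);
    defer n.deinit(allocator);

    // NFC-compose a base letter plus a combining acute accent.
    const result = try n.nfc(allocator, "e\u{301}");
    defer result.deinit(allocator); // assumed Result API (see `pub const Result` in this file)
    try std.testing.expectEqualStrings("é", result.slice); // assumed `slice` field
}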