From a498ae01d8198efba90b63dd84f1ee8f9166036a Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Thu, 29 Feb 2024 10:36:18 -0400 Subject: Major Normalizer optimizations --- src/Normalizer.zig | 135 +++++++++++++++++++++++++++++------------------------ 1 file changed, 75 insertions(+), 60 deletions(-) (limited to 'src/Normalizer.zig') diff --git a/src/Normalizer.zig b/src/Normalizer.zig index d32ad52..a575bc4 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig @@ -255,10 +255,7 @@ pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result { return self.nfxd(allocator, str, .nfkd); } -fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { - // Quick checks. - if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; - +fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -276,13 +273,23 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu self.canonicalSort(dcp_list.items); - var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); - defer dstr_list.deinit(); + return try dcp_list.toOwnedSlice(); +} + +fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { + // Quick checks. + if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; + + const dcps = try self.nfxdCodePoints(allocator, str, form); + defer allocator.free(dcps); + var dstr_list = std.ArrayList(u8).init(allocator); + defer dstr_list.deinit(); var buf: [4]u8 = undefined; - for (dcp_list.items) |dcp| { + + for (dcps) |dcp| { const len = try unicode.utf8Encode(dcp, &buf); - dstr_list.appendSliceAssumeCapacity(buf[0..len]); + try dstr_list.appendSlice(buf[0..len]); } return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; @@ -342,10 +349,6 @@ fn isHangul(self: Self, cp: u21) bool { return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; } -fn isNonHangulStarter(self: Self, cp: u21) bool { - return !self.isHangul(cp) and self.norm_data.ccc_data.isStarter(cp); -} - /// Normalizes `str` to NFC. pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result { return self.nfxc(allocator, str, .nfc); @@ -362,47 +365,42 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; // Decompose first. - var d_result = if (form == .nfc) - try self.nfd(allocator, str) + var dcps = if (form == .nfc) + try self.nfxdCodePoints(allocator, str, .nfd) else - try self.nfkd(allocator, str); - defer d_result.deinit(); - - // Get code points. - var cp_iter = CodePointIterator{ .bytes = d_result.slice }; - - var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len); - defer d_list.deinit(); - - while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code); + try self.nfxdCodePoints(allocator, str, .nfkd); + defer allocator.free(dcps); // Compose const tombstone = 0xe000; // Start of BMP Private Use Area + // Loop over all decomposed code points. while (true) { var i: usize = 1; // start at second code point. var deleted: usize = 0; - block_check: while (i < d_list.items.len) : (i += 1) { - const C = d_list.items[i]; + // For each code point, C, find the preceding + // starter code point L, if any. + block_check: while (i < dcps.len) : (i += 1) { + const C = dcps[i]; + if (C == tombstone) continue :block_check; const cc_C = self.norm_data.ccc_data.ccc(C); var starter_index: ?usize = null; var j: usize = i; + // Seek back to find starter L, if any. while (true) { j -= 1; + if (dcps[j] == tombstone) continue; // Check for starter. - if (self.norm_data.ccc_data.isStarter(d_list.items[j])) { - if (i - j > 1) { // If there's distance between the starting point and the current position. - for (d_list.items[(j + 1)..i]) |B| { - const cc_B = self.norm_data.ccc_data.ccc(B); - // Check for blocking conditions. - if (self.isHangul(C)) { - if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; - } - if (cc_B >= cc_C) continue :block_check; - } + if (self.norm_data.ccc_data.isStarter(dcps[j])) { + // Check for blocking conditions. + for (dcps[(j + 1)..i]) |B| { + if (B == tombstone) continue; + const cc_B = self.norm_data.ccc_data.ccc(B); + if (cc_B != 0 and self.isHangul(C)) continue :block_check; + if (cc_B >= cc_C) continue :block_check; } // Found starter at j. @@ -413,37 +411,50 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu if (j == 0) break; } + // If we have a starter L, see if there's a primary + // composite, P, for the sequence L, C. If so, we must + // repace L with P and delete C. if (starter_index) |sidx| { - const L = d_list.items[sidx]; + const L = dcps[sidx]; var processed_hangul = false; + // If L and C are Hangul syllables, we can compose + // them algorithmically if possible. if (self.isHangul(L) and self.isHangul(C)) { + // Get Hangul syllable types. const l_stype = self.norm_data.hangul_data.syllable(L); const c_stype = self.norm_data.hangul_data.syllable(C); if (l_stype == .LV and c_stype == .T) { - // LV, T - d_list.items[sidx] = composeHangulCanon(L, C); - d_list.items[i] = tombstone; // Mark for deletion. + // LV, T canonical composition. + dcps[sidx] = composeHangulCanon(L, C); + dcps[i] = tombstone; // Mark for deletion. processed_hangul = true; } if (l_stype == .L and c_stype == .V) { - // Handle L, V. L, V, T is handled via main loop. - d_list.items[sidx] = composeHangulFull(L, C, 0); - d_list.items[i] = tombstone; // Mark for deletion. + // L, V full composition. L, V, T is handled via main loop. + dcps[sidx] = composeHangulFull(L, C, 0); + dcps[i] = tombstone; // Mark for deletion. processed_hangul = true; } if (processed_hangul) deleted += 1; } + // If no composition has occurred yet. if (!processed_hangul) { - // L -> C not Hangul. + // L, C are not Hangul, so check for primary composite + // in the Unicode Character Database. if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { + // We have a primary composite P for L, C. + // We must check if P is not in the Full + // Composition Exclusions (FCX) list, + // preventing it from appearing in any + // composed form (NFC, NFKC). if (!self.norm_data.normp_data.isFcx(P)) { - d_list.items[sidx] = P; - d_list.items[i] = tombstone; // Mark for deletion. + dcps[sidx] = P; + dcps[i] = tombstone; // Mark for deletion. deleted += 1; } } @@ -451,31 +462,35 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu } } - // Check if finished. + // If we have no deletions. the code point sequence + // has been fully composed. if (deleted == 0) { - var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4); + var cstr_list = std.ArrayList(u8).init(allocator); defer cstr_list.deinit(); var buf: [4]u8 = undefined; - for (d_list.items) |cp| { + for (dcps) |cp| { + if (cp == tombstone) continue; if (cp == tombstone) continue; // "Delete" const len = try unicode.utf8Encode(cp, &buf); - cstr_list.appendSliceAssumeCapacity(buf[0..len]); + try cstr_list.appendSlice(buf[0..len]); } return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; } - // Otherwise update code points list. - var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted); - defer tmp_d_list.deinit(); - - for (d_list.items) |cp| { - if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp); - } - - d_list.clearRetainingCapacity(); - d_list.appendSliceAssumeCapacity(tmp_d_list.items); + // We're still not finished, so create a new + // code point sequence with the marked code points + // removed. + // var tmp_list = try std.ArrayList(u21).initCapacity(allocator, dcps.len - deleted); + // defer tmp_list.deinit(); + // + // for (dcps) |cp| { + // if (cp != tombstone) tmp_list.appendAssumeCapacity(cp); + // } + // + // allocator.free(dcps); + // dcps = try tmp_list.toOwnedSlice(); } } -- cgit v1.2.3