//! Normalizer contains functions and methods that implement //! Unicode Normalization. You can normalize strings into NFC, //! NFKC, NFD, and NFKD normalization forms. const std = @import("std"); const testing = std.testing; const CodePointIterator = @import("code_point").Iterator; const norm_props = @import("ziglyph").normalization_props; pub const NormData = @import("NormData"); norm_data: *NormData, const Self = @This(); // Hangul processing utilities. fn isHangulPrecomposed(self: Self, cp: u21) bool { const kind = self.norm_data.hangul_data.syllable(cp); return kind == .LV or kind == .LVT; } const SBase: u21 = 0xAC00; const LBase: u21 = 0x1100; const VBase: u21 = 0x1161; const TBase: u21 = 0x11A7; const LCount: u21 = 19; const VCount: u21 = 21; const TCount: u21 = 28; const NCount: u21 = 588; // VCount * TCount const SCount: u21 = 11172; // LCount * NCount fn decomposeHangul(cp: u21) [3]u21 { const SIndex: u21 = cp - SBase; const LIndex: u21 = SIndex / NCount; const VIndex: u21 = (SIndex % NCount) / TCount; const TIndex: u21 = SIndex % TCount; const LPart: u21 = LBase + LIndex; const VPart: u21 = VBase + VIndex; var TPart: u21 = 0; if (TIndex != 0) TPart = TBase + TIndex; return [3]u21{ LPart, VPart, TPart }; } fn composeHangulCanon(lv: u21, t: u21) u21 { std.debug.assert(0x11A8 <= t and t <= 0x11C2); return lv + (t - TBase); } fn composeHangulFull(l: u21, v: u21, t: u21) u21 { std.debug.assert(0x1100 <= l and l <= 0x1112); std.debug.assert(0x1161 <= v and v <= 0x1175); const LIndex = l - LBase; const VIndex = v - VBase; const LVIndex = LIndex * NCount + VIndex * TCount; if (t == 0) return SBase + LVIndex; std.debug.assert(0x11A8 <= t and t <= 0x11C2); const TIndex = t - TBase; return SBase + LVIndex + TIndex; } const Form = enum { nfc, nfd, nfkc, nfkd, same, }; const Decomp = struct { form: Form = .nfd, cps: [18]u21 = [_]u21{0} ** 18, }; /// `mapping` retrieves the decomposition mapping for a code point as per the UCD. pub fn mapping(self: Self, cp: u21, form: Form) Decomp { std.debug.assert(form == .nfd or form == .nfkd); var dc = Decomp{ .form = .nfd }; const canon_dc = self.norm_data.canon_data.toNfd(cp); const len: usize = if (canon_dc[1] == 0) 1 else 2; if (len == 1 and canon_dc[0] == cp) { dc.form = .same; dc.cps[0] = cp; } else { @memcpy(dc.cps[0..len], canon_dc[0..len]); } const compat_dc = self.norm_data.compat_data.toNfkd(cp); if (compat_dc.len != 0) { if (form != .nfd) { dc.form = .nfkd; @memcpy(dc.cps[0..compat_dc.len], compat_dc); } } return dc; } /// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. pub fn decompose(self: Self, cp: u21, form: Form) Decomp { std.debug.assert(form == .nfd or form == .nfkd); var dc = Decomp{ .form = form }; // ASCII or NFD / NFKD quick checks. if (cp <= 127 or (form == .nfd and norm_props.isNfd(cp)) or (form == .nfkd and norm_props.isNfkd(cp))) { dc.cps[0] = cp; return dc; } // Hangul precomposed syllable full decomposition. if (self.isHangulPrecomposed(cp)) { const cps = decomposeHangul(cp); @memcpy(dc.cps[0..cps.len], &cps); return dc; } // Full decomposition. var result_index: usize = 0; var work_index: usize = 1; // Start work with argument code point. var work = [_]u21{cp} ++ [_]u21{0} ** 17; while (work_index > 0) { // Look at previous code point in work queue. work_index -= 1; const next = work[work_index]; const m = self.mapping(next, form); // No more of decompositions for this code point. if (m.form == .same) { dc.cps[result_index] = m.cps[0]; result_index += 1; continue; } // Find last index of decomposition. const m_last = for (m.cps, 0..) |mcp, i| { if (mcp == 0) break i; } else m.cps.len; // Work backwards through decomposition. // `i` starts at 1 because m_last is 1 past the last code point. var i: usize = 1; while (i <= m_last) : ({ i += 1; work_index += 1; }) { work[work_index] = m.cps[m_last - i]; } } return dc; } test "decompose" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var dc = n.decompose('é', .nfd); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); dc = n.decompose('\u{1e0a}', .nfd); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); dc = n.decompose('\u{1e0a}', .nfkd); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); dc = n.decompose('\u{3189}', .nfd); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]); dc = n.decompose('\u{3189}', .nfkd); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); dc = n.decompose('\u{ace1}', .nfd); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); dc = n.decompose('\u{ace1}', .nfkd); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); dc = n.decompose('\u{3d3}', .nfd); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); dc = n.decompose('\u{3d3}', .nfkd); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); } // Some quick checks. fn onlyAscii(str: []const u8) bool { return for (str) |b| { if (b > 127) break false; } else true; } fn onlyLatin1(str: []const u8) bool { var cp_iter = CodePointIterator{ .bytes = str }; return while (cp_iter.next()) |cp| { if (cp.code > 256) break false; } else true; } /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. pub const Result = struct { allocator: ?std.mem.Allocator = null, slice: []const u8, pub fn deinit(self: *Result) void { if (self.allocator) |allocator| allocator.free(self.slice); } }; // Compares code points by Canonical Combining Class order. fn cccLess(self: Self, lhs: u21, rhs: u21) bool { return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); } // Applies the Canonical Sorting Algorithm. fn canonicalSort(self: Self, cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} std.mem.sort(u21, cps[start..i], self, cccLess); } } /// Normalize `str` to NFD. pub fn nfd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { return self.nfxd(allocator, str, .nfd); } /// Normalize `str` to NFKD. pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { return self.nfxd(allocator, str, .nfkd); } fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { // Quick checks. if (onlyAscii(str)) return Result{ .slice = str }; var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2); defer dcp_list.deinit(); var cp_iter = CodePointIterator{ .bytes = str }; while (cp_iter.next()) |cp| { const dc = self.decompose(cp.code, form); const slice = for (dc.cps, 0..) |dcp, i| { if (dcp == 0) break dc.cps[0..i]; } else dc.cps[0..]; try dcp_list.appendSlice(slice); } self.canonicalSort(dcp_list.items); var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); defer dstr_list.deinit(); var buf: [4]u8 = undefined; for (dcp_list.items) |dcp| { const len = try std.unicode.utf8Encode(dcp, &buf); dstr_list.appendSliceAssumeCapacity(buf[0..len]); } return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; } test "nfd ASCII / no-alloc" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var result = try n.nfd(allocator, "Hello World!"); defer result.deinit(); try std.testing.expectEqualStrings("Hello World!", result.slice); } test "nfd !ASCII / alloc" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(); try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); } test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var result = try n.nfkd(allocator, "Hello World!"); defer result.deinit(); try std.testing.expectEqualStrings("Hello World!", result.slice); } test "nfkd !ASCII / alloc" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(); try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); } // Composition utilities. fn isHangul(self: Self, cp: u21) bool { return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; } fn isNonHangulStarter(self: Self, cp: u21) bool { return !self.isHangul(cp) and self.norm_data.ccc_data.isStarter(cp); } /// Normalizes `str` to NFC. pub fn nfc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { return self.nfxc(allocator, str, .nfc); } /// Normalizes `str` to NFKC. pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { return self.nfxc(allocator, str, .nfkc); } fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { // Quick checks. if (onlyAscii(str)) return Result{ .slice = str }; if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str }; // Decompose first. var d_result = if (form == .nfc) try self.nfd(allocator, str) else try self.nfkd(allocator, str); defer d_result.deinit(); // Get code points. var cp_iter = CodePointIterator{ .bytes = d_result.slice }; var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len); defer d_list.deinit(); while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code); // Compose const tombstone = 0xe000; // Start of BMP Private Use Area while (true) { var i: usize = 1; // start at second code point. var deleted: usize = 0; block_check: while (i < d_list.items.len) : (i += 1) { const C = d_list.items[i]; const cc_C = self.norm_data.ccc_data.ccc(C); var starter_index: ?usize = null; var j: usize = i; while (true) { j -= 1; // Check for starter. if (self.norm_data.ccc_data.isStarter(d_list.items[j])) { if (i - j > 1) { // If there's distance between the starting point and the current position. for (d_list.items[(j + 1)..i]) |B| { const cc_B = self.norm_data.ccc_data.ccc(B); // Check for blocking conditions. if (self.isHangul(C)) { if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; } if (cc_B >= cc_C) continue :block_check; } } // Found starter at j. starter_index = j; break; } if (j == 0) break; } if (starter_index) |sidx| { const L = d_list.items[sidx]; var processed_hangul = false; if (self.isHangul(L) and self.isHangul(C)) { const l_stype = self.norm_data.hangul_data.syllable(L); const c_stype = self.norm_data.hangul_data.syllable(C); if (l_stype == .LV and c_stype == .T) { // LV, T d_list.items[sidx] = composeHangulCanon(L, C); d_list.items[i] = tombstone; // Mark for deletion. processed_hangul = true; } if (l_stype == .L and c_stype == .V) { // Handle L, V. L, V, T is handled via main loop. d_list.items[sidx] = composeHangulFull(L, C, 0); d_list.items[i] = tombstone; // Mark for deletion. processed_hangul = true; } if (processed_hangul) deleted += 1; } if (!processed_hangul) { // L -> C not Hangul. if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { if (!norm_props.isFcx(P)) { d_list.items[sidx] = P; d_list.items[i] = tombstone; // Mark for deletion. deleted += 1; } } } } } // Check if finished. if (deleted == 0) { var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4); defer cstr_list.deinit(); var buf: [4]u8 = undefined; for (d_list.items) |cp| { if (cp == tombstone) continue; // "Delete" const len = try std.unicode.utf8Encode(cp, &buf); cstr_list.appendSliceAssumeCapacity(buf[0..len]); } return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; } // Otherwise update code points list. var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted); defer tmp_d_list.deinit(); for (d_list.items) |cp| { if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp); } d_list.clearRetainingCapacity(); d_list.appendSliceAssumeCapacity(tmp_d_list.items); } } test "nfc" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); defer result.deinit(); try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); } test "nfkc" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); defer result.deinit(); try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); } /// Tests for equality of `a` and `b` after normalizing to NFD. pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { var norm_result_a = try self.nfd(allocator, a); defer norm_result_a.deinit(); var norm_result_b = try self.nfd(allocator, b); defer norm_result_b.deinit(); return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice); } test "eql" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); } // FCD fn getLeadCcc(self: Self, cp: u21) u8 { const dc = self.mapping(cp, .nfd); return self.norm_data.ccc_data.ccc(dc.cps[0]); } fn getTrailCcc(self: Self, cp: u21) u8 { const dc = self.mapping(cp, .nfd); const len = for (dc.cps, 0..) |dcp, i| { if (dcp == 0) break i; } else dc.cps.len; return self.norm_data.ccc_data.ccc(dc.cps[len - 1]); } /// Fast check to detect if a string is already in NFC or NFD form. pub fn isFcd(self: Self, str: []const u8) bool { var prev_ccc: u8 = 0; var cp_iter = CodePointIterator{ .bytes = str }; return while (cp_iter.next()) |cp| { const ccc = self.getLeadCcc(cp.code); if (ccc != 0 and ccc < prev_ccc) break false; prev_ccc = self.getTrailCcc(cp.code); } else true; } test "isFcd" { const allocator = testing.allocator; var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; const is_nfc = "José \u{3D3}"; try std.testing.expect(n.isFcd(is_nfc)); const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; try std.testing.expect(n.isFcd(is_nfd)); const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; try std.testing.expect(!n.isFcd(not_fcd)); } test "Unicode normalization tests" { var arena = std.heap.ArenaAllocator.init(std.testing.allocator); defer arena.deinit(); var allocator = arena.allocator(); var data = try NormData.init(allocator); defer data.deinit(); var n = Self{ .norm_data = &data }; var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); defer file.close(); var buf_reader = std.io.bufferedReader(file.reader()); const input_stream = buf_reader.reader(); var line_no: usize = 0; var buf: [4096]u8 = undefined; var cp_buf: [4]u8 = undefined; while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { line_no += 1; // Skip comments or empty lines. if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; // Iterate over fields. var fields = std.mem.split(u8, line, ";"); var field_index: usize = 0; var input: []u8 = undefined; defer allocator.free(input); while (fields.next()) |field| : (field_index += 1) { if (field_index == 0) { var i_buf = std.ArrayList(u8).init(allocator); defer i_buf.deinit(); var i_fields = std.mem.split(u8, field, " "); while (i_fields.next()) |s| { const icp = try std.fmt.parseInt(u21, s, 16); const len = try std.unicode.utf8Encode(icp, &cp_buf); try i_buf.appendSlice(cp_buf[0..len]); } input = try i_buf.toOwnedSlice(); } else if (field_index == 1) { //std.debug.print("\n*** {s} ***\n", .{line}); // NFC, time to test. var w_buf = std.ArrayList(u8).init(allocator); defer w_buf.deinit(); var w_fields = std.mem.split(u8, field, " "); while (w_fields.next()) |s| { const wcp = try std.fmt.parseInt(u21, s, 16); const len = try std.unicode.utf8Encode(wcp, &cp_buf); try w_buf.appendSlice(cp_buf[0..len]); } const want = w_buf.items; var got = try n.nfc(allocator, input); defer got.deinit(); try std.testing.expectEqualStrings(want, got.slice); } else if (field_index == 2) { // NFD, time to test. var w_buf = std.ArrayList(u8).init(allocator); defer w_buf.deinit(); var w_fields = std.mem.split(u8, field, " "); while (w_fields.next()) |s| { const wcp = try std.fmt.parseInt(u21, s, 16); const len = try std.unicode.utf8Encode(wcp, &cp_buf); try w_buf.appendSlice(cp_buf[0..len]); } const want = w_buf.items; var got = try n.nfd(allocator, input); defer got.deinit(); try std.testing.expectEqualStrings(want, got.slice); } else if (field_index == 3) { // NFKC, time to test. var w_buf = std.ArrayList(u8).init(allocator); defer w_buf.deinit(); var w_fields = std.mem.split(u8, field, " "); while (w_fields.next()) |s| { const wcp = try std.fmt.parseInt(u21, s, 16); const len = try std.unicode.utf8Encode(wcp, &cp_buf); try w_buf.appendSlice(cp_buf[0..len]); } const want = w_buf.items; var got = try n.nfkc(allocator, input); defer got.deinit(); try std.testing.expectEqualStrings(want, got.slice); } else if (field_index == 4) { // NFKD, time to test. var w_buf = std.ArrayList(u8).init(allocator); defer w_buf.deinit(); var w_fields = std.mem.split(u8, field, " "); while (w_fields.next()) |s| { const wcp = try std.fmt.parseInt(u21, s, 16); const len = try std.unicode.utf8Encode(wcp, &cp_buf); try w_buf.appendSlice(cp_buf[0..len]); } const want = w_buf.items; var got = try n.nfkd(allocator, input); defer got.deinit(); try std.testing.expectEqualStrings(want, got.slice); } else { continue; } } } }