| author | 2024-02-20 09:13:36 -0400 |
|---|---|
| committer | 2024-02-20 09:13:36 -0400 |
| commit | 134a7df8206aa66ca7b0abbc7f31f08410b502d2 (patch) |
| tree | e3c520dce4f529de5290bc145847e78fb5583bee /src/Normalizer.zig |
| parent | Cleaned up directory structure (diff) |
| download | zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.tar.gz, zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.tar.xz, zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.zip |
Replaced ccc_map with table. 20ms faster
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 863 |
1 file changed, 863 insertions, 0 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
new file mode 100644
index 0000000..1b4a2d5
--- /dev/null
+++ b/src/Normalizer.zig
@@ -0,0 +1,863 @@
| 1 | //! Normalizer contains functions and methods that implement Unicode Normalization algorithms. You can normalize strings | ||
| 2 | //! into NFC, NFKC, NFD, and NFKD normalization forms (see `nfc`, `nfkc`, `nfd`, and `nfkd`). You can also test for | ||
| 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). | ||
| 4 | |||
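A minimal usage sketch of the API described above, written in the file's own inline-test style (it assumes `std.testing.allocator`, as the tests further down do):

test "usage sketch" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    // "e" followed by U+0301 (combining acute) composes to the single code point "é".
    var result = try n.nfc(allocator, "e\u{301}");
    defer result.deinit();

    try std.testing.expectEqualStrings("é", result.slice);
}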
| 5 | const std = @import("std"); | ||
| 6 | |||
| 7 | const CodePointIterator = @import("code_point").Iterator; | ||
| 8 | const case_fold_map = @import("ziglyph").case_folding; | ||
| 9 | const hangul_map = @import("ziglyph").hangul; | ||
| 10 | const norm_props = @import("ziglyph").normalization_props; | ||
| 11 | const normp = @import("normp"); | ||
| 12 | |||
| 13 | const Self = @This(); | ||
| 14 | |||
| 15 | nfc_map: std.AutoHashMap([2]u21, u21), | ||
| 16 | nfd_map: std.AutoHashMap(u21, [2]u21), | ||
| 17 | nfkd_map: std.AutoHashMap(u21, [18]u21), | ||
| 18 | |||
| 19 | pub fn init(allocator: std.mem.Allocator) !Self { | ||
| 20 | var self = Self{ | ||
| 21 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | ||
| 22 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), | ||
| 23 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | ||
| 24 | }; | ||
| 25 | errdefer self.deinit(); | ||
| 26 | |||
| 27 | // Canonical compositions | ||
| 28 | const decompressor = std.compress.deflate.decompressor; | ||
| 29 | const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate"); | ||
| 30 | var comp_stream = std.io.fixedBufferStream(comp_file); | ||
| 31 | var comp_decomp = try decompressor(allocator, comp_stream.reader(), null); | ||
| 32 | defer comp_decomp.deinit(); | ||
| 33 | |||
| 34 | var comp_buf = std.io.bufferedReader(comp_decomp.reader()); | ||
| 35 | const comp_reader = comp_buf.reader(); | ||
| 36 | var buf: [4096]u8 = undefined; | ||
| 37 | |||
| 38 | while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 39 | if (line.len == 0) continue; | ||
| 40 | var fields = std.mem.split(u8, line, ";"); | ||
| 41 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 42 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 43 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 44 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); | ||
| 45 | } | ||
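For reference, a hypothetical sample entry; the on-disk format is assumed from the parser above, not taken from the actual autogen data:

// Hypothetical sample line, assuming the autogen format matches the parser
// above: three semicolon-separated hex code points "A;B;C", meaning A and B
// compose canonically to C.
//
//     0041;0301;00C1
//
// This would yield nfc_map.put(.{ 0x0041, 0x0301 }, 0x00C1), i.e.
// 'A' + combining acute accent -> 'Á'.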
| 46 | |||
| 47 | // Canonical decompositions | ||
| 48 | const decomp_file = @embedFile("autogen/canonical_decompositions.txt.deflate"); | ||
| 49 | var decomp_stream = std.io.fixedBufferStream(decomp_file); | ||
| 50 | var decomp_decomp = try decompressor(allocator, decomp_stream.reader(), null); | ||
| 51 | defer decomp_decomp.deinit(); | ||
| 52 | |||
| 53 | var decomp_buf = std.io.bufferedReader(decomp_decomp.reader()); | ||
| 54 | const decomp_reader = decomp_buf.reader(); | ||
| 55 | |||
| 56 | while (try decomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 57 | if (line.len == 0) continue; | ||
| 58 | var fields = std.mem.split(u8, line, ";"); | ||
| 59 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 60 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 61 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 62 | try self.nfd_map.put(cp_a, .{ cp_b, cp_c }); | ||
| 63 | } | ||
| 64 | |||
| 65 | // Compatibility decompositions | ||
| 66 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); | ||
| 67 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); | ||
| 68 | var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null); | ||
| 69 | defer dekomp_decomp.deinit(); | ||
| 70 | |||
| 71 | var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); | ||
| 72 | const dekomp_reader = dekomp_buf.reader(); | ||
| 73 | |||
| 74 | while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 75 | if (line.len == 0) continue; | ||
| 76 | var fields = std.mem.split(u8, line, ";"); | ||
| 77 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 78 | var cps = [_]u21{0} ** 18; | ||
| 79 | var i: usize = 0; | ||
| 80 | |||
| 81 | while (fields.next()) |cp| : (i += 1) { | ||
| 82 | cps[i] = try std.fmt.parseInt(u21, cp, 16); | ||
| 83 | } | ||
| 84 | |||
| 85 | try self.nfkd_map.put(cp_a, cps); | ||
| 86 | } | ||
| 87 | |||
| 88 | return self; | ||
| 89 | } | ||
| 90 | |||
| 91 | pub fn deinit(self: *Self) void { | ||
| 92 | self.nfc_map.deinit(); | ||
| 93 | self.nfd_map.deinit(); | ||
| 94 | self.nfkd_map.deinit(); | ||
| 95 | } | ||
| 96 | |||
| 97 | test "init / deinit" { | ||
| 98 | var n = try init(std.testing.allocator); | ||
| 99 | defer n.deinit(); | ||
| 100 | } | ||
| 101 | |||
| 102 | // Hangul processing utilities. | ||
| 103 | fn isHangulPrecomposed(cp: u21) bool { | ||
| 104 | if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT; | ||
| 105 | return false; | ||
| 106 | } | ||
| 107 | |||
| 108 | const SBase: u21 = 0xAC00; | ||
| 109 | const LBase: u21 = 0x1100; | ||
| 110 | const VBase: u21 = 0x1161; | ||
| 111 | const TBase: u21 = 0x11A7; | ||
| 112 | const LCount: u21 = 19; | ||
| 113 | const VCount: u21 = 21; | ||
| 114 | const TCount: u21 = 28; | ||
| 115 | const NCount: u21 = 588; // VCount * TCount | ||
| 116 | const SCount: u21 = 11172; // LCount * NCount | ||
| 117 | |||
| 118 | fn decomposeHangul(cp: u21) [3]u21 { | ||
| 119 | const SIndex: u21 = cp - SBase; | ||
| 120 | const LIndex: u21 = SIndex / NCount; | ||
| 121 | const VIndex: u21 = (SIndex % NCount) / TCount; | ||
| 122 | const TIndex: u21 = SIndex % TCount; | ||
| 123 | const LPart: u21 = LBase + LIndex; | ||
| 124 | const VPart: u21 = VBase + VIndex; | ||
| 125 | var TPart: u21 = 0; | ||
| 126 | if (TIndex != 0) TPart = TBase + TIndex; | ||
| 127 | |||
| 128 | return [3]u21{ LPart, VPart, TPart }; | ||
| 129 | } | ||
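A worked example of the arithmetic above for U+AC01 (각), as an illustrative test sketch:

test "decomposeHangul worked example" {
    // U+AC01: SIndex = 1, so LIndex = 0, VIndex = 0, TIndex = 1, giving
    // L = U+1100, V = U+1161, T = U+11A7 + 1 = U+11A8.
    const parts = decomposeHangul(0xAC01);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 0x1100, 0x1161, 0x11A8 }, &parts);
}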
| 130 | |||
| 131 | fn composeHangulCanon(lv: u21, t: u21) u21 { | ||
| 132 | std.debug.assert(0x11A8 <= t and t <= 0x11C2); | ||
| 133 | return lv + (t - TBase); | ||
| 134 | } | ||
| 135 | |||
| 136 | fn composeHangulFull(l: u21, v: u21, t: u21) u21 { | ||
| 137 | std.debug.assert(0x1100 <= l and l <= 0x1112); | ||
| 138 | std.debug.assert(0x1161 <= v and v <= 0x1175); | ||
| 139 | const LIndex = l - LBase; | ||
| 140 | const VIndex = v - VBase; | ||
| 141 | const LVIndex = LIndex * NCount + VIndex * TCount; | ||
| 142 | |||
| 143 | if (t == 0) return SBase + LVIndex; | ||
| 144 | |||
| 145 | std.debug.assert(0x11A8 <= t and t <= 0x11C2); | ||
| 146 | const TIndex = t - TBase; | ||
| 147 | |||
| 148 | return SBase + LVIndex + TIndex; | ||
| 149 | } | ||
| 150 | |||
| 151 | const Form = enum { | ||
| 152 | nfc, | ||
| 153 | nfd, | ||
| 154 | nfkc, | ||
| 155 | nfkd, | ||
| 156 | same, | ||
| 157 | }; | ||
| 158 | |||
| 159 | const Decomp = struct { | ||
| 160 | form: Form = .nfd, | ||
| 161 | cps: [18]u21 = [_]u21{0} ** 18, | ||
| 162 | }; | ||
| 163 | |||
| 164 | /// `mapping` retrieves the single-level decomposition mapping for a code point as per the UCD; see `decompose` for full, recursive decomposition. | ||
| 165 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { | ||
| 166 | std.debug.assert(form == .nfd or form == .nfkd); | ||
| 167 | |||
| 168 | var dc = Decomp{ .form = .same }; | ||
| 169 | dc.cps[0] = cp; | ||
| 170 | |||
| 171 | if (self.nfkd_map.get(cp)) |array| { | ||
| 172 | if (form != .nfd) { | ||
| 173 | dc.form = .nfkd; | ||
| 174 | @memcpy(dc.cps[0..array.len], &array); | ||
| 175 | } | ||
| 176 | } else if (self.nfd_map.get(cp)) |array| { | ||
| 177 | dc.form = .nfd; | ||
| 178 | @memcpy(dc.cps[0..array.len], &array); | ||
| 179 | } | ||
| 180 | |||
| 181 | return dc; | ||
| 182 | } | ||
| 183 | |||
| 184 | /// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. | ||
| 185 | pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | ||
| 186 | std.debug.assert(form == .nfd or form == .nfkd); | ||
| 187 | |||
| 188 | var dc = Decomp{ .form = form }; | ||
| 189 | |||
| 190 | // ASCII or NFD / NFKD quick checks. | ||
| 191 | if (cp <= 127 or (form == .nfd and norm_props.isNfd(cp)) or (form == .nfkd and norm_props.isNfkd(cp))) { | ||
| 192 | dc.cps[0] = cp; | ||
| 193 | return dc; | ||
| 194 | } | ||
| 195 | |||
| 196 | // Hangul precomposed syllable full decomposition. | ||
| 197 | if (isHangulPrecomposed(cp)) { | ||
| 198 | const cps = decomposeHangul(cp); | ||
| 199 | @memcpy(dc.cps[0..cps.len], &cps); | ||
| 200 | return dc; | ||
| 201 | } | ||
| 202 | |||
| 203 | // Full decomposition. | ||
| 204 | var result_index: usize = 0; | ||
| 205 | var work_index: usize = 1; | ||
| 206 | |||
| 207 | // Start work with argument code point. | ||
| 208 | var work = [_]u21{cp} ++ [_]u21{0} ** 17; | ||
| 209 | |||
| 210 | while (work_index > 0) { | ||
| 211 | // Look at previous code point in work queue. | ||
| 212 | work_index -= 1; | ||
| 213 | const next = work[work_index]; | ||
| 214 | const m = self.mapping(next, form); | ||
| 215 | |||
| 216 | // No more decompositions for this code point. | ||
| 217 | if (m.form == .same) { | ||
| 218 | dc.cps[result_index] = m.cps[0]; | ||
| 219 | result_index += 1; | ||
| 220 | continue; | ||
| 221 | } | ||
| 222 | |||
| 223 | // Find last index of decomposition. | ||
| 224 | const m_last = for (m.cps, 0..) |mcp, i| { | ||
| 225 | if (mcp == 0) break i; | ||
| 226 | } else m.cps.len; | ||
| 227 | |||
| 228 | // Work backwards through decomposition. | ||
| 229 | // `i` starts at 1 because m_last is 1 past the last code point. | ||
| 230 | var i: usize = 1; | ||
| 231 | while (i <= m_last) : ({ | ||
| 232 | i += 1; | ||
| 233 | work_index += 1; | ||
| 234 | }) { | ||
| 235 | work[work_index] = m.cps[m_last - i]; | ||
| 236 | } | ||
| 237 | } | ||
| 238 | |||
| 239 | return dc; | ||
| 240 | } | ||
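An illustrative test sketch contrasting the single-level `mapping` with the recursive `decompose` (it assumes, per the data layout above, that a code point with a purely canonical decomposition such as U+01D5 is present only in `nfd_map`):

test "mapping vs decompose" {
    var n = try init(std.testing.allocator);
    defer n.deinit();

    // Single-level UCD mapping: U+01D5 -> U+00DC U+0304.
    const m = n.mapping('\u{1d5}', .nfd);
    try std.testing.expect(m.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{dc}', '\u{304}' }, m.cps[0..2]);

    // Full decomposition also expands U+00DC, yielding U+0055 U+0308 U+0304.
    const d = n.decompose('\u{1d5}', .nfd);
    try std.testing.expect(d.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'U', '\u{308}', '\u{304}' }, d.cps[0..3]);
}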
| 241 | |||
| 242 | test "decompose" { | ||
| 243 | const allocator = std.testing.allocator; | ||
| 244 | var n = try init(allocator); | ||
| 245 | defer n.deinit(); | ||
| 246 | |||
| 247 | var dc = n.decompose('é', .nfd); | ||
| 248 | try std.testing.expect(dc.form == .nfd); | ||
| 249 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); | ||
| 250 | |||
| 251 | dc = n.decompose('\u{1e0a}', .nfd); | ||
| 252 | try std.testing.expect(dc.form == .nfd); | ||
| 253 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | ||
| 254 | |||
| 255 | dc = n.decompose('\u{1e0a}', .nfkd); | ||
| 256 | try std.testing.expect(dc.form == .nfkd); | ||
| 257 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | ||
| 258 | |||
| 259 | dc = n.decompose('\u{3189}', .nfd); | ||
| 260 | try std.testing.expect(dc.form == .nfd); | ||
| 261 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]); | ||
| 262 | |||
| 263 | dc = n.decompose('\u{3189}', .nfkd); | ||
| 264 | try std.testing.expect(dc.form == .nfkd); | ||
| 265 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); | ||
| 266 | |||
| 267 | dc = n.decompose('\u{ace1}', .nfd); | ||
| 268 | try std.testing.expect(dc.form == .nfd); | ||
| 269 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | ||
| 270 | |||
| 271 | dc = n.decompose('\u{ace1}', .nfkd); | ||
| 272 | try std.testing.expect(dc.form == .nfkd); | ||
| 273 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | ||
| 274 | |||
| 275 | dc = n.decompose('\u{3d3}', .nfd); | ||
| 276 | try std.testing.expect(dc.form == .nfd); | ||
| 277 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); | ||
| 278 | |||
| 279 | dc = n.decompose('\u{3d3}', .nfkd); | ||
| 280 | try std.testing.expect(dc.form == .nfkd); | ||
| 281 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); | ||
| 282 | } | ||
| 283 | |||
| 284 | // Some quick checks. | ||
| 285 | |||
| 286 | fn onlyAscii(str: []const u8) bool { | ||
| 287 | return for (str) |b| { | ||
| 288 | if (b > 127) break false; | ||
| 289 | } else true; | ||
| 290 | } | ||
| 291 | |||
| 292 | fn onlyLatin1(str: []const u8) bool { | ||
| 293 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 294 | return while (cp_iter.next()) |cp| { | ||
| 295 | if (cp.code > 255) break false; | ||
| 296 | } else true; | ||
| 297 | } | ||
| 298 | |||
| 299 | /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. | ||
| 300 | pub const Result = struct { | ||
| 301 | allocator: ?std.mem.Allocator = null, | ||
| 302 | slice: []const u8, | ||
| 303 | |||
| 304 | pub fn deinit(self: *Result) void { | ||
| 305 | if (self.allocator) |allocator| allocator.free(self.slice); | ||
| 306 | } | ||
| 307 | }; | ||
| 308 | |||
| 309 | // Compares code points by Canonical Combining Class order. | ||
| 310 | fn cccLess(_: void, lhs: u21, rhs: u21) bool { | ||
| 311 | const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; | ||
| 312 | const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)]; | ||
| 313 | return lcc < rcc; | ||
| 314 | } | ||
| 315 | |||
| 316 | // Applies the Canonical Sorting Algorithm. | ||
| 317 | fn canonicalSort(cps: []u21) void { | ||
| 318 | var i: usize = 0; | ||
| 319 | while (i < cps.len) : (i += 1) { | ||
| 320 | const start: usize = i; | ||
| 321 | while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} | ||
| 322 | std.mem.sort(u21, cps[start..i], {}, cccLess); | ||
| 323 | } | ||
| 324 | } | ||
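A small illustrative test of the sort, relying only on the standard combining class values 230 for U+0301 and 220 for U+0316:

test "canonicalSort" {
    // U+0301 (ccc 230) and U+0316 (ccc 220) are reordered into ascending
    // ccc order; the starter 'a' (ccc 0) stays put.
    var cps = [_]u21{ 'a', 0x301, 0x316 };
    canonicalSort(&cps);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'a', 0x316, 0x301 }, &cps);
}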
| 325 | |||
| 326 | /// Normalize `str` to NFD. | ||
| 327 | pub fn nfd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 328 | return self.nfxd(allocator, str, .nfd); | ||
| 329 | } | ||
| 330 | |||
| 331 | /// Normalize `str` to NFKD. | ||
| 332 | pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 333 | return self.nfxd(allocator, str, .nfkd); | ||
| 334 | } | ||
| 335 | |||
| 336 | fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | ||
| 337 | // Quick checks. | ||
| 338 | if (onlyAscii(str)) return Result{ .slice = str }; | ||
| 339 | |||
| 340 | var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2); | ||
| 341 | defer dcp_list.deinit(); | ||
| 342 | |||
| 343 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 344 | while (cp_iter.next()) |cp| { | ||
| 345 | const dc = self.decompose(cp.code, form); | ||
| 346 | const slice = for (dc.cps, 0..) |dcp, i| { | ||
| 347 | if (dcp == 0) break dc.cps[0..i]; | ||
| 348 | } else dc.cps[0..]; | ||
| 349 | try dcp_list.appendSlice(slice); | ||
| 350 | } | ||
| 351 | |||
| 352 | canonicalSort(dcp_list.items); | ||
| 353 | |||
| 354 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); | ||
| 355 | defer dstr_list.deinit(); | ||
| 356 | |||
| 357 | var buf: [4]u8 = undefined; | ||
| 358 | for (dcp_list.items) |dcp| { | ||
| 359 | const len = try std.unicode.utf8Encode(dcp, &buf); | ||
| 360 | dstr_list.appendSliceAssumeCapacity(buf[0..len]); | ||
| 361 | } | ||
| 362 | |||
| 363 | return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; | ||
| 364 | } | ||
| 365 | |||
| 366 | test "nfd ASCII / no-alloc" { | ||
| 367 | const allocator = std.testing.allocator; | ||
| 368 | var n = try init(allocator); | ||
| 369 | defer n.deinit(); | ||
| 370 | |||
| 371 | var result = try n.nfd(allocator, "Hello World!"); | ||
| 372 | defer result.deinit(); | ||
| 373 | |||
| 374 | try std.testing.expectEqualStrings("Hello World!", result.slice); | ||
| 375 | } | ||
| 376 | |||
| 377 | test "nfd !ASCII / alloc" { | ||
| 378 | const allocator = std.testing.allocator; | ||
| 379 | var n = try init(allocator); | ||
| 380 | defer n.deinit(); | ||
| 381 | |||
| 382 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | ||
| 383 | defer result.deinit(); | ||
| 384 | |||
| 385 | try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); | ||
| 386 | } | ||
| 387 | |||
| 388 | test "nfkd ASCII / no-alloc" { | ||
| 389 | const allocator = std.testing.allocator; | ||
| 390 | var n = try init(allocator); | ||
| 391 | defer n.deinit(); | ||
| 392 | |||
| 393 | var result = try n.nfkd(allocator, "Hello World!"); | ||
| 394 | defer result.deinit(); | ||
| 395 | |||
| 396 | try std.testing.expectEqualStrings("Hello World!", result.slice); | ||
| 397 | } | ||
| 398 | |||
| 399 | test "nfkd !ASCII / alloc" { | ||
| 400 | const allocator = std.testing.allocator; | ||
| 401 | var n = try init(allocator); | ||
| 402 | defer n.deinit(); | ||
| 403 | |||
| 404 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | ||
| 405 | defer result.deinit(); | ||
| 406 | |||
| 407 | try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | ||
| 408 | } | ||
| 409 | |||
| 410 | // Composition utilities. | ||
| 411 | |||
| 412 | fn isHangul(cp: u21) bool { | ||
| 413 | return cp >= 0x1100 and hangul_map.syllableType(cp) != null; | ||
| 414 | } | ||
| 415 | |||
| 416 | fn isStarter(cp: u21) bool { | ||
| 417 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; | ||
| 418 | } | ||
| 419 | |||
| 420 | fn isCombining(cp: u21) bool { | ||
| 421 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0; | ||
| 422 | } | ||
| 423 | |||
| 424 | fn isNonHangulStarter(cp: u21) bool { | ||
| 425 | return !isHangul(cp) and isStarter(cp); | ||
| 426 | } | ||
| 427 | |||
| 428 | /// Normalizes `str` to NFC. | ||
| 429 | pub fn nfc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 430 | return self.nfxc(allocator, str, .nfc); | ||
| 431 | } | ||
| 432 | |||
| 433 | /// Normalizes `str` to NFKC. | ||
| 434 | pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 435 | return self.nfxc(allocator, str, .nfkc); | ||
| 436 | } | ||
| 437 | |||
| 438 | fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | ||
| 439 | // Quick checks. | ||
| 440 | if (onlyAscii(str)) return Result{ .slice = str }; | ||
| 441 | if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str }; | ||
| 442 | |||
| 443 | // Decompose first. | ||
| 444 | var d_result = if (form == .nfc) | ||
| 445 | try self.nfd(allocator, str) | ||
| 446 | else | ||
| 447 | try self.nfkd(allocator, str); | ||
| 448 | defer d_result.deinit(); | ||
| 449 | |||
| 450 | // Get code points. | ||
| 451 | var cp_iter = CodePointIterator{ .bytes = d_result.slice }; | ||
| 452 | |||
| 453 | var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len); | ||
| 454 | defer d_list.deinit(); | ||
| 455 | |||
| 456 | while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code); | ||
| 457 | |||
| 458 | // Compose | ||
| 459 | const tombstone = 0xe000; // Start of BMP Private Use Area | ||
| 460 | |||
| 461 | while (true) { | ||
| 462 | var i: usize = 1; // start at second code point. | ||
| 463 | var deleted: usize = 0; | ||
| 464 | |||
| 465 | block_check: while (i < d_list.items.len) : (i += 1) { | ||
| 466 | const C = d_list.items[i]; | ||
| 467 | const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; | ||
| 468 | var starter_index: ?usize = null; | ||
| 469 | var j: usize = i; | ||
| 470 | |||
| 471 | while (true) { | ||
| 472 | j -= 1; | ||
| 473 | |||
| 474 | // Check for starter. | ||
| 475 | if (isStarter(d_list.items[j])) { | ||
| 476 | if (i - j > 1) { // If there's distance between the starting point and the current position. | ||
| 477 | for (d_list.items[(j + 1)..i]) |B| { | ||
| 478 | // Check for blocking conditions. | ||
| 479 | if (isHangul(C)) { | ||
| 480 | if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; | ||
| 481 | } | ||
| 482 | const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)]; | ||
| 483 | if (cc_B >= cc_C) continue :block_check; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | |||
| 487 | // Found starter at j. | ||
| 488 | starter_index = j; | ||
| 489 | break; | ||
| 490 | } | ||
| 491 | |||
| 492 | if (j == 0) break; | ||
| 493 | } | ||
| 494 | |||
| 495 | if (starter_index) |sidx| { | ||
| 496 | const L = d_list.items[sidx]; | ||
| 497 | var processed_hangul = false; | ||
| 498 | |||
| 499 | if (isHangul(L) and isHangul(C)) { | ||
| 500 | const l_stype = hangul_map.syllableType(L).?; | ||
| 501 | const c_stype = hangul_map.syllableType(C).?; | ||
| 502 | |||
| 503 | if (l_stype == .LV and c_stype == .T) { | ||
| 504 | // LV, T | ||
| 505 | d_list.items[sidx] = composeHangulCanon(L, C); | ||
| 506 | d_list.items[i] = tombstone; // Mark for deletion. | ||
| 507 | processed_hangul = true; | ||
| 508 | } | ||
| 509 | |||
| 510 | if (l_stype == .L and c_stype == .V) { | ||
| 511 | // Handle L, V. L, V, T is handled via main loop. | ||
| 512 | d_list.items[sidx] = composeHangulFull(L, C, 0); | ||
| 513 | d_list.items[i] = tombstone; // Mark for deletion. | ||
| 514 | processed_hangul = true; | ||
| 515 | } | ||
| 516 | |||
| 517 | if (processed_hangul) deleted += 1; | ||
| 518 | } | ||
| 519 | |||
| 520 | if (!processed_hangul) { | ||
| 521 | // L -> C not Hangul. | ||
| 522 | if (self.nfc_map.get(.{ L, C })) |P| { | ||
| 523 | if (!norm_props.isFcx(P)) { | ||
| 524 | d_list.items[sidx] = P; | ||
| 525 | d_list.items[i] = tombstone; // Mark for deletion. | ||
| 526 | deleted += 1; | ||
| 527 | } | ||
| 528 | } | ||
| 529 | } | ||
| 530 | } | ||
| 531 | } | ||
| 532 | |||
| 533 | // Check if finished. | ||
| 534 | if (deleted == 0) { | ||
| 535 | var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4); | ||
| 536 | defer cstr_list.deinit(); | ||
| 537 | var buf: [4]u8 = undefined; | ||
| 538 | |||
| 539 | for (d_list.items) |cp| { | ||
| 540 | if (cp == tombstone) continue; // "Delete" | ||
| 541 | const len = try std.unicode.utf8Encode(cp, &buf); | ||
| 542 | cstr_list.appendSliceAssumeCapacity(buf[0..len]); | ||
| 543 | } | ||
| 544 | |||
| 545 | return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; | ||
| 546 | } | ||
| 547 | |||
| 548 | // Otherwise update code points list. | ||
| 549 | var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted); | ||
| 550 | defer tmp_d_list.deinit(); | ||
| 551 | |||
| 552 | for (d_list.items) |cp| { | ||
| 553 | if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp); | ||
| 554 | } | ||
| 555 | |||
| 556 | d_list.clearRetainingCapacity(); | ||
| 557 | d_list.appendSliceAssumeCapacity(tmp_d_list.items); | ||
| 558 | } | ||
| 559 | } | ||
| 560 | |||
| 561 | test "nfc" { | ||
| 562 | const allocator = std.testing.allocator; | ||
| 563 | var n = try init(allocator); | ||
| 564 | defer n.deinit(); | ||
| 565 | |||
| 566 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | ||
| 567 | defer result.deinit(); | ||
| 568 | |||
| 569 | try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); | ||
| 570 | } | ||
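The Hangul L+V and LV+T branches above are the inverse of the U+ACE1 case in the `decompose` test; an illustrative sketch of that round trip:

test "nfc Hangul jamo" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfc(allocator, "\u{1100}\u{1169}\u{11a8}");
    defer result.deinit();

    try std.testing.expectEqualStrings("\u{ace1}", result.slice);
}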
| 571 | |||
| 572 | test "nfkc" { | ||
| 573 | const allocator = std.testing.allocator; | ||
| 574 | var n = try init(allocator); | ||
| 575 | defer n.deinit(); | ||
| 576 | |||
| 577 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | ||
| 578 | defer result.deinit(); | ||
| 579 | |||
| 580 | try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | ||
| 581 | } | ||
| 582 | |||
| 583 | /// Tests for equality as per Unicode rules for Identifiers. | ||
| 584 | pub fn eqlIdentifiers(allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | ||
| 585 | var list_a = try std.ArrayList(u21).initCapacity(allocator, a.len); | ||
| 586 | defer list_a.deinit(); | ||
| 587 | var list_b = try std.ArrayList(u21).initCapacity(allocator, b.len); | ||
| 588 | defer list_b.deinit(); | ||
| 589 | |||
| 590 | const Item = struct { | ||
| 591 | str: []const u8, | ||
| 592 | list: *std.ArrayList(u21), | ||
| 593 | }; | ||
| 594 | |||
| 595 | const items = [_]Item{ | ||
| 596 | .{ .str = a, .list = &list_a }, | ||
| 597 | .{ .str = b, .list = &list_b }, | ||
| 598 | }; | ||
| 599 | |||
| 600 | for (items) |item| { | ||
| 601 | var cp_iter = CodePointIterator{ .bytes = item.str }; | ||
| 602 | while (cp_iter.next()) |cp| { | ||
| 603 | if (norm_props.toNfkcCaseFold(cp.code)) |nfkcf| { | ||
| 604 | for (nfkcf) |c| { | ||
| 605 | if (c == 0) break; | ||
| 606 | item.list.appendAssumeCapacity(c); | ||
| 607 | } | ||
| 608 | } else { | ||
| 609 | item.list.appendAssumeCapacity(cp.code); // maps to itself | ||
| 610 | } | ||
| 611 | } | ||
| 612 | } | ||
| 613 | |||
| 614 | return std.mem.eql(u21, list_a.items, list_b.items); | ||
| 615 | } | ||
| 616 | |||
| 617 | test "eqlIdentifiers" { | ||
| 618 | try std.testing.expect(try eqlIdentifiers(std.testing.allocator, "Foé", "foé")); | ||
| 619 | } | ||
| 620 | |||
| 621 | /// Tests for equality of `a` and `b` after normalizing to NFD. | ||
| 622 | pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | ||
| 623 | var norm_result_a = try self.nfd(allocator, a); | ||
| 624 | defer norm_result_a.deinit(); | ||
| 625 | var norm_result_b = try self.nfd(allocator, b); | ||
| 626 | defer norm_result_b.deinit(); | ||
| 627 | |||
| 628 | return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | ||
| 629 | } | ||
| 630 | |||
| 631 | test "eql" { | ||
| 632 | const allocator = std.testing.allocator; | ||
| 633 | var n = try init(allocator); | ||
| 634 | defer n.deinit(); | ||
| 635 | |||
| 636 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | ||
| 637 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | ||
| 638 | } | ||
| 639 | |||
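// U+0345 COMBINING GREEK YPOGEGRAMMENI and the precomposed Greek letters
// whose canonical decompositions contain it: case folding maps U+0345 to
// U+03B9, so these must be decomposed before folding for a stable
// caseless comparison.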
| 640 | fn requiresNfdBeforeCaseFold(cp: u21) bool { | ||
| 641 | return switch (cp) { | ||
| 642 | 0x0345 => true, | ||
| 643 | 0x1F80...0x1FAF => true, | ||
| 644 | 0x1FB2...0x1FB4 => true, | ||
| 645 | 0x1FB7 => true, | ||
| 646 | 0x1FBC => true, | ||
| 647 | 0x1FC2...0x1FC4 => true, | ||
| 648 | 0x1FC7 => true, | ||
| 649 | 0x1FCC => true, | ||
| 650 | 0x1FF2...0x1FF4 => true, | ||
| 651 | 0x1FF7 => true, | ||
| 652 | 0x1FFC => true, | ||
| 653 | else => false, | ||
| 654 | }; | ||
| 655 | } | ||
| 656 | |||
| 657 | fn requiresPreNfd(str: []const u8) bool { | ||
| 658 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 659 | |||
| 660 | return while (cp_iter.next()) |cp| { | ||
| 661 | if (requiresNfdBeforeCaseFold(cp.code)) break true; | ||
| 662 | } else false; | ||
| 663 | } | ||
| 664 | |||
| 665 | /// `eqlCaseless` tests for equality of `a` and `b` after normalizing to NFD and ignoring letter case. | ||
| 666 | pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | ||
| 667 | // The long winding road of normalized caseless matching... | ||
| 668 | // NFD(CaseFold(NFD(str))) or NFD(CaseFold(str)) | ||
| 669 | var norm_result_a: Result = Result{ .slice = a }; | ||
| 670 | if (requiresPreNfd(a)) { | ||
| 671 | if (!self.isFcd(a)) { | ||
| 672 | norm_result_a = try self.nfd(allocator, a); | ||
| 673 | } | ||
| 674 | } | ||
| 675 | defer norm_result_a.deinit(); | ||
| 676 | |||
| 677 | const cf_a = try case_fold_map.caseFoldStr(allocator, norm_result_a.slice); | ||
| 678 | defer allocator.free(cf_a); | ||
| 679 | norm_result_a.deinit(); | ||
| 680 | norm_result_a = try self.nfd(allocator, cf_a); | ||
| 681 | |||
| 682 | var norm_result_b: Result = Result{ .slice = b }; | ||
| 683 | if (requiresPreNfd(b)) { | ||
| 684 | if (!self.isFcd(b)) { | ||
| 685 | norm_result_b = try self.nfd(allocator, b); | ||
| 686 | } | ||
| 687 | } | ||
| 688 | defer norm_result_b.deinit(); | ||
| 689 | |||
| 690 | const cf_b = try case_fold_map.caseFoldStr(allocator, norm_result_b.slice); | ||
| 691 | defer allocator.free(cf_b); | ||
| 692 | norm_result_b.deinit(); | ||
| 693 | norm_result_b = try self.nfd(allocator, cf_b); | ||
| 694 | |||
| 695 | return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | ||
| 696 | } | ||
| 697 | |||
| 698 | test "eqlCaseless" { | ||
| 699 | const allocator = std.testing.allocator; | ||
| 700 | var n = try init(allocator); | ||
| 701 | defer n.deinit(); | ||
| 702 | |||
| 703 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | ||
| 704 | try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // FOÉ == foé | ||
| 705 | } | ||
| 706 | |||
| 707 | // FCD | ||
| 708 | fn getLeadCcc(self: Self, cp: u21) u8 { | ||
| 709 | const dc = self.mapping(cp, .nfd); | ||
| 710 | return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; | ||
| 711 | } | ||
| 712 | |||
| 713 | fn getTrailCcc(self: Self, cp: u21) u8 { | ||
| 714 | const dc = self.mapping(cp, .nfd); | ||
| 715 | const len = for (dc.cps, 0..) |dcp, i| { | ||
| 716 | if (dcp == 0) break i; | ||
| 717 | } else dc.cps.len; | ||
| 718 | const tcp = dc.cps[len -| 1]; | ||
| 719 | return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)]; | ||
| 720 | } | ||
| 721 | |||
| 722 | /// Fast check to detect whether a string is in FCD form, meaning it can be compared as if it were NFC or NFD without further canonical reordering. | ||
| 723 | pub fn isFcd(self: Self, str: []const u8) bool { | ||
| 724 | var prev_ccc: u8 = 0; | ||
| 725 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 726 | |||
| 727 | return while (cp_iter.next()) |cp| { | ||
| 728 | const ccc = self.getLeadCcc(cp.code); | ||
| 729 | if (ccc != 0 and ccc < prev_ccc) break false; | ||
| 730 | prev_ccc = self.getTrailCcc(cp.code); | ||
| 731 | } else true; | ||
| 732 | } | ||
| 733 | |||
| 734 | test "isFcd" { | ||
| 735 | const allocator = std.testing.allocator; | ||
| 736 | var n = try init(allocator); | ||
| 737 | defer n.deinit(); | ||
| 738 | |||
| 739 | const is_nfc = "José \u{3D3}"; | ||
| 740 | try std.testing.expect(n.isFcd(is_nfc)); | ||
| 741 | |||
| 742 | const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; | ||
| 743 | try std.testing.expect(n.isFcd(is_nfd)); | ||
| 744 | |||
| 745 | const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; | ||
| 746 | try std.testing.expect(!n.isFcd(not_fcd)); | ||
| 747 | } | ||
| 748 | |||
| 749 | test "Unicode normalization tests" { | ||
| 750 | var arena = std.heap.ArenaAllocator.init(std.testing.allocator); | ||
| 751 | defer arena.deinit(); | ||
| 752 | var allocator = arena.allocator(); | ||
| 753 | |||
| 754 | var n = try init(allocator); | ||
| 755 | defer n.deinit(); | ||
| 756 | |||
| 757 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | ||
| 758 | defer file.close(); | ||
| 759 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 760 | const input_stream = buf_reader.reader(); | ||
| 761 | |||
| 762 | var line_no: usize = 0; | ||
| 763 | var buf: [4096]u8 = undefined; | ||
| 764 | var cp_buf: [4]u8 = undefined; | ||
| 765 | |||
| 766 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 767 | line_no += 1; | ||
| 768 | // Skip comments or empty lines. | ||
| 769 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | ||
| 770 | // Iterate over fields. | ||
| 771 | var fields = std.mem.split(u8, line, ";"); | ||
| 772 | var field_index: usize = 0; | ||
| 773 | var input: []u8 = undefined; | ||
| 774 | defer allocator.free(input); | ||
| 775 | |||
| 776 | while (fields.next()) |field| : (field_index += 1) { | ||
| 777 | if (field_index == 0) { | ||
| 778 | var i_buf = std.ArrayList(u8).init(allocator); | ||
| 779 | defer i_buf.deinit(); | ||
| 780 | |||
| 781 | var i_fields = std.mem.split(u8, field, " "); | ||
| 782 | while (i_fields.next()) |s| { | ||
| 783 | const icp = try std.fmt.parseInt(u21, s, 16); | ||
| 784 | const len = try std.unicode.utf8Encode(icp, &cp_buf); | ||
| 785 | try i_buf.appendSlice(cp_buf[0..len]); | ||
| 786 | } | ||
| 787 | |||
| 788 | input = try i_buf.toOwnedSlice(); | ||
| 789 | } else if (field_index == 1) { | ||
| 790 | //std.debug.print("\n*** {s} ***\n", .{line}); | ||
| 791 | // NFC, time to test. | ||
| 792 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 793 | defer w_buf.deinit(); | ||
| 794 | |||
| 795 | var w_fields = std.mem.split(u8, field, " "); | ||
| 796 | while (w_fields.next()) |s| { | ||
| 797 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 798 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 799 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 800 | } | ||
| 801 | |||
| 802 | const want = w_buf.items; | ||
| 803 | var got = try n.nfc(allocator, input); | ||
| 804 | defer got.deinit(); | ||
| 805 | |||
| 806 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 807 | } else if (field_index == 2) { | ||
| 808 | // NFD, time to test. | ||
| 809 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 810 | defer w_buf.deinit(); | ||
| 811 | |||
| 812 | var w_fields = std.mem.split(u8, field, " "); | ||
| 813 | while (w_fields.next()) |s| { | ||
| 814 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 815 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 816 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 817 | } | ||
| 818 | |||
| 819 | const want = w_buf.items; | ||
| 820 | var got = try n.nfd(allocator, input); | ||
| 821 | defer got.deinit(); | ||
| 822 | |||
| 823 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 824 | } else if (field_index == 3) { | ||
| 825 | // NFKC, time to test. | ||
| 826 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 827 | defer w_buf.deinit(); | ||
| 828 | |||
| 829 | var w_fields = std.mem.split(u8, field, " "); | ||
| 830 | while (w_fields.next()) |s| { | ||
| 831 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 832 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 833 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 834 | } | ||
| 835 | |||
| 836 | const want = w_buf.items; | ||
| 837 | var got = try n.nfkc(allocator, input); | ||
| 838 | defer got.deinit(); | ||
| 839 | |||
| 840 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 841 | } else if (field_index == 4) { | ||
| 842 | // NFKD, time to test. | ||
| 843 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 844 | defer w_buf.deinit(); | ||
| 845 | |||
| 846 | var w_fields = std.mem.split(u8, field, " "); | ||
| 847 | while (w_fields.next()) |s| { | ||
| 848 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 849 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 850 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 851 | } | ||
| 852 | |||
| 853 | const want = w_buf.items; | ||
| 854 | var got = try n.nfkd(allocator, input); | ||
| 855 | defer got.deinit(); | ||
| 856 | |||
| 857 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 858 | } else { | ||
| 859 | continue; | ||
| 860 | } | ||
| 861 | } | ||
| 862 | } | ||
| 863 | } | ||