From 7cad24f76a72f534084de64153f768699170cd05 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Wed, 28 Feb 2024 19:23:23 -0400 Subject: Using slices for decompositions in Normalizer --- src/Normalizer.zig | 185 ++++++++++++++++++++++++++--------------------------- 1 file changed, 89 insertions(+), 96 deletions(-) (limited to 'src/Normalizer.zig') diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 26177ac..89cc50c 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig @@ -5,6 +5,7 @@ const std = @import("std"); const testing = std.testing; +const ascii = @import("ascii"); const CodePointIterator = @import("code_point").Iterator; pub const NormData = @import("NormData"); @@ -12,12 +13,6 @@ norm_data: *NormData, const Self = @This(); -// Hangul processing utilities. -fn isHangulPrecomposed(self: Self, cp: u21) bool { - const kind = self.norm_data.hangul_data.syllable(cp); - return kind == .LV or kind == .LVT; -} - const SBase: u21 = 0xAC00; const LBase: u21 = 0x1100; const VBase: u21 = 0x1161; @@ -28,17 +23,30 @@ const TCount: u21 = 28; const NCount: u21 = 588; // VCount * TCount const SCount: u21 = 11172; // LCount * NCount -fn decomposeHangul(cp: u21) [3]u21 { +fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { + const kind = self.norm_data.hangul_data.syllable(cp); + if (kind != .LV and kind != .LVT) return null; + const SIndex: u21 = cp - SBase; const LIndex: u21 = SIndex / NCount; const VIndex: u21 = (SIndex % NCount) / TCount; const TIndex: u21 = SIndex % TCount; const LPart: u21 = LBase + LIndex; const VPart: u21 = VBase + VIndex; - var TPart: u21 = 0; - if (TIndex != 0) TPart = TBase + TIndex; - return [3]u21{ LPart, VPart, TPart }; + var dc = Decomp{ .form = .nfd }; + buf[0] = LPart; + buf[1] = VPart; + + if (TIndex == 0) { + dc.cps = buf[0..2]; + return dc; + } + + // TPart + buf[2] = TBase + TIndex; + dc.cps = buf[0..3]; + return dc; } fn composeHangulCanon(lv: u21, t: u21) u21 { @@ -70,59 +78,59 @@ const Form = enum { }; const Decomp = struct { - form: Form = .nfd, - cps: [18]u21 = [_]u21{0} ** 18, + form: Form = .same, + cps: []const u21 = &.{}, }; /// `mapping` retrieves the decomposition mapping for a code point as per the UCD. pub fn mapping(self: Self, cp: u21, form: Form) Decomp { - std.debug.assert(form == .nfd or form == .nfkd); - - var dc = Decomp{ .form = .nfd }; - const canon_dc = self.norm_data.canon_data.toNfd(cp); - const len: usize = if (canon_dc[1] == 0) 1 else 2; - - if (len == 1 and canon_dc[0] == cp) { - dc.form = .same; - dc.cps[0] = cp; - } else { - @memcpy(dc.cps[0..len], canon_dc[0..len]); - } + var dc = Decomp{}; + + switch (form) { + .nfd => { + dc.cps = self.norm_data.canon_data.toNfd(cp); + if (dc.cps.len != 0) dc.form = .nfd; + }, + + .nfkd => { + dc.cps = self.norm_data.compat_data.toNfkd(cp); + if (dc.cps.len != 0) { + dc.form = .nfkd; + } else { + dc.cps = self.norm_data.canon_data.toNfd(cp); + if (dc.cps.len != 0) dc.form = .nfkd; + } + }, - const compat_dc = self.norm_data.compat_data.toNfkd(cp); - if (compat_dc.len != 0) { - if (form != .nfd) { - dc.form = .nfkd; - @memcpy(dc.cps[0..compat_dc.len], compat_dc); - } + else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."), } return dc; } /// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. -pub fn decompose(self: Self, cp: u21, form: Form) Decomp { - std.debug.assert(form == .nfd or form == .nfkd); - - var dc = Decomp{ .form = form }; - - // ASCII or NFD / NFKD quick checks. 
- if (cp <= 127 or - (form == .nfd and self.norm_data.normp_data.isNfd(cp)) or - (form == .nfkd and self.norm_data.normp_data.isNfkd(cp))) - { - dc.cps[0] = cp; - return dc; +pub fn decompose( + self: Self, + cp: u21, + form: Form, + buf: []u21, +) Decomp { + // ASCII + if (cp < 128) return .{}; + + // NFD / NFKD quick checks. + switch (form) { + .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{}, + .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{}, + else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), } // Hangul precomposed syllable full decomposition. - if (self.isHangulPrecomposed(cp)) { - const cps = decomposeHangul(cp); - @memcpy(dc.cps[0..cps.len], &cps); - return dc; - } + if (self.decomposeHangul(cp, buf)) |dc| return dc; // Full decomposition. + var dc = Decomp{ .form = form }; + var result_index: usize = 0; var work_index: usize = 1; @@ -137,27 +145,24 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { // No more of decompositions for this code point. if (m.form == .same) { - dc.cps[result_index] = m.cps[0]; + buf[result_index] = next; result_index += 1; continue; } - // Find last index of decomposition. - const m_last = for (m.cps, 0..) |mcp, i| { - if (mcp == 0) break i; - } else m.cps.len; - // Work backwards through decomposition. // `i` starts at 1 because m_last is 1 past the last code point. var i: usize = 1; - while (i <= m_last) : ({ + while (i <= m.cps.len) : ({ i += 1; work_index += 1; }) { - work[work_index] = m.cps[m_last - i]; + work[work_index] = m.cps[m.cps.len - i]; } } + dc.cps = buf[0..result_index]; + return dc; } @@ -167,58 +172,45 @@ test "decompose" { defer data.deinit(); var n = Self{ .norm_data = &data }; - var dc = n.decompose('é', .nfd); + var buf: [18]u21 = undefined; + + var dc = n.decompose('é', .nfd, &buf); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); - dc = n.decompose('\u{1e0a}', .nfd); + dc = n.decompose('\u{1e0a}', .nfd, &buf); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); - dc = n.decompose('\u{1e0a}', .nfkd); + dc = n.decompose('\u{1e0a}', .nfkd, &buf); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); - dc = n.decompose('\u{3189}', .nfd); - try std.testing.expect(dc.form == .nfd); - try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]); + dc = n.decompose('\u{3189}', .nfd, &buf); + try std.testing.expect(dc.form == .same); + try std.testing.expect(dc.cps.len == 0); - dc = n.decompose('\u{3189}', .nfkd); + dc = n.decompose('\u{3189}', .nfkd, &buf); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); - dc = n.decompose('\u{ace1}', .nfd); + dc = n.decompose('\u{ace1}', .nfd, &buf); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); - dc = n.decompose('\u{ace1}', .nfkd); - try std.testing.expect(dc.form == .nfkd); + dc = n.decompose('\u{ace1}', .nfkd, &buf); + try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); - dc = n.decompose('\u{3d3}', .nfd); + dc = n.decompose('\u{3d3}', .nfd, &buf); try std.testing.expect(dc.form == .nfd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, 
dc.cps[0..2]); - dc = n.decompose('\u{3d3}', .nfkd); + dc = n.decompose('\u{3d3}', .nfkd, &buf); try std.testing.expect(dc.form == .nfkd); try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); } -// Some quick checks. - -fn onlyAscii(str: []const u8) bool { - return for (str) |b| { - if (b > 127) break false; - } else true; -} - -fn onlyLatin1(str: []const u8) bool { - var cp_iter = CodePointIterator{ .bytes = str }; - return while (cp_iter.next()) |cp| { - if (cp.code > 256) break false; - } else true; -} - /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. pub const Result = struct { allocator: ?std.mem.Allocator = null, @@ -256,18 +248,21 @@ pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { // Quick checks. - if (onlyAscii(str)) return Result{ .slice = str }; + if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; - var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2); + var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len * 3); defer dcp_list.deinit(); var cp_iter = CodePointIterator{ .bytes = str }; + var dc_buf: [18]u21 = undefined; + while (cp_iter.next()) |cp| { - const dc = self.decompose(cp.code, form); - const slice = for (dc.cps, 0..) |dcp, i| { - if (dcp == 0) break dc.cps[0..i]; - } else dc.cps[0..]; - try dcp_list.appendSlice(slice); + const dc = self.decompose(cp.code, form, &dc_buf); + if (dc.form == .same) { + try dcp_list.append(cp.code); + } else { + try dcp_list.appendSlice(dc.cps); + } } self.canonicalSort(dcp_list.items); @@ -354,8 +349,7 @@ pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { // Quick checks. - if (onlyAscii(str)) return Result{ .slice = str }; - if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str }; + if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; // Decompose first. var d_result = if (form == .nfc) @@ -522,15 +516,14 @@ test "eql" { // FCD fn getLeadCcc(self: Self, cp: u21) u8 { const dc = self.mapping(cp, .nfd); - return self.norm_data.ccc_data.ccc(dc.cps[0]); + const dcp = if (dc.form == .same) cp else dc.cps[0]; + return self.norm_data.ccc_data.ccc(dcp); } fn getTrailCcc(self: Self, cp: u21) u8 { const dc = self.mapping(cp, .nfd); - const len = for (dc.cps, 0..) |dcp, i| { - if (dcp == 0) break i; - } else dc.cps.len; - return self.norm_data.ccc_data.ccc(dc.cps[len - 1]); + const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1]; + return self.norm_data.ccc_data.ccc(dcp); } /// Fast check to detect if a string is already in NFC or NFD form. -- cgit v1.2.3
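
For readers following the Hangul change above: decomposeHangul now writes its jamo into a caller-supplied buffer and returns a Decomp whose cps slice points into that buffer instead of a fixed 18-element array. The arithmetic itself is the standard algorithmic mapping for precomposed syllables (Unicode core spec, section 3.12) and can be exercised on its own. The sketch below is illustrative rather than part of the patch: it reuses the constants defined in this file but skips the hangul_data syllable-kind check, and the decomposeHangulJamo name and the bare []const u21 return type are this note's own.

const std = @import("std");

// Constants as defined in src/Normalizer.zig (Hangul syllable block).
const SBase: u21 = 0xAC00;
const LBase: u21 = 0x1100;
const VBase: u21 = 0x1161;
const TBase: u21 = 0x11A7;
const TCount: u21 = 28;
const NCount: u21 = 588; // VCount * TCount

// Decompose one precomposed syllable into 2 (LV) or 3 (LVT) jamo,
// writing into `buf` and returning the slice actually used.
fn decomposeHangulJamo(cp: u21, buf: []u21) []const u21 {
    const s_index: u21 = cp - SBase;
    buf[0] = LBase + s_index / NCount; // leading consonant (L)
    buf[1] = VBase + (s_index % NCount) / TCount; // vowel (V)
    const t_index: u21 = s_index % TCount;
    if (t_index == 0) return buf[0..2]; // LV syllable: no trailing consonant
    buf[2] = TBase + t_index; // trailing consonant (T)
    return buf[0..3];
}

test "Hangul algorithmic decomposition" {
    var buf: [3]u21 = undefined;
    // U+ACE1 decomposes to U+1100 U+1169 U+11A8, matching the test in the diff.
    try std.testing.expectEqualSlices(
        u21,
        &[_]u21{ 0x1100, 0x1169, 0x11A8 },
        decomposeHangulJamo(0xACE1, &buf),
    );
    // U+AC00 is an LV syllable, so only two jamo come back.
    try std.testing.expectEqualSlices(
        u21,
        &[_]u21{ 0x1100, 0x1161 },
        decomposeHangulJamo(0xAC00, &buf),
    );
}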
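
The rewritten decompose keeps the same worklist idea as before: pop a code point, look up its mapping, and either emit it unchanged (form == .same) or push the mapped code points back in reverse order so each of them is examined again for further decompositions. The loop header and the initialization of the work array sit in unchanged context lines that the hunk does not show, so the following standalone sketch is only an approximation under stated assumptions: toNfd is a hypothetical hardcoded table standing in for norm_data.canon_data.toNfd, and a mapping length of zero plays the role of form == .same.

const std = @import("std");

// Hypothetical stand-in for canon_data.toNfd: returns the canonical
// mapping of `cp`, or an empty slice when there is none.
fn toNfd(cp: u21) []const u21 {
    return switch (cp) {
        0x1E14 => &[_]u21{ 0x0112, 0x0300 }, // E WITH MACRON AND GRAVE
        0x0112 => &[_]u21{ 0x0045, 0x0304 }, // E WITH MACRON
        else => &[_]u21{},
    };
}

// Full canonical decomposition using an explicit work stack, writing the
// results into a caller-supplied buffer as the patched decompose does.
fn fullDecompose(cp: u21, buf: []u21) []const u21 {
    var result_index: usize = 0;
    var work: [18]u21 = undefined;
    work[0] = cp;
    var work_index: usize = 1;

    while (work_index > 0) {
        work_index -= 1;
        const next = work[work_index];
        const m = toNfd(next);

        // No further decomposition: emit the code point itself.
        if (m.len == 0) {
            buf[result_index] = next;
            result_index += 1;
            continue;
        }

        // Push the mapping in reverse so its first code point is popped next.
        var i: usize = 1;
        while (i <= m.len) : ({
            i += 1;
            work_index += 1;
        }) {
            work[work_index] = m[m.len - i];
        }
    }

    return buf[0..result_index];
}

test "worklist full decomposition" {
    var buf: [18]u21 = undefined;
    // U+1E14 -> U+0112 U+0300 -> U+0045 U+0304 U+0300 after two passes.
    try std.testing.expectEqualSlices(
        u21,
        &[_]u21{ 0x0045, 0x0304, 0x0300 },
        fullDecompose(0x1E14, &buf),
    );
}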
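
nfxd still finishes by calling canonicalSort on the collected code points. That routine is not touched by this diff and is not shown here, but the canonical-ordering step it is responsible for (UAX #15, Canonical Ordering Algorithm) is small: combining marks with nonzero canonical combining class are bubble-sorted into nondecreasing CCC order, while starters with CCC 0 act as barriers that marks never cross. The sketch below is only a plausible shape for that step, not the library's implementation; cccOf is a hypothetical stand-in for norm_data.ccc_data.ccc.

const std = @import("std");

// Hypothetical stand-in for ccc_data.ccc: canonical combining classes
// for the handful of code points used in the test below.
fn cccOf(cp: u21) u8 {
    return switch (cp) {
        0x0301 => 230, // COMBINING ACUTE ACCENT
        0x0323 => 220, // COMBINING DOT BELOW
        else => 0,
    };
}

// Canonical ordering: reorder adjacent combining marks so CCC values are
// nondecreasing; code points with CCC 0 never move and never get crossed.
fn canonicalSort(cps: []u21) void {
    var i: usize = 1;
    while (i < cps.len) : (i += 1) {
        var j: usize = i;
        while (j > 0) : (j -= 1) {
            const here = cccOf(cps[j]);
            if (here == 0 or cccOf(cps[j - 1]) <= here) break;
            std.mem.swap(u21, &cps[j - 1], &cps[j]);
        }
    }
}

test "canonical ordering of combining marks" {
    // 'q' + ACUTE (230) + DOT BELOW (220) reorders to 'q' + DOT BELOW + ACUTE.
    var cps = [_]u21{ 'q', 0x0301, 0x0323 };
    canonicalSort(&cps);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'q', 0x0323, 0x0301 }, &cps);
}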