From 3735e5b7bbd8d0d25687f3080925084b9dbb938d Mon Sep 17 00:00:00 2001
From: Jose Colon Rodriguez
Date: Wed, 28 Feb 2024 20:30:48 -0400
Subject: Added nfc latin1 check back

---
 src/Normalizer.zig | 191 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 121 insertions(+), 70 deletions(-)

(limited to 'src/Normalizer.zig')

diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 89cc50c..d32ad52 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -3,7 +3,16 @@
 //! NFKC, NFD, and NFKD normalization forms.
 
 const std = @import("std");
+const assert = std.debug.assert;
+const debug = std.debug;
+const fmt = std.fmt;
+const fs = std.fs;
+const heap = std.heap;
+const io = std.io;
+const mem = std.mem;
+const simd = std.simd;
 const testing = std.testing;
+const unicode = std.unicode;
 
 const ascii = @import("ascii");
 const CodePointIterator = @import("code_point").Iterator;
@@ -50,20 +59,20 @@ fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
 }
 
 fn composeHangulCanon(lv: u21, t: u21) u21 {
-    std.debug.assert(0x11A8 <= t and t <= 0x11C2);
+    assert(0x11A8 <= t and t <= 0x11C2);
     return lv + (t - TBase);
 }
 
 fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
-    std.debug.assert(0x1100 <= l and l <= 0x1112);
-    std.debug.assert(0x1161 <= v and v <= 0x1175);
+    assert(0x1100 <= l and l <= 0x1112);
+    assert(0x1161 <= v and v <= 0x1175);
     const LIndex = l - LBase;
     const VIndex = v - VBase;
     const LVIndex = LIndex * NCount + VIndex * TCount;
 
     if (t == 0) return SBase + LVIndex;
 
-    std.debug.assert(0x11A8 <= t and t <= 0x11C2);
+    assert(0x11A8 <= t and t <= 0x11C2);
     const TIndex = t - TBase;
 
     return SBase + LVIndex + TIndex;
@@ -175,45 +184,45 @@ test "decompose" {
     var buf: [18]u21 = undefined;
 
     var dc = n.decompose('é', .nfd, &buf);
-    try std.testing.expect(dc.form == .nfd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
+    try testing.expect(dc.form == .nfd);
+    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
 
     dc = n.decompose('\u{1e0a}', .nfd, &buf);
-    try std.testing.expect(dc.form == .nfd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
+    try testing.expect(dc.form == .nfd);
+    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
 
     dc = n.decompose('\u{1e0a}', .nfkd, &buf);
-    try std.testing.expect(dc.form == .nfkd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
+    try testing.expect(dc.form == .nfkd);
+    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
 
     dc = n.decompose('\u{3189}', .nfd, &buf);
-    try std.testing.expect(dc.form == .same);
-    try std.testing.expect(dc.cps.len == 0);
+    try testing.expect(dc.form == .same);
+    try testing.expect(dc.cps.len == 0);
 
     dc = n.decompose('\u{3189}', .nfkd, &buf);
-    try std.testing.expect(dc.form == .nfkd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
+    try testing.expect(dc.form == .nfkd);
+    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
 
     dc = n.decompose('\u{ace1}', .nfd, &buf);
-    try std.testing.expect(dc.form == .nfd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
+    try testing.expect(dc.form == .nfd);
+    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
 
     dc = n.decompose('\u{ace1}', .nfkd, &buf);
-    try std.testing.expect(dc.form == .nfd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
+    try testing.expect(dc.form == .nfd);
+    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
 
     dc = n.decompose('\u{3d3}', .nfd, &buf);
-    try std.testing.expect(dc.form == .nfd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
+    try testing.expect(dc.form == .nfd);
+    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
 
     dc = n.decompose('\u{3d3}', .nfkd, &buf);
-    try std.testing.expect(dc.form == .nfkd);
-    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
+    try testing.expect(dc.form == .nfkd);
+    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
 }
 
 /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
 pub const Result = struct {
-    allocator: ?std.mem.Allocator = null,
+    allocator: ?mem.Allocator = null,
     slice: []const u8,
 
     pub fn deinit(self: *Result) void {
@@ -232,25 +241,25 @@ fn canonicalSort(self: Self, cps: []u21) void {
     while (i < cps.len) : (i += 1) {
         const start: usize = i;
         while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
-        std.mem.sort(u21, cps[start..i], self, cccLess);
+        mem.sort(u21, cps[start..i], self, cccLess);
     }
 }
 
 /// Normalize `str` to NFD.
-pub fn nfd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
+pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
     return self.nfxd(allocator, str, .nfd);
 }
 
 /// Normalize `str` to NFKD.
-pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
+pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
     return self.nfxd(allocator, str, .nfkd);
 }
 
-fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result {
+fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
     // Quick checks.
     if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
 
-    var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len * 3);
+    var dcp_list = std.ArrayList(u21).init(allocator);
     defer dcp_list.deinit();
 
     var cp_iter = CodePointIterator{ .bytes = str };
@@ -272,7 +281,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
 
     var buf: [4]u8 = undefined;
     for (dcp_list.items) |dcp| {
-        const len = try std.unicode.utf8Encode(dcp, &buf);
+        const len = try unicode.utf8Encode(dcp, &buf);
         dstr_list.appendSliceAssumeCapacity(buf[0..len]);
     }
 
@@ -288,7 +297,7 @@ test "nfd ASCII / no-alloc" {
     var result = try n.nfd(allocator, "Hello World!");
     defer result.deinit();
 
-    try std.testing.expectEqualStrings("Hello World!", result.slice);
+    try testing.expectEqualStrings("Hello World!", result.slice);
 }
 
 test "nfd !ASCII / alloc" {
@@ -300,7 +309,7 @@ test "nfd !ASCII / alloc" {
     var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
     defer result.deinit();
 
-    try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
+    try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
 }
 
 test "nfkd ASCII / no-alloc" {
@@ -312,7 +321,7 @@ test "nfkd ASCII / no-alloc" {
     var result = try n.nfkd(allocator, "Hello World!");
     defer result.deinit();
 
-    try std.testing.expectEqualStrings("Hello World!", result.slice);
+    try testing.expectEqualStrings("Hello World!", result.slice);
 }
 
 test "nfkd !ASCII / alloc" {
@@ -324,7 +333,7 @@ test "nfkd !ASCII / alloc" {
     var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
     defer result.deinit();
 
-    try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
+    try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
 }
 
 // Composition utilities.
@@ -338,18 +347,19 @@ fn isNonHangulStarter(self: Self, cp: u21) bool {
 }
 
 /// Normalizes `str` to NFC.
-pub fn nfc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
+pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
     return self.nfxc(allocator, str, .nfc);
 }
 
 /// Normalizes `str` to NFKC.
-pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
+pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
     return self.nfxc(allocator, str, .nfkc);
 }
 
-fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result {
+fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
     // Quick checks.
     if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
+    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
 
     // Decompose first.
     var d_result = if (form == .nfc)
@@ -449,7 +459,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
 
             for (d_list.items) |cp| {
                 if (cp == tombstone) continue; // "Delete"
-                const len = try std.unicode.utf8Encode(cp, &buf);
+                const len = try unicode.utf8Encode(cp, &buf);
                 cstr_list.appendSliceAssumeCapacity(buf[0..len]);
             }
 
@@ -478,7 +488,7 @@ test "nfc" {
     var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
     defer result.deinit();
 
-    try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
+    try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
 }
 
 test "nfkc" {
@@ -490,17 +500,17 @@ test "nfkc" {
     var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
     defer result.deinit();
 
-    try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
+    try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
 }
 
 /// Tests for equality of `a` and `b` after normalizing to NFD.
-pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool {
+pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
     var norm_result_a = try self.nfd(allocator, a);
     defer norm_result_a.deinit();
     var norm_result_b = try self.nfd(allocator, b);
     defer norm_result_b.deinit();
 
-    return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
+    return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
 }
 
 test "eql" {
@@ -509,8 +519,8 @@ test "eql" {
     defer data.deinit();
     var n = Self{ .norm_data = &data };
 
-    try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
-    try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
+    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
+    try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
 }
 
 // FCD
@@ -545,17 +555,17 @@ test "isFcd" {
     var n = Self{ .norm_data = &data };
 
     const is_nfc = "José \u{3D3}";
-    try std.testing.expect(n.isFcd(is_nfc));
+    try testing.expect(n.isFcd(is_nfc));
 
     const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
-    try std.testing.expect(n.isFcd(is_nfd));
+    try testing.expect(n.isFcd(is_nfd));
 
     const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
-    try std.testing.expect(!n.isFcd(not_fcd));
+    try testing.expect(!n.isFcd(not_fcd));
 }
 
 test "Unicode normalization tests" {
-    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
+    var arena = heap.ArenaAllocator.init(testing.allocator);
     defer arena.deinit();
     var allocator = arena.allocator();
 
@@ -563,9 +573,9 @@ test "Unicode normalization tests" {
     defer data.deinit();
     var n = Self{ .norm_data = &data };
 
-    var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
+    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
     defer file.close();
-    var buf_reader = std.io.bufferedReader(file.reader());
+    var buf_reader = io.bufferedReader(file.reader());
     const input_stream = buf_reader.reader();
 
     var line_no: usize = 0;
@@ -577,7 +587,7 @@ test "Unicode normalization tests" {
         // Skip comments or empty lines.
         if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;
         // Iterate over fields.
-        var fields = std.mem.split(u8, line, ";");
+        var fields = mem.split(u8, line, ";");
         var field_index: usize = 0;
         var input: []u8 = undefined;
         defer allocator.free(input);
@@ -587,24 +597,24 @@ test "Unicode normalization tests" {
                 var i_buf = std.ArrayList(u8).init(allocator);
                 defer i_buf.deinit();
 
-                var i_fields = std.mem.split(u8, field, " ");
+                var i_fields = mem.split(u8, field, " ");
                 while (i_fields.next()) |s| {
-                    const icp = try std.fmt.parseInt(u21, s, 16);
-                    const len = try std.unicode.utf8Encode(icp, &cp_buf);
+                    const icp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(icp, &cp_buf);
                     try i_buf.appendSlice(cp_buf[0..len]);
                 }
 
                 input = try i_buf.toOwnedSlice();
             } else if (field_index == 1) {
-                //std.debug.print("\n*** {s} ***\n", .{line});
+                //debug.print("\n*** {s} ***\n", .{line});
                 // NFC, time to test.
                 var w_buf = std.ArrayList(u8).init(allocator);
                 defer w_buf.deinit();
 
-                var w_fields = std.mem.split(u8, field, " ");
+                var w_fields = mem.split(u8, field, " ");
                 while (w_fields.next()) |s| {
-                    const wcp = try std.fmt.parseInt(u21, s, 16);
-                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
                     try w_buf.appendSlice(cp_buf[0..len]);
                 }
 
@@ -612,16 +622,16 @@ test "Unicode normalization tests" {
                 var got = try n.nfc(allocator, input);
                 defer got.deinit();
 
-                try std.testing.expectEqualStrings(want, got.slice);
+                try testing.expectEqualStrings(want, got.slice);
             } else if (field_index == 2) {
                 // NFD, time to test.
                 var w_buf = std.ArrayList(u8).init(allocator);
                 defer w_buf.deinit();
 
-                var w_fields = std.mem.split(u8, field, " ");
+                var w_fields = mem.split(u8, field, " ");
                 while (w_fields.next()) |s| {
-                    const wcp = try std.fmt.parseInt(u21, s, 16);
-                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
                     try w_buf.appendSlice(cp_buf[0..len]);
                 }
 
@@ -629,16 +639,16 @@ test "Unicode normalization tests" {
                 var got = try n.nfd(allocator, input);
                 defer got.deinit();
 
-                try std.testing.expectEqualStrings(want, got.slice);
+                try testing.expectEqualStrings(want, got.slice);
             } else if (field_index == 3) {
                 // NFKC, time to test.
                 var w_buf = std.ArrayList(u8).init(allocator);
                 defer w_buf.deinit();
 
-                var w_fields = std.mem.split(u8, field, " ");
+                var w_fields = mem.split(u8, field, " ");
                 while (w_fields.next()) |s| {
-                    const wcp = try std.fmt.parseInt(u21, s, 16);
-                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
                     try w_buf.appendSlice(cp_buf[0..len]);
                 }
 
@@ -646,16 +656,16 @@ test "Unicode normalization tests" {
                 var got = try n.nfkc(allocator, input);
                 defer got.deinit();
 
-                try std.testing.expectEqualStrings(want, got.slice);
+                try testing.expectEqualStrings(want, got.slice);
             } else if (field_index == 4) {
                 // NFKD, time to test.
                 var w_buf = std.ArrayList(u8).init(allocator);
                 defer w_buf.deinit();
 
-                var w_fields = std.mem.split(u8, field, " ");
+                var w_fields = mem.split(u8, field, " ");
                 while (w_fields.next()) |s| {
-                    const wcp = try std.fmt.parseInt(u21, s, 16);
-                    const len = try std.unicode.utf8Encode(wcp, &cp_buf);
+                    const wcp = try fmt.parseInt(u21, s, 16);
+                    const len = try unicode.utf8Encode(wcp, &cp_buf);
                     try w_buf.appendSlice(cp_buf[0..len]);
                 }
 
@@ -663,10 +673,51 @@ test "Unicode normalization tests" {
                 var got = try n.nfkd(allocator, input);
                 defer got.deinit();
 
-                try std.testing.expectEqualStrings(want, got.slice);
+                try testing.expectEqualStrings(want, got.slice);
             } else {
                 continue;
             }
         }
     }
 }
+
+/// Returns true if every code point in `str` is in the Latin-1 range
+/// (U+0000 through U+00FF, i.e. ASCII plus the Latin-1 Supplement).
+/// Compares code points in vector-sized batches when the target
+/// suggests a vector length for `u21`; otherwise checks one at a time.
+pub fn isLatin1Only(str: []const u8) bool {
+    // Highest code point in the Latin-1 range; U+0100 is already outside.
+    const max_latin1: u21 = 0xFF;
+    var cp_iter = CodePointIterator{ .bytes = str };
+
+    const vec_len = simd.suggestVectorLength(u21) orelse return while (cp_iter.next()) |cp| {
+        if (cp.code > max_latin1) break false;
+    } else true;
+    const Vec = @Vector(vec_len, u21);
+    const limit: Vec = @splat(max_latin1);
+    outer: while (true) {
+        var batch: Vec = undefined;
+        const saved_cp_i = cp_iter.i;
+        for (0..vec_len) |i| {
+            if (cp_iter.next()) |cp| {
+                batch[i] = cp.code;
+            } else {
+                // Partial batch: rewind so the scalar loop below re-reads it.
+                cp_iter.i = saved_cp_i;
+                break :outer;
+            }
+        }
+        if (@reduce(.Or, batch > limit)) return false;
+    }
+
+    return while (cp_iter.next()) |cp| {
+        if (cp.code > max_latin1) break false;
+    } else true;
+}
+
+test "isLatin1Only" {
+    // ASCII plus U+00FE and U+00FF: every code point is at or below U+00FF.
+    try testing.expect(isLatin1Only("Hello, World! \u{fe} \u{ff}"));
+    // U+03D3 lies far above U+00FF, so the check must reject the string.
+    try testing.expect(!isLatin1Only("Héllo, World! \u{3d3}"));
+}
-- 
cgit v1.2.3