From b823a49b6a57bc1736b33a0816b42aaaf86cf839 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Fri, 6 Feb 2026 13:07:03 -0500
Subject: zg module, casing improvements

---
 src/CanonData.zig   | 12 ++++++++-
 src/CaseFolding.zig | 26 +++++++++++++++++---
 src/ascii.zig       | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/code_point.zig  | 13 +++-------
 src/zg.zig          | 14 +++++++++++
 5 files changed, 122 insertions(+), 14 deletions(-)
 create mode 100644 src/zg.zig

(limited to 'src')

diff --git a/src/CanonData.zig b/src/CanonData.zig
index 5c1ffa6..144346c 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -5,6 +5,12 @@ const Data = struct {
     s2: []const @import("canon").Canonicalization = undefined,
 };
 
+// Canonicalization looks like this:
+// const Canonicalization = struct {
+//     len: u3 = 0,
+//     cps: [2]u21 = [_]u21{0} ** 2,
+// };
+
 const canon_data = canon_data: {
     const canon_ = @import("canon");
     break :canon_data Data{
@@ -17,7 +23,7 @@ const CanonData = @This();
 
 // There's a bug here, which is down to how static u21 vs. runtime are handled,
 // the "unique representation" claim is not working out.  AutoHash casts to bytes,
-// and that won't fly.  So we do this:
+// and that won't fly.  So we do a simple custom context which works for both.
 
 const Context = struct {
     pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -52,3 +58,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const mem = std.mem;
 const comptime_map = @import("comptime_map.zig");
+
+test {
+    _ = comptime_map; // Reference the import; presumably so its test blocks are included in this run.
+}
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index d69cddc..b7aa020 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -103,7 +103,16 @@ pub fn compatCaselessMatch(
     a: []const u8,
     b: []const u8,
 ) Allocator.Error!bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+    var a_in = a;
+    var b_in = b;
+
+    // Ascii short path.  Only applies if they're the same length:
+    if (a_in.len == b_in.len) {
+        const prefix = ascii.caselessCmpLen(a_in, b_in);
+        if (prefix == a_in.len) return true;
+        a_in = a_in[prefix..];
+        b_in = b_in[prefix..];
+    }
 
     // Process a
     const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
@@ -192,10 +201,19 @@ pub fn canonCaselessMatch(
     a: []const u8,
     b: []const u8,
 ) Allocator.Error!bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+    var a_in = a;
+    var b_in = b;
+
+    // Ascii short path.  Only applies if they're the same length:
+    if (a_in.len == b_in.len) {
+        const prefix = ascii.caselessCmpLen(a_in, b_in);
+        if (prefix == a_in.len) return true;
+        a_in = a_in[prefix..];
+        b_in = b_in[prefix..];
+    }
 
     // Process a
-    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a_in, .nfd);
     defer allocator.free(nfd_a);
 
     var need_free_cf_nfd_a = false;
@@ -215,7 +233,7 @@ pub fn canonCaselessMatch(
     defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
 
     // Process b
-    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b_in, .nfd);
     defer allocator.free(nfd_b);
 
     var need_free_cf_nfd_b = false;
diff --git a/src/ascii.zig b/src/ascii.zig
index 6c28f25..5b91348 100644
--- a/src/ascii.zig
+++ b/src/ascii.zig
@@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool {
     return true;
 }
 
+/// Caseless ASCII prefix comparison, with SIMD if possible.  Strings must be of
+/// equal length.  Returns how many leading bytes are ASCII in both strings and
+/// match under A-Z/a-z case folding; equal to the string length on a full match.
+pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
+    std.debug.assert(str_a.len == str_b.len);
+    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
+    const Vec = @Vector(vec_len, u8);
+    const BVec = @Vector(vec_len, bool);
+
+    const msb: Vec = @splat(@as(u8, 0x80));
+    const case_bit: Vec = @splat(@as(u8, 0x20));
+    const fold_mask: Vec = @splat(@as(u8, 0x5f)); // Clears the case bit, keeps 0x40.
+    const vec0: Vec = @splat(@as(u8, 0));
+    const upper_a: Vec = @splat(@as(u8, 'A'));
+    const upper_z: Vec = @splat(@as(u8, 'Z'));
+
+    var rem_a = str_a;
+    var rem_b = str_b;
+
+    while (rem_a.len >= vec_len) {
+        const a: Vec = rem_a[0..vec_len].*;
+        const b: Vec = rem_b[0..vec_len].*;
+        // ASCII gate: MSB must be 0 in both.
+        const is_ascii: BVec = ((a | b) & msb) == vec0;
+
+        const xor: Vec = a ^ b;
+        const exact: BVec = xor == vec0;
+        const case_diff: BVec = xor == case_bit;
+
+        // Letter test (only matters when case_diff): fold to upper case, then range
+        // check A-Z.  Keeping bit 0x40 is what rejects pairs such as 0x11 / '1'.
+        const x: Vec = (a | b) & fold_mask;
+        const is_letter: BVec = (x >= upper_a) & (x <= upper_z);
+
+        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));
+
+        if (!@reduce(.And, matched)) break;
+        rem_a = rem_a[vec_len..];
+        rem_b = rem_b[vec_len..];
+    }
+
+    // Tail
+    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
+}
+
+/// Scalar form of `caselessCmpLen`: index of the first byte pair that fails to match.
+inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
+    for (str_a, str_b, 0..) |a, b, i| {
+        // High?
+        if (((a | b) & 0x80) != 0) return i;
+        const xor = a ^ b;
+        if (xor == 0) continue; // Match
+        if (xor != 0x20) return i; // Not the upcase bit.
+        // Fold to upper case but keep bit 0x40, so only A-Z / a-z pass.
+        const x = (a | b) & 0x5f;
+        if (x < 'A' or x > 'Z') return i; // Not a letter
+    } else return str_a.len;
+}
+
+test caselessCmpNoSimd {
+    try testing.expectEqual(@as(usize, 13), caselessCmpNoSimd("Hello, World!", "HeLlO, wOrLd!"));
+    // Only letters fold: control byte 0x11 must not match '1' (0x31).
+    try testing.expectEqual(@as(usize, 2), caselessCmpNoSimd("ab\x11", "AB1"));
+}
+
+test caselessCmpLen {
+    const hi_l = "Hello, World!" ** 25;
+    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, "HeLlO, wOrLd!" ** 25));
+    try testing.expectEqual(@as(usize, 2), caselessCmpLen("ab\x11" ** 32, "AB1" ** 32));
+}
+
 test "isAsciiOnly" {
     const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
     try testing.expect(isAsciiOnly(ascii_only));
diff --git a/src/code_point.zig b/src/code_point.zig
index 7a638af..5f6c61c 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -30,13 +30,8 @@ pub const CodePoint = struct {
 /// This function is deprecated and will be removed in a later release.
 /// Use `decodeAtIndex` or `decodeAtCursor`.
 pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
-    var off: uoffset = 0;
-    var maybe_code = decodeAtCursor(bytes, &off);
-    if (maybe_code) |*code| {
-        code.offset = offset;
-        return code.*;
-    }
-    return null;
+    _ = .{ bytes, offset };
+    @compileError("decode has been removed, use `decodeAtIndex`.");
 }
 
 /// Return the codepoint at `index`, even if `index` is in the middle
@@ -292,7 +287,7 @@ pub const ReverseIterator = struct {
         else
             iter.i = null;
 
-        return decode(iter.bytes[i_prev..], i_prev);
+        return decodeAtIndex(iter.bytes, i_prev);
     }
 
     pub fn peek(iter: *ReverseIterator) ?CodePoint {
@@ -319,7 +314,7 @@ inline fn followbyte(b: u8) bool {
 
 test "decode" {
     const bytes = "🌩️";
-    const res = decode(bytes, 0);
+    const res = decodeAtIndex(bytes, 0);
 
     if (res) |cp| {
         try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
diff --git a/src/zg.zig b/src/zg.zig
new file mode 100644
index 0000000..2974320
--- /dev/null
+++ b/src/zg.zig
@@ -0,0 +1,14 @@
+//! zg: a Zig-native Unicode Module
+
+pub const ascii = @import("ascii");
+pub const case_folding = @import("CaseFolding");
+pub const code_point = @import("code_point");
+pub const display_width = @import("DisplayWidth");
+pub const emoji = @import("Emoji");
+pub const general_categories = @import("GeneralCategories");
+pub const graphemes = @import("Graphemes");
+pub const letter_casing = @import("LetterCasing");
+pub const normalize = @import("Normalize");
+pub const properties = @import("Properties");
+pub const scripts = @import("Scripts");
+pub const words = @import("Words");
-- 
cgit v1.2.3