From b823a49b6a57bc1736b33a0816b42aaaf86cf839 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Fri, 6 Feb 2026 13:07:03 -0500
Subject: zg module, casing improvements

---
 src/CanonData.zig   | 12 ++++++++-
 src/CaseFolding.zig | 26 +++++++++++++++++---
 src/ascii.zig       | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/code_point.zig  | 13 +++-------
 src/zg.zig          | 14 +++++++++++
 5 files changed, 122 insertions(+), 14 deletions(-)
 create mode 100644 src/zg.zig

(limited to 'src')

diff --git a/src/CanonData.zig b/src/CanonData.zig
index 5c1ffa6..144346c 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -5,6 +5,12 @@ const Data = struct {
     s2: []const @import("canon").Canonicalization = undefined,
 };
 
+// Canonicalization looks like this:
+// const Canonicalization = struct {
+//     len: u3 = 0,
+//     cps: [2]u21 = [_]u21{0} ** 2,
+// };
+
 const canon_data = canon_data: {
     const canon_ = @import("canon");
     break :canon_data Data{
@@ -17,7 +23,7 @@ const CanonData = @This();
 
 // There's a bug here, which is down to how static u21 vs. runtime are handled,
 // the "unique representation" claim is not working out. AutoHash casts to bytes,
-// and that won't fly. So we do this:
+// and that won't fly. So we do a simple custom context which works for both.
 
 const Context = struct {
     pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -52,3 +58,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const mem = std.mem;
 const comptime_map = @import("comptime_map.zig");
+
+test {
+    _ = comptime_map;
+}
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index d69cddc..b7aa020 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -103,7 +103,16 @@ pub fn compatCaselessMatch(
     a: []const u8,
     b: []const u8,
 ) Allocator.Error!bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+    var a_in = a;
+    var b_in = b;
+
+    // Ascii short path. Only applies if they're the same length:
+    if (a_in.len == b_in.len) {
+        const prefix = ascii.caselessCmpLen(a_in, b_in);
+        if (prefix == a_in.len) return true;
+        a_in = a_in[prefix..];
+        b_in = b_in[prefix..];
+    }
 
     // Process a
     const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
@@ -192,10 +201,19 @@ pub fn canonCaselessMatch(
     a: []const u8,
     b: []const u8,
 ) Allocator.Error!bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+    var a_in = a;
+    var b_in = b;
+
+    // Ascii short path. Only applies if they're the same length:
+    if (a_in.len == b_in.len) {
+        const prefix = ascii.caselessCmpLen(a_in, b_in);
+        if (prefix == a_in.len) return true;
+        a_in = a_in[prefix..];
+        b_in = b_in[prefix..];
+    }
 
     // Process a
-    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a_in, .nfd);
     defer allocator.free(nfd_a);
 
     var need_free_cf_nfd_a = false;
@@ -215,7 +233,7 @@ pub fn canonCaselessMatch(
     defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
 
     // Process b
-    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b_in, .nfd);
     defer allocator.free(nfd_b);
 
     var need_free_cf_nfd_b = false;
diff --git a/src/ascii.zig b/src/ascii.zig
index 6c28f25..5b91348 100644
--- a/src/ascii.zig
+++ b/src/ascii.zig
@@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool {
     return true;
 }
 
+/// Compare two equal-length strings ASCII-caselessly, with SIMD if possible.
+/// Returns the length of the leading run of bytes that match under ASCII case
+/// folding; this equals the string length iff the whole strings match.
+pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
+    std.debug.assert(str_a.len == str_b.len);
+    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
+    const Vec = @Vector(vec_len, u8);
+    const BVec = @Vector(vec_len, bool);
+
+    const msb: Vec = @splat(@as(u8, 0x80));
+    const case_bit: Vec = @splat(@as(u8, 0x20));
+    const fold_mask: Vec = @splat(@as(u8, 0x5f));
+    const vec0: Vec = @splat(@as(u8, 0));
+    const upper_a: Vec = @splat(@as(u8, 'A'));
+    const upper_z: Vec = @splat(@as(u8, 'Z'));
+
+    var rem_a = str_a;
+    var rem_b = str_b;
+
+    while (rem_a.len >= vec_len) {
+        const a: Vec = rem_a[0..vec_len].*;
+        const b: Vec = rem_b[0..vec_len].*;
+        // ASCII gate: MSB must be 0 in both.
+        const is_ascii: BVec = ((a | b) & msb) == vec0;
+
+        const xor: Vec = a ^ b;
+        const exact: BVec = xor == vec0;
+        const case_diff: BVec = xor == case_bit;
+
+        // Letter test (only needed when case_diff): clear the case bit, then range-check.
+        const upper: Vec = (a | b) & fold_mask;
+        const is_letter: BVec =
+            (upper >= upper_a) & (upper <= upper_z);
+
+        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));
+
+        if (!@reduce(.And, matched)) break;
+        rem_a = rem_a[vec_len..];
+        rem_b = rem_b[vec_len..];
+    }
+
+    // Tail
+    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
+}
+
+inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
+    for (str_a, str_b, 0..) |a, b, i| {
+        // High?
+        if (((a | b) & 0x80) != 0) return i;
+        const xor = a ^ b;
+        if (xor == 0) continue; // Match
+        if (xor != 0x20) return i; // Not the upcase bit.
+
+        // Clear the case bit; only 'A'...'Z' may differ by case ('1' vs 0x11 must not match).
+        const upper = (a | b) & 0x5f;
+        if (upper < 'A' or upper > 'Z') return i; // Not a letter
+    } else return str_a.len;
+}
+
+test caselessCmpNoSimd {
+    const hi_l = "Hello, World!";
+    const hi_h = "HeLlO, wOrLd!";
+    try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));
+}
+
+test caselessCmpLen {
+    const hi_l = "Hello, World!" ** 25;
+    const hi_h = "HeLlO, wOrLd!" ** 25;
+    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));
+}
+
 test "isAsciiOnly" {
     const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
     try testing.expect(isAsciiOnly(ascii_only));
diff --git a/src/code_point.zig b/src/code_point.zig
index 7a638af..5f6c61c 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -30,13 +30,8 @@ pub const CodePoint = struct {
 /// This function is deprecated and will be removed in a later release.
 /// Use `decodeAtIndex` or `decodeAtCursor`.
 pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
-    var off: uoffset = 0;
-    var maybe_code = decodeAtCursor(bytes, &off);
-    if (maybe_code) |*code| {
-        code.offset = offset;
-        return code.*;
-    }
-    return null;
+    _ = .{ bytes, offset };
+    @compileError("decode has been removed, use `decodeAtIndex`.");
 }
 
 /// Return the codepoint at `index`, even if `index` is in the middle
@@ -292,7 +287,7 @@ pub const ReverseIterator = struct {
         else
             iter.i = null;
 
-        return decode(iter.bytes[i_prev..], i_prev);
+        return decodeAtIndex(iter.bytes, i_prev);
     }
 
     pub fn peek(iter: *ReverseIterator) ?CodePoint {
@@ -319,7 +314,7 @@ inline fn followbyte(b: u8) bool {
 
 test "decode" {
     const bytes = "🌩️";
-    const res = decode(bytes, 0);
+    const res = decodeAtIndex(bytes, 0);
 
     if (res) |cp| {
         try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
diff --git a/src/zg.zig b/src/zg.zig
new file mode 100644
index 0000000..2974320
--- /dev/null
+++ b/src/zg.zig
@@ -0,0 +1,14 @@
+//! zg: a Zig-native Unicode Module
+
+pub const ascii = @import("ascii");
+pub const case_folding = @import("CaseFolding");
+pub const code_point = @import("code_point");
+pub const display_width = @import("DisplayWidth");
+pub const emoji = @import("Emoji");
+pub const general_categories = @import("GeneralCategories");
+pub const graphemes = @import("Graphemes");
+pub const letter_casing = @import("LetterCasing");
+pub const normalize = @import("Normalize");
+pub const properties = @import("Properties");
+pub const scripts = @import("Scripts");
+pub const words = @import("Words");
-- 
cgit v1.2.3