// Additions to src/ascii.zig, based on commit b823a49b6a57bc1736b33a0816b42aaaf86cf839
// ("zg module, casing improvements", Sam Atman, 2026-02-06). The unchanged diff
// context around these declarations (`isAsciiOnly` and its test) is not repeated here.
// `std`, `simd` and `testing` are expected to be declared at the top of the file.

/// Do a caseless comparison of two strings, with SIMD if possible.
///
/// Strings must be of equal length (asserted). Returns the length of the
/// leading run of bytes that are ASCII in both strings and are either
/// identical or the same letter in different case. The result equals the
/// string length exactly when the whole strings match.
///
/// A byte with the high bit set in either string ends the run, even when the
/// two bytes are identical: only ASCII is case-folded here.
pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
    std.debug.assert(str_a.len == str_b.len);
    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
    const Vec = @Vector(vec_len, u8);
    const BVec = @Vector(vec_len, bool);

    const msb: Vec = @splat(@as(u8, 0x80));
    const case_bit: Vec = @splat(@as(u8, 0x20));
    const vec0: Vec = @splat(@as(u8, 0));
    const lower_a: Vec = @splat(@as(u8, 'a'));
    const lower_z: Vec = @splat(@as(u8, 'z'));

    var rem_a = str_a;
    var rem_b = str_b;

    while (rem_a.len >= vec_len) {
        const a: Vec = rem_a[0..vec_len].*;
        const b: Vec = rem_b[0..vec_len].*;
        const either: Vec = a | b;

        // ASCII gate: MSB must be 0 in both.
        const is_ascii: BVec = (either & msb) == vec0;

        const xor: Vec = a ^ b;
        const exact: BVec = xor == vec0;
        const case_diff: BVec = xor == case_bit;

        // Letter test (only meaningful when case_diff). When the bytes differ
        // in exactly the case bit, `a | b` is the lower-case form of the pair,
        // so it must lie in 'a'...'z'. Checking only the low five bits would
        // also accept pairs such as '\n' (0x0A) and '*' (0x2A).
        const is_letter: BVec = (either >= lower_a) & (either <= lower_z);

        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));

        // Leave the first chunk containing a mismatch to the scalar loop,
        // which pins down the exact index.
        if (!@reduce(.And, matched)) break;
        rem_a = rem_a[vec_len..];
        rem_b = rem_b[vec_len..];
    }

    // Tail (and, after a break, the chunk holding the first mismatch).
    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
}

/// Scalar caseless comparison. Returns the index of the first byte pair that
/// is not an ASCII case-insensitive match, or `str_a.len` if every pair
/// matches. The multi-object `for` requires both slices to have equal length.
inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
    for (str_a, str_b, 0..) |a, b, i| {
        const either = a | b;
        // High bit set in either byte: not ASCII.
        if ((either & 0x80) != 0) return i;
        const xor = a ^ b;
        if (xor == 0) continue; // Exact match.
        if (xor != 0x20) return i; // Differs in more than the case bit.

        // The bytes differ only in the case bit, so `a | b` is the lower-case
        // form; anything outside 'a'...'z' is not a letter pair.
        if (either < 'a' or either > 'z') return i;
    }
    return str_a.len;
}

test caselessCmpNoSimd {
    const hi_l = "Hello, World!";
    const hi_h = "HeLlO, wOrLd!";
    try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));
}

test caselessCmpLen {
    const hi_l = "Hello, World!" ** 25;
    const hi_h = "HeLlO, wOrLd!" ** 25;
    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));
}

test "caselessCmpNoSimd rejects non-letter pairs differing only in bit 0x20" {
    // Control character vs. punctuation/digit sharing the low five bits.
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("\n", "*"));
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("\x01", "!"));
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("\x11", "1"));
    // Neighbours of the letter ranges.
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("@", "`"));
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("[", "{"));
    // Real letters still fold.
    try testing.expectEqual(@as(usize, 2), caselessCmpNoSimd("az", "AZ"));
    // Non-ASCII bytes never match, even when identical.
    try testing.expectEqual(@as(usize, 1), caselessCmpNoSimd("a\xc3", "A\xc3"));
}

test "caselessCmpLen reports the first mismatch in long inputs" {
    const lhs = ("x" ** 100) ++ "\n" ++ ("y" ** 100);
    const rhs = ("X" ** 100) ++ "*" ++ ("Y" ** 100);
    try testing.expectEqual(@as(usize, 100), caselessCmpLen(lhs, rhs));
}