diff options
Diffstat (limited to 'src/ascii.zig')
| -rw-r--r-- | src/ascii.zig | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/src/ascii.zig b/src/ascii.zig index 6c28f25..5b91348 100644 --- a/src/ascii.zig +++ b/src/ascii.zig | |||
| @@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool { | |||
| 25 | return true; | 25 | return true; |
| 26 | } | 26 | } |
| 27 | 27 | ||
/// Case-insensitively compare two ASCII strings, using SIMD when the target
/// suggests a vector length for `u8`. The strings must be of equal length
/// (asserted).
///
/// Returns the length of the leading run of bytes that match under ASCII
/// case folding; this equals the string length when the whole strings match.
/// Counting stops at the first byte pair that differs by more than letter
/// case, or where either byte is non-ASCII (high bit set), even if the two
/// bytes are identical.
pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
    std.debug.assert(str_a.len == str_b.len);
    const vec_len = std.simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
    const Vec = @Vector(vec_len, u8);
    const BVec = @Vector(vec_len, bool);

    const msb: Vec = @splat(@as(u8, 0x80));
    const case_bit: Vec = @splat(@as(u8, 0x20));
    const vec0: Vec = @splat(@as(u8, 0));
    const lower_a: Vec = @splat(@as(u8, 'a'));
    const lower_z: Vec = @splat(@as(u8, 'z'));

    var rem_a = str_a;
    var rem_b = str_b;

    while (rem_a.len >= vec_len) {
        const a: Vec = rem_a[0..vec_len].*;
        const b: Vec = rem_b[0..vec_len].*;
        // `a | b` has the high bit set iff either byte is non-ASCII.
        const either: Vec = a | b;
        // ASCII gate: MSB must be 0 in both.
        const is_ascii: BVec = (either & msb) == vec0;

        const xor: Vec = a ^ b;
        const exact: BVec = xor == vec0;
        const case_diff: BVec = xor == case_bit;

        // Letter test (only meaningful when case_diff). When the bytes differ
        // in exactly the case bit, `a | b` is the one with that bit set, so
        // the pair is an upper/lower letter pair iff it lies in 'a'..'z'.
        // Testing only the low five bits would also accept pairs such as
        // 0x01 vs '!' or 0x11 vs '1'.
        const is_letter: BVec =
            (either >= lower_a) & (either <= lower_z);

        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));

        // Leave the first chunk containing a mismatch to the scalar loop,
        // which pinpoints the exact index.
        if (!@reduce(.And, matched)) break;
        rem_a = rem_a[vec_len..];
        rem_b = rem_b[vec_len..];
    }

    // Tail: bytes consumed by full vector chunks plus whatever the scalar
    // loop matches in the remainder.
    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
}

/// Scalar implementation of the caseless comparison. Returns the index of
/// the first byte pair that is non-ASCII or does not match under ASCII case
/// folding, or `str_a.len` when every pair matches.
inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
    for (str_a, str_b, 0..) |a, b, i| {
        const either = a | b;
        // High bit set in either byte: not ASCII.
        if ((either & 0x80) != 0) return i;
        const xor = a ^ b;
        if (xor == 0) continue; // Exact match.
        if (xor != 0x20) return i; // Differs by more than the case bit.

        // The bytes differ only in the case bit, so `either` is the one with
        // the bit set; it must be a lowercase letter for this to be a case
        // pair (rejects e.g. '@' vs '`', '[' vs '{', 0x01 vs '!').
        if (either < 'a' or either > 'z') return i; // Not a letter.
    } else return str_a.len;
}

test "caselessCmpLen rejects case-bit pairs that are not letters" {
    // These pairs differ only in bit 0x20 but are not letters.
    try std.testing.expectEqual(@as(usize, 0), caselessCmpLen("\x01", "!"));
    try std.testing.expectEqual(@as(usize, 2), caselessCmpLen("ab1", "AB\x11"));
    try std.testing.expectEqual(@as(usize, 1), caselessCmpLen("z\x1a", "Z:"));

    // The same kind of pair placed beyond the first few vector chunks.
    const long_a = "x" ** 70 ++ "\x01" ++ "y" ** 70;
    const long_b = "X" ** 70 ++ "!" ++ "Y" ** 70;
    try std.testing.expectEqual(@as(usize, 70), caselessCmpLen(long_a, long_b));
}
| 86 | |||
test caselessCmpNoSimd {
    // Full match: every byte pair is equal or an upper/lower letter pair.
    const hi_l = "Hello, World!";
    const hi_h = "HeLlO, wOrLd!";
    try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));

    // Empty input matches trivially with length zero.
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("", ""));

    // A genuine mismatch reports the index of the first differing byte.
    try testing.expectEqual(@as(usize, 2), caselessCmpNoSimd("abc", "ABd"));

    // Pairs that differ only in the case bit but are not letters.
    try testing.expectEqual(@as(usize, 1), caselessCmpNoSimd("a@", "A`"));
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("[", "{"));

    // Non-ASCII bytes stop the count even when both sides are identical.
    try testing.expectEqual(@as(usize, 1), caselessCmpNoSimd("a\xc3\xa9", "a\xc3\xa9"));
}
| 92 | |||
test caselessCmpLen {
    // Full match over a string long enough to span several vector chunks.
    const hi_l = "Hello, World!" ** 25;
    const hi_h = "HeLlO, wOrLd!" ** 25;
    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));

    // Empty input matches trivially with length zero.
    try testing.expectEqual(@as(usize, 0), caselessCmpLen("", ""));

    // A mismatch well past the start reports its exact index (130 = 13 * 10).
    const miss_a = "Hello, World!" ** 10 ++ "x" ++ "Hello, World!" ** 10;
    const miss_b = "HeLlO, wOrLd!" ** 10 ++ "y" ++ "HeLlO, wOrLd!" ** 10;
    try testing.expectEqual(@as(usize, 130), caselessCmpLen(miss_a, miss_b));

    // Non-ASCII bytes stop the count even when both sides are identical.
    const high = "a" ** 100 ++ "\xc3\xa9";
    try testing.expectEqual(@as(usize, 100), caselessCmpLen(high, high));
}
| 98 | |||
| 28 | test "isAsciiOnly" { | 99 | test "isAsciiOnly" { |
| 29 | const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+"; | 100 | const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+"; |
| 30 | try testing.expect(isAsciiOnly(ascii_only)); | 101 | try testing.expect(isAsciiOnly(ascii_only)); |