summaryrefslogtreecommitdiff
path: root/src/ascii.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/ascii.zig')
-rw-r--r--src/ascii.zig71
1 files changed, 71 insertions, 0 deletions
diff --git a/src/ascii.zig b/src/ascii.zig
index 6c28f25..5b91348 100644
--- a/src/ascii.zig
+++ b/src/ascii.zig
@@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool {
25 return true; 25 return true;
26} 26}
27 27
/// Compare two equal-length byte strings ASCII-case-insensitively, using SIMD
/// when the target suggests a vector length for `u8`.
///
/// Returns the length of the longest common prefix in which every byte pair is
/// ASCII (high bit clear in both) and is either identical or the upper/lower
/// case forms of the same letter. The result equals the string length exactly
/// when the whole strings match. Non-ASCII bytes never match, even when they
/// are identical in both strings.
///
/// Asserts that `str_a.len == str_b.len`.
pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
    std.debug.assert(str_a.len == str_b.len);
    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
    const Vec = @Vector(vec_len, u8);
    const BVec = @Vector(vec_len, bool);

    const msb: Vec = @splat(@as(u8, 0x80));
    const case_bit: Vec = @splat(@as(u8, 0x20));
    const vec0: Vec = @splat(@as(u8, 0));
    const lower_a: Vec = @splat(@as(u8, 'a'));
    const lower_z: Vec = @splat(@as(u8, 'z'));

    var rem_a = str_a;
    var rem_b = str_b;

    while (rem_a.len >= vec_len) {
        const a: Vec = rem_a[0..vec_len].*;
        const b: Vec = rem_b[0..vec_len].*;
        // ASCII gate: MSB must be 0 in both.
        const is_ascii: BVec = ((a | b) & msb) == vec0;

        const xor: Vec = a ^ b;
        const exact: BVec = xor == vec0;
        const case_diff: BVec = xor == case_bit;

        // Letter test (only meaningful when case_diff): if the bytes differ
        // only in bit 0x20, `a | b` is the one with that bit set, so the pair
        // is a letter pair exactly when `a | b` is in 'a'..'z'. Checking only
        // the low five bits would also accept pairs such as '\n' and '*'.
        const lower: Vec = a | b;
        const is_letter: BVec =
            (lower >= lower_a) & (lower <= lower_z);

        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));

        // Stop at the first chunk containing any mismatch; the scalar tail
        // below locates the exact byte.
        if (!@reduce(.And, matched)) break;
        rem_a = rem_a[vec_len..];
        rem_b = rem_b[vec_len..];
    }

    // Tail: bytes consumed by full vectors plus whatever the scalar scan matches.
    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
}

/// Scalar counterpart of `caselessCmpLen`: returns the index of the first byte
/// pair that is non-ASCII or is not an ASCII-case-insensitive match, or
/// `str_a.len` when every pair matches. Both slices are iterated in lockstep,
/// so they must have the same length.
inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
    for (str_a, str_b, 0..) |a, b, i| {
        // High bit set in either byte: not ASCII, never a match.
        if (((a | b) & 0x80) != 0) return i;
        const xor = a ^ b;
        if (xor == 0) continue; // Exact match.
        if (xor != 0x20) return i; // Differs in more than the case bit.

        // The bytes differ only in bit 0x20, so `a | b` is the one with that
        // bit set; it must be a lowercase letter for this to be a case pair.
        const lower = a | b;
        if (lower < 'a' or lower > 'z') return i; // Not a letter.
    }
    return str_a.len;
}
86
test caselessCmpNoSimd {
    // Full match: the strings differ only in letter case.
    const hi_l = "Hello, World!";
    const hi_h = "HeLlO, wOrLd!";
    try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));

    // Empty input matches trivially.
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("", ""));

    // The index of the first mismatching byte is returned.
    try testing.expectEqual(@as(usize, 3), caselessCmpNoSimd("Hello", "Help!"));

    // Bytes that differ only in bit 0x20 but are not letters do not match.
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("@", "`"));
    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("[", "{"));

    // Non-ASCII bytes stop the match even when both sides are identical.
    try testing.expectEqual(@as(usize, 3), caselessCmpNoSimd("caf\xc3\xa9", "CAF\xc3\xa9"));
}
92
test caselessCmpLen {
    // Full match over an input long enough to span several vectors.
    const hi_l = "Hello, World!" ** 25;
    const hi_h = "HeLlO, wOrLd!" ** 25;
    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));

    // Empty input matches trivially.
    try testing.expectEqual(@as(usize, 0), caselessCmpLen("", ""));

    // A mismatch in the final byte is reported at its index.
    const tail_l = hi_l ++ "x";
    const tail_h = hi_h ++ "y";
    try testing.expectEqual(hi_l.len, caselessCmpLen(tail_l, tail_h));

    // A non-ASCII byte in the middle of a long input ends the match there,
    // even though both sides carry the same bytes.
    const mid_l = ("a" ** 40) ++ "\xc3\xa9" ++ ("a" ** 40);
    const mid_h = ("A" ** 40) ++ "\xc3\xa9" ++ ("A" ** 40);
    try testing.expectEqual(@as(usize, 40), caselessCmpLen(mid_l, mid_h));
}
98
28test "isAsciiOnly" { 99test "isAsciiOnly" {
29 const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+"; 100 const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
30 try testing.expect(isAsciiOnly(ascii_only)); 101 try testing.expect(isAsciiOnly(ascii_only));