src/ascii.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

const std = @import("std");
const simd = std.simd;
const testing = std.testing;

/// Returns true if `str` only contains ASCII bytes. Uses SIMD if possible.
pub fn isAsciiOnly(str: []const u8) bool {
    const vec_len = simd.suggestVectorLength(u8) orelse return for (str) |b| {
        if (b > 127) break false;
    } else true;

    const Vec = @Vector(vec_len, u8);
    var remaining = str;

    while (true) {
        if (remaining.len < vec_len) return for (remaining) |b| {
            if (b > 127) break false;
        } else true;

        const v1 = remaining[0..vec_len].*;
        const v2: Vec = @splat(127);
        if (@reduce(.Or, v1 > v2)) return false;
        remaining = remaining[vec_len..];
    }

    return true;
}

/// Do a caseless comparison, with SIMD if possible.  Strings must be of equal
/// length.  Returns how many bytes are case-fold-matched ASCII, this will be
/// equal to the string length if they match.
pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
    std.debug.assert(str_a.len == str_b.len);
    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
    const Vec = @Vector(vec_len, u8);
    const BVec = @Vector(vec_len, bool);

    const msb: Vec = @splat(@as(u8, 0x80));
    const case_bit: Vec = @splat(@as(u8, 0x20));
    const low5: Vec = @splat(@as(u8, 0x1f));
    const vec0: Vec = @splat(@as(u8, 0));
    const vec1: Vec = @splat(@as(u8, 1));
    const vec26: Vec = @splat(@as(u8, 26));

    var rem_a = str_a;
    var rem_b = str_b;

    while (rem_a.len >= vec_len) {
        const a: Vec = rem_a[0..vec_len].*;
        const b: Vec = rem_b[0..vec_len].*;
        // ASCII gate: MSB must be 0 in both.
        const is_ascii: BVec = ((a | b) & msb) == vec0;

        const xor: Vec = a ^ b;
        const exact: BVec = xor == vec0;
        const case_diff: BVec = xor == case_bit;

        // Letter test (only needed when case_diff).
        const x: Vec = (a | b) & low5;
        const is_letter: BVec =
            (x >= vec1) & (x <= vec26);

        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));

        if (!@reduce(.And, matched)) break;
        rem_a = rem_a[vec_len..];
        rem_b = rem_b[vec_len..];
    }

    // Tail
    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
}

inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
    for (str_a, str_b, 0..) |a, b, i| {
        // High?
        if (((a | b) & 0x80) != 0) return i;
        const xor = a ^ b;
        if (xor == 0) continue; // Match
        if (xor != 0x20) return i; // Not the upcase bit.

        const lo = a | b;
        const x = lo & 0x1f;
        if (x < 1 or x > 26) return i; // Not a letter
    } else return str_a.len;
}

test caselessCmpNoSimd {
    const hi_l = "Hello, World!";
    const hi_h = "HeLlO, wOrLd!";
    try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));
}

test caselessCmpLen {
    const hi_l = "Hello, World!" ** 25;
    const hi_h = "HeLlO, wOrLd!" ** 25;
    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));
}

test "isAsciiOnly" {
    const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
    try testing.expect(isAsciiOnly(ascii_only));
    const not_ascii_only = "Héllo, World! 0123456789 !@#$%^&*()_-=+";
    try testing.expect(!isAsciiOnly(not_ascii_only));
}