summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2026-02-06 13:07:03 -0500
committerGravatar Sam Atman2026-02-06 13:07:03 -0500
commitb823a49b6a57bc1736b33a0816b42aaaf86cf839 (patch)
tree533a2ffff737ba2826456fecb01bf3eb187b872a /src
parentSlightly better hash reduction for comptime_map (diff)
downloadzg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.tar.gz
zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.tar.xz
zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.zip
zg module, casing improvements
Diffstat (limited to 'src')
-rw-r--r--src/CanonData.zig12
-rw-r--r--src/CaseFolding.zig26
-rw-r--r--src/ascii.zig71
-rw-r--r--src/code_point.zig13
-rw-r--r--src/zg.zig14
5 files changed, 122 insertions, 14 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig
index 5c1ffa6..144346c 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -5,6 +5,12 @@ const Data = struct {
5 s2: []const @import("canon").Canonicalization = undefined, 5 s2: []const @import("canon").Canonicalization = undefined,
6}; 6};
7 7
8// Canonicalization looks like this:
9// const Canonicalization = struct {
10// len: u3 = 0,
11// cps: [2]u21 = [_]u21{0} ** 2,
12// };
13
8const canon_data = canon_data: { 14const canon_data = canon_data: {
9 const canon_ = @import("canon"); 15 const canon_ = @import("canon");
10 break :canon_data Data{ 16 break :canon_data Data{
@@ -17,7 +23,7 @@ const CanonData = @This();
17 23
18// There's a bug here, which is down to how static u21 vs. runtime are handled, 24// There's a bug here, which is down to how static u21 vs. runtime are handled,
19// the "unique representation" claim is not working out. AutoHash casts to bytes, 25// the "unique representation" claim is not working out. AutoHash casts to bytes,
20// and that won't fly. So we do this: 26// and that won't fly. So we do a simple custom context which works for both.
21 27
22const Context = struct { 28const Context = struct {
23 pub fn hash(_: Context, cps: [2]u21) u64 { 29 pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -52,3 +58,7 @@ const std = @import("std");
52const builtin = @import("builtin"); 58const builtin = @import("builtin");
53const mem = std.mem; 59const mem = std.mem;
54const comptime_map = @import("comptime_map.zig"); 60const comptime_map = @import("comptime_map.zig");
61
62test {
63 _ = comptime_map;
64}
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index d69cddc..b7aa020 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -103,7 +103,16 @@ pub fn compatCaselessMatch(
103 a: []const u8, 103 a: []const u8,
104 b: []const u8, 104 b: []const u8,
105) Allocator.Error!bool { 105) Allocator.Error!bool {
106 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 106 var a_in = a;
107 var b_in = b;
108
109 // Ascii short path. Only applies if they're the same length:
110 if (a_in.len == b_in.len) {
111 const prefix = ascii.caselessCmpLen(a_in, b_in);
112 if (prefix == a_in.len) return true;
113 a_in = a_in[prefix..];
114 b_in = b_in[prefix..];
115 }
107 116
108 // Process a 117 // Process a
109 const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd); 118 const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
@@ -192,10 +201,19 @@ pub fn canonCaselessMatch(
192 a: []const u8, 201 a: []const u8,
193 b: []const u8, 202 b: []const u8,
194) Allocator.Error!bool { 203) Allocator.Error!bool {
195 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 204 var a_in = a;
205 var b_in = b;
206
207 // Ascii short path. Only applies if they're the same length:
208 if (a_in.len == b_in.len) {
209 const prefix = ascii.caselessCmpLen(a_in, b_in);
210 if (prefix == a_in.len) return true;
211 a_in = a_in[prefix..];
212 b_in = b_in[prefix..];
213 }
196 214
197 // Process a 215 // Process a
198 const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd); 216 const nfd_a = try Normalize.nfxdCodePoints(allocator, a_in, .nfd);
199 defer allocator.free(nfd_a); 217 defer allocator.free(nfd_a);
200 218
201 var need_free_cf_nfd_a = false; 219 var need_free_cf_nfd_a = false;
@@ -215,7 +233,7 @@ pub fn canonCaselessMatch(
215 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); 233 defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
216 234
217 // Process b 235 // Process b
218 const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd); 236 const nfd_b = try Normalize.nfxdCodePoints(allocator, b_in, .nfd);
219 defer allocator.free(nfd_b); 237 defer allocator.free(nfd_b);
220 238
221 var need_free_cf_nfd_b = false; 239 var need_free_cf_nfd_b = false;
diff --git a/src/ascii.zig b/src/ascii.zig
index 6c28f25..5b91348 100644
--- a/src/ascii.zig
+++ b/src/ascii.zig
@@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool {
25 return true; 25 return true;
26} 26}
27 27
/// Do a caseless comparison, with SIMD if possible. Strings must be of equal
/// length (asserted). Returns how many leading bytes are case-fold-matched
/// ASCII; this will be equal to the string length if they match entirely.
///
/// Counting stops at the first index where either byte is non-ASCII, or
/// where the two bytes differ by anything other than the case of an ASCII
/// letter.
pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
    std.debug.assert(str_a.len == str_b.len);
    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
    const Vec = @Vector(vec_len, u8);
    const BVec = @Vector(vec_len, bool);

    const msb: Vec = @splat(@as(u8, 0x80));
    const case_bit: Vec = @splat(@as(u8, 0x20));
    const vec0: Vec = @splat(@as(u8, 0));
    const lower_a: Vec = @splat(@as(u8, 'a'));
    const lower_z: Vec = @splat(@as(u8, 'z'));

    var rem_a = str_a;
    var rem_b = str_b;

    while (rem_a.len >= vec_len) {
        const a: Vec = rem_a[0..vec_len].*;
        const b: Vec = rem_b[0..vec_len].*;
        const either: Vec = a | b;

        // ASCII gate: MSB must be 0 in both.
        const is_ascii: BVec = (either & msb) == vec0;

        const xor: Vec = a ^ b;
        const exact: BVec = xor == vec0;
        const case_diff: BVec = xor == case_bit;

        // Letter test (only meaningful when case_diff): if the bytes differ
        // in exactly bit 0x20, then `a | b` is the variant with that bit set,
        // which must be a lowercase letter. Testing only the low five bits
        // is not enough: it would also pair 0x01...0x1a with '!'...':'.
        const is_letter: BVec = (either >= lower_a) & (either <= lower_z);

        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));

        // Any unmatched lane ends the vector loop; the scalar tail below
        // locates the exact index within this chunk.
        if (!@reduce(.And, matched)) break;
        rem_a = rem_a[vec_len..];
        rem_b = rem_b[vec_len..];
    }

    // Tail: bytes consumed by the vector loop plus the scalar match length
    // of whatever remains.
    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
}

/// Scalar caseless ASCII prefix comparison. Returns the index of the first
/// byte pair that is not a case-insensitive ASCII match, or `str_a.len` if
/// every pair matches. Both slices must have the same length.
inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
    for (str_a, str_b, 0..) |a, b, i| {
        const either = a | b;
        // High bit set in either byte: not ASCII.
        if ((either & 0x80) != 0) return i;
        const xor = a ^ b;
        if (xor == 0) continue; // Exact match
        if (xor != 0x20) return i; // Differs by more than the case bit.

        // The bytes differ only in bit 0x20, so `either` is the one with the
        // bit set; it must be a lowercase letter for this to be a case pair.
        if (either < 'a' or either > 'z') return i;
    }
    return str_a.len;
}
86
// Scalar path: two strings that differ only in ASCII letter case should be
// reported as matching for their full length.
test caselessCmpNoSimd {
    const mixed_a: []const u8 = "Hello, World!";
    const mixed_b: []const u8 = "HeLlO, wOrLd!";
    const matched_len = caselessCmpNoSimd(mixed_a, mixed_b);
    try testing.expectEqual(mixed_a.len, matched_len);
}
92
// Public entry point: the 13-byte greeting repeated 25 times (325 bytes),
// differing only in ASCII letter case, should match for its full length.
test caselessCmpLen {
    const repeats = 25;
    const mixed_a: []const u8 = "Hello, World!" ** repeats;
    const mixed_b: []const u8 = "HeLlO, wOrLd!" ** repeats;
    const matched_len = caselessCmpLen(mixed_a, mixed_b);
    try testing.expectEqual(mixed_a.len, matched_len);
}
98
28test "isAsciiOnly" { 99test "isAsciiOnly" {
29 const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+"; 100 const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
30 try testing.expect(isAsciiOnly(ascii_only)); 101 try testing.expect(isAsciiOnly(ascii_only));
diff --git a/src/code_point.zig b/src/code_point.zig
index 7a638af..5f6c61c 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -30,13 +30,8 @@ pub const CodePoint = struct {
30/// This function is deprecated and will be removed in a later release. 30/// This function has been removed; referencing it is now a compile error.
31/// Use `decodeAtIndex` or `decodeAtCursor`. 31/// Use `decodeAtIndex` or `decodeAtCursor`.
32pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { 32pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
33 var off: uoffset = 0; 33 _ = .{ bytes, offset };
34 var maybe_code = decodeAtCursor(bytes, &off); 34 @compileError("decode has been removed, use `decodeAtIndex`.");
35 if (maybe_code) |*code| {
36 code.offset = offset;
37 return code.*;
38 }
39 return null;
40} 35}
41 36
42/// Return the codepoint at `index`, even if `index` is in the middle 37/// Return the codepoint at `index`, even if `index` is in the middle
@@ -292,7 +287,7 @@ pub const ReverseIterator = struct {
292 else 287 else
293 iter.i = null; 288 iter.i = null;
294 289
295 return decode(iter.bytes[i_prev..], i_prev); 290 return decodeAtIndex(iter.bytes, i_prev);
296 } 291 }
297 292
298 pub fn peek(iter: *ReverseIterator) ?CodePoint { 293 pub fn peek(iter: *ReverseIterator) ?CodePoint {
@@ -319,7 +314,7 @@ inline fn followbyte(b: u8) bool {
319 314
320test "decode" { 315test "decode" {
321 const bytes = "🌩️"; 316 const bytes = "🌩️";
322 const res = decode(bytes, 0); 317 const res = decodeAtIndex(bytes, 0);
323 318
324 if (res) |cp| { 319 if (res) |cp| {
325 try std.testing.expectEqual(@as(u21, 0x1F329), cp.code); 320 try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
diff --git a/src/zg.zig b/src/zg.zig
new file mode 100644
index 0000000..2974320
--- /dev/null
+++ b/src/zg.zig
@@ -0,0 +1,14 @@
//! zg: a Zig-native Unicode Module
//!
//! Umbrella module that re-exports the zg component modules under
//! snake_case names.

pub const ascii = @import("ascii");
pub const case_folding = @import("CaseFolding");
pub const code_point = @import("code_point");
// Module name corrected from the misspelled "DisplawWidth".
pub const display_width = @import("DisplayWidth");
pub const emoji = @import("Emoji");
pub const general_categories = @import("GeneralCategories");
pub const graphemes = @import("Graphemes");
pub const letter_casing = @import("LetterCasing");
pub const normalize = @import("Normalize");
pub const properties = @import("Properties");
pub const scripts = @import("Scripts");
pub const words = @import("Words");