zg module, casing improvements

author: Sam Atman 2026-02-06 13:07:03 -0500
committer: Sam Atman 2026-02-06 13:07:03 -0500
commit: b823a49b6a57bc1736b33a0816b42aaaf86cf839 (patch)
tree: 533a2ffff737ba2826456fecb01bf3eb187b872a /src
parent: Slightly better hash reduction for comptime_map (diff)
download: zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.tar.gz
zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.tar.xz
zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.zip
5 files changed, 122 insertions, 14 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig
index 5c1ffa6..144346c 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -5,6 +5,12 @@ const Data = struct {
    s2: []const @import("canon").Canonicalization = undefined,
 };
+// Canonicalization looks like this:
+// const Canonicalization = struct {
+//     len: u3 = 0,
+//     cps: [2]u21 = [_]u21{0} ** 2,
+// };
 const canon_data = canon_data: {
    const canon_ = @import("canon");
    break :canon_data Data{
@@ -17,7 +23,7 @@ const CanonData = @This();
 // There's a bug here, which is down to how static u21 vs. runtime are handled,
 // the "unique representation" claim is not working out.  AutoHash casts to bytes,
-// and that won't fly.  So we do this:
+// and that won't fly.  So we do a simple custom context which works for both.
 const Context = struct {
    pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -52,3 +58,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const mem = std.mem;
 const comptime_map = @import("comptime_map.zig");
+test {
+    _ = comptime_map;
+}
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index d69cddc..b7aa020 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -103,7 +103,16 @@ pub fn compatCaselessMatch(
    a: []const u8,
    b: []const u8,
 ) Allocator.Error!bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+    var a_in = a;
+    var b_in = b;
+    // Ascii short path.  Only applies if they're the same length:
+    if (a_in.len == b_in.len) {
+        const prefix = ascii.caselessCmpLen(a_in, b_in);
+        if (prefix == a_in.len) return true;
+        a_in = a_in[prefix..];
+        b_in = b_in[prefix..];
+    }
    // Process a
    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
@@ -192,10 +201,19 @@ pub fn canonCaselessMatch(
    a: []const u8,
    b: []const u8,
 ) Allocator.Error!bool {
-    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
+    var a_in = a;
+    var b_in = b;
+    // Ascii short path.  Only applies if they're the same length:
+    if (a_in.len == b_in.len) {
+        const prefix = ascii.caselessCmpLen(a_in, b_in);
+        if (prefix == a_in.len) return true;
+        a_in = a_in[prefix..];
+        b_in = b_in[prefix..];
+    }
    // Process a
-    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a_in, .nfd);
    defer allocator.free(nfd_a);
    var need_free_cf_nfd_a = false;
@@ -215,7 +233,7 @@ pub fn canonCaselessMatch(
    defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
    // Process b
-    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b_in, .nfd);
    defer allocator.free(nfd_b);
    var need_free_cf_nfd_b = false;
diff --git a/src/ascii.zig b/src/ascii.zig
index 6c28f25..5b91348 100644
--- a/src/ascii.zig
+++ b/src/ascii.zig
@@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool {
    return true;
 }
+/// Measure the caselessly-matching ASCII prefix of two equal-length strings,
+/// with SIMD if possible.
+///
+/// Returns the number of leading bytes which are ASCII in both strings and
+/// equal under ASCII case folding.  The result equals the string length
+/// exactly when both strings are entirely ASCII and match caselessly.
+/// Asserts that `str_a.len == str_b.len`.
+pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
+    std.debug.assert(str_a.len == str_b.len);
+    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
+    const Vec = @Vector(vec_len, u8);
+    const BVec = @Vector(vec_len, bool);
+    const msb: Vec = @splat(@as(u8, 0x80));
+    const case_bit: Vec = @splat(@as(u8, 0x20));
+    const vec0: Vec = @splat(@as(u8, 0));
+    const lower_a: Vec = @splat(@as(u8, 'a'));
+    const lower_z: Vec = @splat(@as(u8, 'z'));
+    var rem_a = str_a;
+    var rem_b = str_b;
+    while (rem_a.len >= vec_len) {
+        const a: Vec = rem_a[0..vec_len].*;
+        const b: Vec = rem_b[0..vec_len].*;
+        // ASCII gate: MSB must be 0 in both.
+        const is_ascii: BVec = ((a | b) & msb) == vec0;
+        const xor: Vec = a ^ b;
+        const exact: BVec = xor == vec0;
+        const case_diff: BVec = xor == case_bit;
+        // Letter test (only needed when case_diff).  When only the case bit
+        // differs, `a | b` is the candidate lowercase form.  It is compared
+        // as a whole byte: checking just its low five bits would also accept
+        // non-letter pairs such as '1' (0x31) and 0x11.
+        const folded: Vec = a | b;
+        const is_letter: BVec =
+            (folded >= lower_a) & (folded <= lower_z);
+        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));
+        if (!@reduce(.And, matched)) break;
+        rem_a = rem_a[vec_len..];
+        rem_b = rem_b[vec_len..];
+    }
+    // Tail: either fewer than `vec_len` bytes remain, or the current chunk
+    // holds the first mismatch; the scalar loop finds the exact position.
+    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
+}
+/// Scalar form of `caselessCmpLen`: returns the index of the first byte pair
+/// which is non-ASCII or not equal under ASCII case folding, or `str_a.len`
+/// if there is no such pair.
+inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
+    for (str_a, str_b, 0..) |a, b, i| {
+        // High bit set in either byte: not ASCII.
+        if (((a | b) & 0x80) != 0) return i;
+        const xor = a ^ b;
+        if (xor == 0) continue; // Match
+        if (xor != 0x20) return i; // Not the upcase bit.
+        // Only the case bit differs, so `a | b` is the candidate lowercase
+        // form; it must actually be a letter.
+        const folded = a | b;
+        if (folded < 'a' or folded > 'z') return i; // Not a letter
+    } else return str_a.len;
+}
+test caselessCmpNoSimd {
+    const hi_l = "Hello, World!";
+    const hi_h = "HeLlO, wOrLd!";
+    try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));
+    // Bytes differing only in bit 0x20 match only when they are letters.
+    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("1", "\x11"));
+    try testing.expectEqual(@as(usize, 0), caselessCmpNoSimd("@", "`"));
+    try testing.expectEqual(@as(usize, 2), caselessCmpNoSimd("ab,", "AB\x0c"));
+}
+test caselessCmpLen {
+    const hi_l = "Hello, World!" ** 25;
+    const hi_h = "HeLlO, wOrLd!" ** 25;
+    try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));
+    // A non-letter pair inside the first full vector must stop the match there.
+    const digit = "abc1" ++ ("x" ** 200);
+    const ctrl = "ABC\x11" ++ ("X" ** 200);
+    try testing.expectEqual(@as(usize, 3), caselessCmpLen(digit, ctrl));
+}
 test "isAsciiOnly" {
    const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
    try testing.expect(isAsciiOnly(ascii_only));
diff --git a/src/code_point.zig b/src/code_point.zig
index 7a638af..5f6c61c 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -30,13 +30,8 @@ pub const CodePoint = struct {
 /// This function has been removed: any call to it is a compile error.
 /// Use `decodeAtIndex` or `decodeAtCursor`.
 pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
-    var off: uoffset = 0;
+    _ = .{ bytes, offset };
-    var maybe_code = decodeAtCursor(bytes, &off);
+    @compileError("decode has been removed, use `decodeAtIndex`.");
-    if (maybe_code) |*code| {
-        code.offset = offset;
-        return code.*;
-    }
-    return null;
 }
 /// Return the codepoint at `index`, even if `index` is in the middle
@@ -292,7 +287,7 @@ pub const ReverseIterator = struct {
        else
            iter.i = null;
-        return decode(iter.bytes[i_prev..], i_prev);
+        return decodeAtIndex(iter.bytes, i_prev);
    }
    pub fn peek(iter: *ReverseIterator) ?CodePoint {
@@ -319,7 +314,7 @@ inline fn followbyte(b: u8) bool {
 test "decode" {
    const bytes = "🌩️";
-    const res = decode(bytes, 0);
+    const res = decodeAtIndex(bytes, 0);
    if (res) |cp| {
        try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);
diff --git a/src/zg.zig b/src/zg.zig
new file mode 100644
index 0000000..2974320
--- /dev/null
+++ b/src/zg.zig
@@ -0,0 +1,14 @@
+//! zg: a Zig-native Unicode Module
+// Each declaration below re-exports one imported module under a snake_case name.
+pub const ascii = @import("ascii");
+pub const case_folding = @import("CaseFolding");
+pub const code_point = @import("code_point");
+pub const display_width = @import("DisplayWidth");
+pub const emoji = @import("Emoji");
+pub const general_categories = @import("GeneralCategories");
+pub const graphemes = @import("Graphemes");
+pub const letter_casing = @import("LetterCasing");
+pub const normalize = @import("Normalize");
+pub const properties = @import("Properties");
+pub const scripts = @import("Scripts");
+pub const words = @import("Words");
author	Sam Atman	2026-02-06 13:07:03 -0500
committer	Sam Atman	2026-02-06 13:07:03 -0500
commit	b823a49b6a57bc1736b33a0816b42aaaf86cf839 (patch)
tree	533a2ffff737ba2826456fecb01bf3eb187b872a /src
parent	Slightly better hash reduction for comptime_map (diff)
download	zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.tar.gz zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.tar.xz zg-b823a49b6a57bc1736b33a0816b42aaaf86cf839.zip

diff --git a/src/CanonData.zig b/src/CanonData.zig index 5c1ffa6..144346c 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig
@@ -5,6 +5,12 @@ const Data = struct {
5	s2: []const @import("canon").Canonicalization = undefined,	5	s2: []const @import("canon").Canonicalization = undefined,
6	};	6	};
7		7
		8	// Canonicalization looks like this:
		9	// const Canonicalization = struct {
		10	// len: u3 = 0,
		11	// cps: [2]u21 = [_]u21{0} ** 2,
		12	// };
		13
8	const canon_data = canon_data: {	14	const canon_data = canon_data: {
9	const canon_ = @import("canon");	15	const canon_ = @import("canon");
10	break :canon_data Data{	16	break :canon_data Data{
@@ -17,7 +23,7 @@ const CanonData = @This();
17		23
18	// There's a bug here, which is down to how static u21 vs. runtime are handled,	24	// There's a bug here, which is down to how static u21 vs. runtime are handled,
19	// the "unique representation" claim is not working out. AutoHash casts to bytes,	25	// the "unique representation" claim is not working out. AutoHash casts to bytes,
20	// and that won't fly. So we do this:	26	// and that won't fly. So we do a simple custom context which works for both.
21		27
22	const Context = struct {	28	const Context = struct {
23	pub fn hash(_: Context, cps: [2]u21) u64 {	29	pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -52,3 +58,7 @@ const std = @import("std");
52	const builtin = @import("builtin");	58	const builtin = @import("builtin");
53	const mem = std.mem;	59	const mem = std.mem;
54	const comptime_map = @import("comptime_map.zig");	60	const comptime_map = @import("comptime_map.zig");
		61
		62	test {
		63	_ = comptime_map;
		64	}


diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index d69cddc..b7aa020 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig
@@ -103,7 +103,16 @@ pub fn compatCaselessMatch(
103	a: []const u8,	103	a: []const u8,
104	b: []const u8,	104	b: []const u8,
105	) Allocator.Error!bool {	105	) Allocator.Error!bool {
106	if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);	106	var a_in = a;
		107	var b_in = b;
		108
		109	// Ascii short path. Only applies if they're the same length:
		110	if (a_in.len == b_in.len) {
		111	const prefix = ascii.caselessCmpLen(a_in, b_in);
		112	if (prefix == a_in.len) return true;
		113	a_in = a_in[prefix..];
		114	b_in = b_in[prefix..];
		115	}
107		116
108	// Process a	117	// Process a
109	const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);	118	const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
@@ -192,10 +201,19 @@ pub fn canonCaselessMatch(
192	a: []const u8,	201	a: []const u8,
193	b: []const u8,	202	b: []const u8,
194	) Allocator.Error!bool {	203	) Allocator.Error!bool {
195	if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);	204	var a_in = a;
		205	var b_in = b;
		206
		207	// Ascii short path. Only applies if they're the same length:
		208	if (a_in.len == b_in.len) {
		209	const prefix = ascii.caselessCmpLen(a_in, b_in);
		210	if (prefix == a_in.len) return true;
		211	a_in = a_in[prefix..];
		212	b_in = b_in[prefix..];
		213	}
196		214
197	// Process a	215	// Process a
198	const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);	216	const nfd_a = try Normalize.nfxdCodePoints(allocator, a_in, .nfd);
199	defer allocator.free(nfd_a);	217	defer allocator.free(nfd_a);
200		218
201	var need_free_cf_nfd_a = false;	219	var need_free_cf_nfd_a = false;
@@ -215,7 +233,7 @@ pub fn canonCaselessMatch(
215	defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);	233	defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
216		234
217	// Process b	235	// Process b
218	const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);	236	const nfd_b = try Normalize.nfxdCodePoints(allocator, b_in, .nfd);
219	defer allocator.free(nfd_b);	237	defer allocator.free(nfd_b);
220		238
221	var need_free_cf_nfd_b = false;	239	var need_free_cf_nfd_b = false;


diff --git a/src/ascii.zig b/src/ascii.zig index 6c28f25..5b91348 100644 --- a/src/ascii.zig +++ b/src/ascii.zig
@@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool {
25	return true;	25	return true;
26	}	26	}
27		27
		28	/// Do a caseless comparison, with SIMD if possible. Strings must be of equal
		29	/// length. Returns how many bytes are case-fold-matched ASCII, this will be
		30	/// equal to the string length if they match.
		31	pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
		32	std.debug.assert(str_a.len == str_b.len);
		33	const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
		34	const Vec = @Vector(vec_len, u8);
		35	const BVec = @Vector(vec_len, bool);
		36
		37	const msb: Vec = @splat(@as(u8, 0x80));
		38	const case_bit: Vec = @splat(@as(u8, 0x20));
		39	const low5: Vec = @splat(@as(u8, 0x1f));
		40	const vec0: Vec = @splat(@as(u8, 0));
		41	const vec1: Vec = @splat(@as(u8, 1));
		42	const vec26: Vec = @splat(@as(u8, 26));
		43
		44	var rem_a = str_a;
		45	var rem_b = str_b;
		46
		47	while (rem_a.len >= vec_len) {
		48	const a: Vec = rem_a[0..vec_len].*;
		49	const b: Vec = rem_b[0..vec_len].*;
		50	// ASCII gate: MSB must be 0 in both.
		51	const is_ascii: BVec = ((a \| b) & msb) == vec0;
		52
		53	const xor: Vec = a ^ b;
		54	const exact: BVec = xor == vec0;
		55	const case_diff: BVec = xor == case_bit;
		56
		57	// Letter test (only needed when case_diff).
		58	const x: Vec = (a \| b) & low5;
		59	const is_letter: BVec =
		60	(x >= vec1) & (x <= vec26);
		61
		62	const matched: BVec = is_ascii & (exact \| (case_diff & is_letter));
		63
		64	if (!@reduce(.And, matched)) break;
		65	rem_a = rem_a[vec_len..];
		66	rem_b = rem_b[vec_len..];
		67	}
		68
		69	// Tail
		70	return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
		71	}
		72
		73	inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
		74	for (str_a, str_b, 0..) \|a, b, i\| {
		75	// High?
		76	if (((a \| b) & 0x80) != 0) return i;
		77	const xor = a ^ b;
		78	if (xor == 0) continue; // Match
		79	if (xor != 0x20) return i; // Not the upcase bit.
		80
		81	const lo = a \| b;
		82	const x = lo & 0x1f;
		83	if (x < 1 or x > 26) return i; // Not a letter
		84	} else return str_a.len;
		85	}
		86
		87	test caselessCmpNoSimd {
		88	const hi_l = "Hello, World!";
		89	const hi_h = "HeLlO, wOrLd!";
		90	try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h));
		91	}
		92
		93	test caselessCmpLen {
		94	const hi_l = "Hello, World!" ** 25;
		95	const hi_h = "HeLlO, wOrLd!" ** 25;
		96	try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h));
		97	}
		98
28	test "isAsciiOnly" {	99	test "isAsciiOnly" {
29	const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";	100	const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+";
30	try testing.expect(isAsciiOnly(ascii_only));	101	try testing.expect(isAsciiOnly(ascii_only));


diff --git a/src/code_point.zig b/src/code_point.zig index 7a638af..5f6c61c 100644 --- a/src/code_point.zig +++ b/src/code_point.zig
@@ -30,13 +30,8 @@ pub const CodePoint = struct {
30	/// This function is deprecated and will be removed in a later release.	30	/// This function is deprecated and will be removed in a later release.
31	/// Use `decodeAtIndex` or `decodeAtCursor`.	31	/// Use `decodeAtIndex` or `decodeAtCursor`.
32	pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {	32	pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
33	var off: uoffset = 0;	33	_ = .{ bytes, offset };
34	var maybe_code = decodeAtCursor(bytes, &off);	34	@compileError("decode has been removed, use `decodeAtIndex`.");
35	if (maybe_code) \|*code\| {
36	code.offset = offset;
37	return code.*;
38	}
39	return null;
40	}	35	}
41		36
42	/// Return the codepoint at `index`, even if `index` is in the middle	37	/// Return the codepoint at `index`, even if `index` is in the middle
@@ -292,7 +287,7 @@ pub const ReverseIterator = struct {
292	else	287	else
293	iter.i = null;	288	iter.i = null;
294		289
295	return decode(iter.bytes[i_prev..], i_prev);	290	return decodeAtIndex(iter.bytes, i_prev);
296	}	291	}
297		292
298	pub fn peek(iter: *ReverseIterator) ?CodePoint {	293	pub fn peek(iter: *ReverseIterator) ?CodePoint {
@@ -319,7 +314,7 @@ inline fn followbyte(b: u8) bool {
319		314
320	test "decode" {	315	test "decode" {
321	const bytes = "🌩️";	316	const bytes = "🌩️";
322	const res = decode(bytes, 0);	317	const res = decodeAtIndex(bytes, 0);
323		318
324	if (res) \|cp\| {	319	if (res) \|cp\| {
325	try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);	320	try std.testing.expectEqual(@as(u21, 0x1F329), cp.code);


diff --git a/src/zg.zig b/src/zg.zig new file mode 100644 index 0000000..2974320 --- /dev/null +++ b/src/zg.zig
@@ -0,0 +1,14 @@
		1	//! zg: a Zig-native Unicode Module
		2
		3	pub const ascii = @import("ascii");
		4	pub const case_folding = @import("CaseFolding");
		5	pub const code_point = @import("code_point");
		6	pub const display_width = @import("DisplayWidth");
		7	pub const emoji = @import("Emoji");
		8	pub const general_categories = @import("GeneralCategories");
		9	pub const graphemes = @import("Graphemes");
		10	pub const letter_casing = @import("LetterCasing");
		11	pub const normalize = @import("Normalize");
		12	pub const properties = @import("Properties");
		13	pub const scripts = @import("Scripts");
		14	pub const words = @import("Words");