From b823a49b6a57bc1736b33a0816b42aaaf86cf839 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 6 Feb 2026 13:07:03 -0500 Subject: zg module, casing improvements --- NEWS.md | 94 ++++++++++++++++++++++++++++++++++++++++++----------- build.zig | 18 ++++++++++ src/CanonData.zig | 12 ++++++- src/CaseFolding.zig | 26 ++++++++++++--- src/ascii.zig | 71 ++++++++++++++++++++++++++++++++++++++++ src/code_point.zig | 13 +++----- src/zg.zig | 14 ++++++++ 7 files changed, 215 insertions(+), 33 deletions(-) create mode 100644 src/zg.zig diff --git a/NEWS.md b/NEWS.md index c484b62..ada1405 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # News + ## zg v0.16.0-pre Release Notes This brings another major change to `zg`, touching basically everything. @@ -24,6 +25,10 @@ I did manage to dispose of one last allocation in `CanonData`, by integrating Vexu's very clever [comptime hash map][chm], so, that's nice. +[uucode]: https://github.com/jacobsandlund/uucode +[chm]: https://github.com/Vexu/comptime_hash_map/blob/master/src/main.zig + + ### Migration Is simplicity itself: just call the module instead of calling the @@ -33,41 +38,73 @@ disposing of it when you're done with it. Technically the laws of style dictate that, since these are now containers, and not instantiable types, they should be lowercased. -I did not do that in the README. But feel free! +I didn't see the point in changing all those names, it would add labor +to what should be a very brief and pleasant upgrade (but see below). +But feel free! Pro tip: use LSP superpowers to rename the instance to the name of the module, then just delete the initializer. Couldn't be simpler. + +### zg: The Module + +The take-what-you-need approach, of packaging the interface in a bunch of +separate modules, remains available for those who prefer it. + +Or, your code can just import `"zg"`, a module containing all of the +other modules. 
Zig's lazy compilation model gives us take-what-you-need +already, so while there's no reason to remove the submodules, there's no +reason to prefer using them either. + +As mentioned above, none of these are instance types any longer, and that +dictates that they take lower-case (as `code_point` and `ascii` always +have, for that reason). So in `zg`, the modules are styled in lower case. + +I did not want to combine a purely stylistic change, and one which would +require editing build scripts, with the functional changes needed to use +the (much nicer!) allocation-free interface. It is possible that later +releases will lowercase the submodules as well, or maybe just remove +them in favor of importing `zg`. Then again, maybe not. + + ### Emoji Module -Also Jacob's work. +Also Jacob's work. Exposes the basic useful Unicode emoji properties. -[uucode]: https://github.com/jacobsandlund/uucode -[chm]: https://github.com/Vexu/comptime_hash_map/blob/master/src/main.zig -### graphemeClusterWidth +### `graphemeClusterWidth` @lch361 submitted a minor refactor which makes it cleaner to obtain the display width of a grapheme cluster. Thanks Lich! -### The Future -I hope I don't jinx it, but this is the last major change I've wanted -to make to `zg`. I brought it up with José before he handed over -maintainence to me, we agreed that compressing and decompressing and -heap-allocating was one of those ideas which turned out not to pencil -out. Hey, it happens. +### Better Fast-pathing in Caseless Comparison + +Caseless comparison only tries the ASCII fast-path when strings are the +same size, which is the only time it can work. The fast path has also +been SIMD accelerated when possible. + +Canonicalization, and caseless comparison (which uses it), are in need +of attention. They do things in the most expensive possible fashion, +without taking advantage of any opportunities to do the cheaper thing. 
+While the result is correct, even in pathological cases, it is not +optimal, especially given the reality that Unicode text is, in a modern +context, nearly always in canonical form already. + +Changes to that will have to wait for another release, despite my +inclinations to the contrary. + + +### code_point.decode fully deprecated -Features? There are few left, sure. Bugfixes? Always, of course. -But major API changes are now unlikely. +Slicing to decode a point is an anti-pattern, and calling this +deprecated function is now a `@compileError`, suggesting `decodeAtIndex` +instead. I suggest taking a look at `decodeAtCursor` as well, which +takes a pointer to an index and moves it to the next codepoint while +decoding, this is often what you want. -I have a "no 1.0 until Zig 1.0" policy, because no one can guarantee the -stability of anything until that happens. Nor is `zg` actually ready -for a stability policy; if I decide to break apart a module, or move -something where it makes more sense, I'm going to do that. +A future release will remove the function entirely. -But it is _likely_ that this is the last 'global' refactor of the -library. ## zg v0.15.2-4 Release Notes @@ -82,6 +119,7 @@ but we could have vendored it: more importantly, it turned out to be basically useless. Savings per data set were in the bytes to low KiB range, and startup time was negatively affected. + ## zg v0.14.1 Release Notes In a flurry of activity during and after the `v0.14.0` beta, several @@ -92,6 +130,7 @@ Presenting `zg v0.14.1`. As should be expected from a patch release, there are no breaking changes to the interface, just bug fixes and features. + ### Grapheme Zalgo Text Bugfix Until this release, `zg` was using a `u8` to store the length of a @@ -109,6 +148,7 @@ Actually, both fields are now `uoffset`, for reasons described next. 
[Zalgo]: https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 + ### Limits Section Added to README The README now clearly documents that some data structures and iterators @@ -125,6 +165,7 @@ I believe this to be neither necessary nor sufficient for handling data of that size. But I can't anticipate every requirement, and don't want to preclude it as a solution. + ### Iterators, Back and Forth A new contributor, Nemoos, took on the challenge of adding a reverse @@ -158,6 +199,7 @@ of those functions. For codepoints, starting an iterator at either `.offset` or `.offset + .len` will suffice, since the `CodePoint` iterator is otherwise stateless. + ### Words Module The [Unicode annex][tr29] with the canonical grapheme segmentation @@ -173,6 +215,7 @@ segmentation and [line breaking][tr14]. [tr29]: https://www.unicode.org/reports/tr29/ [tr14]: https://www.unicode.org/reports/tr14/ + #### Runeset Used As a point of interest: @@ -190,6 +233,7 @@ transitive dependencies. [UCD]: https://www.unicode.org/reports/tr44/ [Rune]: https://github.com/mnemnion/runeset + ## zg v0.14.0 Release Notes This is the first minor point release since Sam Atman (me) took over @@ -201,12 +245,14 @@ The changes are fairly large, and most user code will need to be updated. The result is substantially streamlined and easier to use, and updating will mainly take place around importing, creating, and deinitializing. + ### The Great Renaming The most obvious change is on the surface API: more than half of the modules have been renamed. There are no user-facing modules with `Data` in the name, and some abbreviations have been spelled in full. + ### No More Separation of Data and Functionality It is no longer necessary to separately create, for example, a @@ -222,6 +268,7 @@ This would make user structs larger in some cases, while eliminating a pointer chase. If that isn't a desirable trade off for your code, read on. 
+ ### All Allocated Data is Unmanaged Prior to `v0.14`, all structs which need heap allocation no longer @@ -235,6 +282,7 @@ Getting up to speed is a matter of passing the allocator to `deinit`. This change comes courtesy of [lch361](https://lch361.net), in his first contribution to the repo. Thanks Lich! + ### `code_point` Now Unicode-Compliant The `v0.15.x` decoder used a simple, fast, but naïve method to decode @@ -267,6 +315,7 @@ is good for the fetch pipeline, and more ergonomic in many cases. [^1]: A bit more than twice as fast as the standard library for decoding, according to my (limited) benchmarks. + ### DisplayWidth and CaseFolding Can Share Data Both of these modules use another module to get the job done, @@ -276,6 +325,7 @@ It is now possible to initialize them with a borrowed copy of those modules, to make it simpler to write code which also needs the base modules. + ### Grapheme Iterator Creation This is a modest streamlining of how a grapheme iterator is created. @@ -304,6 +354,7 @@ var iter = Graphemes.Iterator.init("stri̵̫̗̗̱̳̼̖͚͉͂̌̈́̓̄͋̇̎͠ͅ If one were to prefer doing so. + ### Initialization vs. Setup Every allocating module now has both an `init` function, which @@ -321,6 +372,7 @@ have been turned `unreachable`, leaving only `error.OutOfMemory`. Encountering any of the other errors would indicate an internal problem, so we no longer make user code deal with that unlikely event. + ### New DisplayWidth options A `DisplayWidth` can now be compiled to treat `c0` and `c1` control codes @@ -330,18 +382,21 @@ need to escape control codes to make them visible. Setting these options will let `DisplayWidth` return the correct widths when this is done. + ### Unicode 16.0 This updates `zg` to use the latest Unicode edition. This should be the only change which will change behavior of user code, other than through the use of the new `DisplayWidth` options. + ### Tests It is now possible to run all the tests, not just the `unicode-test` subset. 
Accordingly, that step is removed, and `zig build test` runs everything. + #### Allocations Tested Every allocate-able now has a `checkAllAllocationFailures` test. This @@ -349,6 +404,7 @@ process turned up two bugs. Also discovered were 8,663 allocations, which were reduced to two, these were also being individually freed on deinit. So that's nice. + #### That's It! I hope you find converting over `zg v0.13` code to be fairly painless diff --git a/build.zig b/build.zig index ee2a6ec..694d887 100644 --- a/build.zig +++ b/build.zig @@ -491,6 +491,24 @@ pub fn build(b: *std.Build) void { properties.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); properties.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); + const zg_module = b.addModule("zg", .{ + .root_source_file = b.path("src/zg.zig"), + .target = target, + .optimize = optimize, + }); + zg_module.addImport("ascii", ascii); + zg_module.addImport("CaseFolding", case_fold); + zg_module.addImport("code_point", code_point); + zg_module.addImport("DisplawWidth", display_width); + zg_module.addImport("Emoji", emoji); + zg_module.addImport("GeneralCategories", gencat); + zg_module.addImport("Graphemes", graphemes); + zg_module.addImport("LetterCasing", letter_case); + zg_module.addImport("Normalize", norm); + zg_module.addImport("Properties", properties); + zg_module.addImport("Scripts", scripts); + zg_module.addImport("Words", words); + const properties_t = b.addTest(.{ .name = "properties", .root_module = properties, diff --git a/src/CanonData.zig b/src/CanonData.zig index 5c1ffa6..144346c 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -5,6 +5,12 @@ const Data = struct { s2: []const @import("canon").Canonicalization = undefined, }; +// Canonicalization looks like this: +// const Canonicalization = struct { +// len: u3 = 0, +// cps: [2]u21 = [_]u21{0} ** 2, +// }; + const canon_data = canon_data: { const canon_ = @import("canon"); break :canon_data Data{ @@ -17,7 +23,7 
@@ const CanonData = @This(); // There's a bug here, which is down to how static u21 vs. runtime are handled, // the "unique representation" claim is not working out. AutoHash casts to bytes, -// and that won't fly. So we do this: +// and that won't fly. So we do a simple custom context which works for both. const Context = struct { pub fn hash(_: Context, cps: [2]u21) u64 { @@ -52,3 +58,7 @@ const std = @import("std"); const builtin = @import("builtin"); const mem = std.mem; const comptime_map = @import("comptime_map.zig"); + +test { + _ = comptime_map; +} diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index d69cddc..b7aa020 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig @@ -103,7 +103,16 @@ pub fn compatCaselessMatch( a: []const u8, b: []const u8, ) Allocator.Error!bool { - if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); + var a_in = a; + var b_in = b; + + // Ascii short path. Only applies if they're the same length: + if (a_in.len == b_in.len) { + const prefix = ascii.caselessCmpLen(a_in, b_in); + if (prefix == a_in.len) return true; + a_in = a_in[prefix..]; + b_in = b_in[prefix..]; + } // Process a const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd); @@ -192,10 +201,19 @@ pub fn canonCaselessMatch( a: []const u8, b: []const u8, ) Allocator.Error!bool { - if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); + var a_in = a; + var b_in = b; + + // Ascii short path. 
/// Do a caseless comparison of two ASCII prefixes, with SIMD if possible.
///
/// Strings must be of equal length (asserted). Returns how many leading bytes
/// are case-fold-matched ASCII; this equals the string length if the whole of
/// both strings match. Comparison stops at the first byte pair which is
/// non-ASCII in either string, or which differs by anything other than the
/// case of an ASCII letter.
pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize {
    std.debug.assert(str_a.len == str_b.len);
    const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b);
    const Vec = @Vector(vec_len, u8);
    const BVec = @Vector(vec_len, bool);

    const msb: Vec = @splat(@as(u8, 0x80));
    const case_bit: Vec = @splat(@as(u8, 0x20));
    const vec0: Vec = @splat(@as(u8, 0));
    const lower_a: Vec = @splat(@as(u8, 'a'));
    const lower_z: Vec = @splat(@as(u8, 'z'));

    var rem_a = str_a;
    var rem_b = str_b;

    while (rem_a.len >= vec_len) {
        const a: Vec = rem_a[0..vec_len].*;
        const b: Vec = rem_b[0..vec_len].*;
        const either: Vec = a | b;
        // ASCII gate: MSB must be 0 in both.
        const is_ascii: BVec = (either & msb) == vec0;

        const xor: Vec = a ^ b;
        const exact: BVec = xor == vec0;
        const case_diff: BVec = xor == case_bit;

        // Letter test (only meaningful when case_diff). When the bytes differ
        // in exactly the 0x20 bit, `a | b` is the one with that bit set, so
        // the pair is a letter pair iff `a | b` is a lowercase letter.
        // Checking only the low five bits is not enough: it would also accept
        // control/punctuation pairs such as 0x0A ('\n') and 0x2A ('*').
        const is_letter: BVec =
            (either >= lower_a) & (either <= lower_z);

        const matched: BVec = is_ascii & (exact | (case_diff & is_letter));

        // Any miss in this chunk: let the scalar tail find the exact index.
        if (!@reduce(.And, matched)) break;
        rem_a = rem_a[vec_len..];
        rem_b = rem_b[vec_len..];
    }

    // Tail (or the chunk containing the first mismatch).
    return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b);
}

/// Scalar counterpart of `caselessCmpLen`: returns the index of the first
/// byte pair which is not a caseless ASCII match, or the string length if
/// every pair matches. Both slices must have the same length.
inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize {
    for (str_a, str_b, 0..) |a, b, i| {
        const either = a | b;
        // High bit set in either byte: not ASCII.
        if ((either & 0x80) != 0) return i;
        const xor = a ^ b;
        if (xor == 0) continue; // Exact match
        if (xor != 0x20) return i; // Differs by more than the case bit.

        // The bytes differ only in the case bit, so `either` is the variant
        // with 0x20 set; it must be a lowercase letter for this to be a
        // genuine upper/lower pair.
        if (either < 'a' or either > 'z') return i; // Not a letter
    }
    return str_a.len;
}
pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
    // Deprecated entry point, kept only so that existing callers get a
    // pointed compile-time diagnostic instead of an "undeclared identifier"
    // error. Any reference to this function fails the build.
    _ = .{ bytes, offset };
    @compileError("decode is deprecated, use `decodeAtIndex` or `decodeAtCursor`.");
}