diff options
| -rw-r--r-- | NEWS.md | 94 | ||||
| -rw-r--r-- | build.zig | 18 | ||||
| -rw-r--r-- | src/CanonData.zig | 12 | ||||
| -rw-r--r-- | src/CaseFolding.zig | 26 | ||||
| -rw-r--r-- | src/ascii.zig | 71 | ||||
| -rw-r--r-- | src/code_point.zig | 13 | ||||
| -rw-r--r-- | src/zg.zig | 14 |
7 files changed, 215 insertions, 33 deletions
| @@ -1,5 +1,6 @@ | |||
| 1 | # News | 1 | # News |
| 2 | 2 | ||
| 3 | |||
| 3 | ## zg v0.16.0-pre Release Notes | 4 | ## zg v0.16.0-pre Release Notes |
| 4 | 5 | ||
| 5 | This brings another major change to `zg`, touching basically everything. | 6 | This brings another major change to `zg`, touching basically everything. |
| @@ -24,6 +25,10 @@ I did manage to dispose of one last allocation in `CanonData`, by | |||
| 24 | integrating Vexu's very clever [comptime hash map][chm], so, that's | 25 | integrating Vexu's very clever [comptime hash map][chm], so, that's |
| 25 | nice. | 26 | nice. |
| 26 | 27 | ||
| 28 | [uucode]: https://github.com/jacobsandlund/uucode | ||
| 29 | [chm]: https://github.com/Vexu/comptime_hash_map/blob/master/src/main.zig | ||
| 30 | |||
| 31 | |||
| 27 | ### Migration | 32 | ### Migration |
| 28 | 33 | ||
| 29 | Is simplicity itself: just call the module instead of calling the | 34 | Is simplicity itself: just call the module instead of calling the |
| @@ -33,41 +38,73 @@ disposing of it when you're done with it. | |||
| 33 | Technically the laws of style dictate that, since these are now | 38 | Technically the laws of style dictate that, since these are now |
| 34 | containers, and not instantiable types, they should be lowercased. | 39 | containers, and not instantiable types, they should be lowercased. |
| 35 | 40 | ||
| 36 | I did not do that in the README. But feel free! | 41 | I didn't see the point in changing all those names; it would add labor |
| 42 | to what should be a very brief and pleasant upgrade (but see below). | ||
| 43 | But feel free! | ||
| 37 | 44 | ||
| 38 | Pro tip: use LSP superpowers to rename the instance to the name of the | 45 | Pro tip: use LSP superpowers to rename the instance to the name of the |
| 39 | module, then just delete the initializer. Couldn't be simpler. | 46 | module, then just delete the initializer. Couldn't be simpler. |
| 40 | 47 | ||
| 48 | |||
| 49 | ### zg: The Module | ||
| 50 | |||
| 51 | The take-what-you-need approach, of packaging the interface in a bunch of | ||
| 52 | separate modules, remains available for those who prefer it. | ||
| 53 | |||
| 54 | Or, your code can just import `"zg"`, a module containing all of the | ||
| 55 | other modules. Zig's lazy compilation model gives us take-what-you-need | ||
| 56 | already, so while there's no reason to remove the submodules, there's no | ||
| 57 | reason to prefer using them either. | ||
| 58 | |||
| 59 | As mentioned above, none of these are instance types any longer, and that | ||
| 60 | dictates that they take lower-case (as `code_point` and `ascii` always | ||
| 61 | have, for that reason). So in `zg`, the modules are styled in lower case. | ||
| 62 | |||
| 63 | I did not want to combine a purely stylistic change, and one which would | ||
| 64 | require editing build scripts, with the functional changes needed to use | ||
| 65 | the (much nicer!) allocation-free interface. It is possible that later | ||
| 66 | releases will lowercase the submodules as well, or maybe just remove | ||
| 67 | them in favor of importing `zg`. Then again, maybe not. | ||
| 68 | |||
| 69 | |||
| 41 | ### Emoji Module | 70 | ### Emoji Module |
| 42 | 71 | ||
| 43 | Also Jacob's work. | 72 | Also Jacob's work. Exposes the basic useful Unicode emoji properties. |
| 44 | 73 | ||
| 45 | [uucode]: https://github.com/jacobsandlund/uucode | ||
| 46 | [chm]: https://github.com/Vexu/comptime_hash_map/blob/master/src/main.zig | ||
| 47 | 74 | ||
| 48 | ### graphemeClusterWidth | 75 | ### `graphemeClusterWidth` |
| 49 | 76 | ||
| 50 | @lch361 submitted a minor refactor which makes it cleaner to obtain the | 77 | @lch361 submitted a minor refactor which makes it cleaner to obtain the |
| 51 | display width of a grapheme cluster. Thanks Lich! | 78 | display width of a grapheme cluster. Thanks Lich! |
| 52 | 79 | ||
| 53 | ### The Future | ||
| 54 | 80 | ||
| 55 | I hope I don't jinx it, but this is the last major change I've wanted | 81 | ### Better Fast-pathing in Caseless Comparison |
| 56 | to make to `zg`. I brought it up with José before he handed over | 82 | |
| 57 | maintainence to me, we agreed that compressing and decompressing and | 83 | Caseless comparison only tries the ASCII fast-path when strings are the |
| 58 | heap-allocating was one of those ideas which turned out not to pencil | 84 | same size, which is the only time it can work. The fast path has also |
| 59 | out. Hey, it happens. | 85 | been SIMD accelerated when possible. |
| 86 | |||
| 87 | Canonicalization, and caseless comparison (which uses it), are in need | ||
| 88 | of attention. They do things in the most expensive possible fashion, | ||
| 89 | without taking advantage of any opportunities to do the cheaper thing. | ||
| 90 | While the result is correct, even in pathological cases, it is not | ||
| 91 | optimal, especially given the reality that Unicode text is, in a modern | ||
| 92 | context, nearly always in canonical form already. | ||
| 93 | |||
| 94 | Changes to that will have to wait for another release, despite my | ||
| 95 | inclinations to the contrary. | ||
| 96 | |||
| 97 | |||
| 98 | ### code_point.decode fully deprecated | ||
| 60 | 99 | ||
| 61 | Features? There are few left, sure. Bugfixes? Always, of course. | 100 | Slicing to decode a point is an anti-pattern, and calling this |
| 62 | But major API changes are now unlikely. | 101 | deprecated function is now a `@compileError`, suggesting `decodeAtIndex` |
| 102 | instead. I suggest taking a look at `decodeAtCursor` as well, which | ||
| 103 | takes a pointer to an index and moves it to the next codepoint while | ||
| 104 | decoding; this is often what you want. | ||
| 63 | 105 | ||
| 64 | I have a "no 1.0 until Zig 1.0" policy, because no one can guarantee the | 106 | A future release will remove the function entirely. |
| 65 | stability of anything until that happens. Nor is `zg` actually ready | ||
| 66 | for a stability policy; if I decide to break apart a module, or move | ||
| 67 | something where it makes more sense, I'm going to do that. | ||
| 68 | 107 | ||
| 69 | But it is _likely_ that this is the last 'global' refactor of the | ||
| 70 | library. | ||
| 71 | 108 | ||
| 72 | ## zg v0.15.2-4 Release Notes | 109 | ## zg v0.15.2-4 Release Notes |
| 73 | 110 | ||
| @@ -82,6 +119,7 @@ but we could have vendored it: more importantly, it turned out to be | |||
| 82 | basically useless. Savings per data set were in the bytes to low | 119 | basically useless. Savings per data set were in the bytes to low |
| 83 | KiB range, and startup time was negatively affected. | 120 | KiB range, and startup time was negatively affected. |
| 84 | 121 | ||
| 122 | |||
| 85 | ## zg v0.14.1 Release Notes | 123 | ## zg v0.14.1 Release Notes |
| 86 | 124 | ||
| 87 | In a flurry of activity during and after the `v0.14.0` beta, several | 125 | In a flurry of activity during and after the `v0.14.0` beta, several |
| @@ -92,6 +130,7 @@ Presenting `zg v0.14.1`. As should be expected from a patch release, | |||
| 92 | there are no breaking changes to the interface, just bug fixes and | 130 | there are no breaking changes to the interface, just bug fixes and |
| 93 | features. | 131 | features. |
| 94 | 132 | ||
| 133 | |||
| 95 | ### Grapheme Zalgo Text Bugfix | 134 | ### Grapheme Zalgo Text Bugfix |
| 96 | 135 | ||
| 97 | Until this release, `zg` was using a `u8` to store the length of a | 136 | Until this release, `zg` was using a `u8` to store the length of a |
| @@ -109,6 +148,7 @@ Actually, both fields are now `uoffset`, for reasons described next. | |||
| 109 | 148 | ||
| 110 | [Zalgo]: https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 | 149 | [Zalgo]: https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 |
| 111 | 150 | ||
| 151 | |||
| 112 | ### Limits Section Added to README | 152 | ### Limits Section Added to README |
| 113 | 153 | ||
| 114 | The README now clearly documents that some data structures and iterators | 154 | The README now clearly documents that some data structures and iterators |
| @@ -125,6 +165,7 @@ I believe this to be neither necessary nor sufficient for handling data of | |||
| 125 | that size. But I can't anticipate every requirement, and don't want to | 165 | that size. But I can't anticipate every requirement, and don't want to |
| 126 | preclude it as a solution. | 166 | preclude it as a solution. |
| 127 | 167 | ||
| 168 | |||
| 128 | ### Iterators, Back and Forth | 169 | ### Iterators, Back and Forth |
| 129 | 170 | ||
| 130 | A new contributor, Nemoos, took on the challenge of adding a reverse | 171 | A new contributor, Nemoos, took on the challenge of adding a reverse |
| @@ -158,6 +199,7 @@ of those functions. For codepoints, starting an iterator at either | |||
| 158 | `.offset` or `.offset + .len` will suffice, since the `CodePoint` | 199 | `.offset` or `.offset + .len` will suffice, since the `CodePoint` |
| 159 | iterator is otherwise stateless. | 200 | iterator is otherwise stateless. |
| 160 | 201 | ||
| 202 | |||
| 161 | ### Words Module | 203 | ### Words Module |
| 162 | 204 | ||
| 163 | The [Unicode annex][tr29] with the canonical grapheme segmentation | 205 | The [Unicode annex][tr29] with the canonical grapheme segmentation |
| @@ -173,6 +215,7 @@ segmentation and [line breaking][tr14]. | |||
| 173 | [tr29]: https://www.unicode.org/reports/tr29/ | 215 | [tr29]: https://www.unicode.org/reports/tr29/ |
| 174 | [tr14]: https://www.unicode.org/reports/tr14/ | 216 | [tr14]: https://www.unicode.org/reports/tr14/ |
| 175 | 217 | ||
| 218 | |||
| 176 | #### Runeset Used | 219 | #### Runeset Used |
| 177 | 220 | ||
| 178 | As a point of interest: | 221 | As a point of interest: |
| @@ -190,6 +233,7 @@ transitive dependencies. | |||
| 190 | [UCD]: https://www.unicode.org/reports/tr44/ | 233 | [UCD]: https://www.unicode.org/reports/tr44/ |
| 191 | [Rune]: https://github.com/mnemnion/runeset | 234 | [Rune]: https://github.com/mnemnion/runeset |
| 192 | 235 | ||
| 236 | |||
| 193 | ## zg v0.14.0 Release Notes | 237 | ## zg v0.14.0 Release Notes |
| 194 | 238 | ||
| 195 | This is the first minor point release since Sam Atman (me) took over | 239 | This is the first minor point release since Sam Atman (me) took over |
| @@ -201,12 +245,14 @@ The changes are fairly large, and most user code will need to be updated. | |||
| 201 | The result is substantially streamlined and easier to use, and updating | 245 | The result is substantially streamlined and easier to use, and updating |
| 202 | will mainly take place around importing, creating, and deinitializing. | 246 | will mainly take place around importing, creating, and deinitializing. |
| 203 | 247 | ||
| 248 | |||
| 204 | ### The Great Renaming | 249 | ### The Great Renaming |
| 205 | 250 | ||
| 206 | The most obvious change is on the surface API: more than half of the | 251 | The most obvious change is on the surface API: more than half of the |
| 207 | modules have been renamed. There are no user-facing modules with `Data` | 252 | modules have been renamed. There are no user-facing modules with `Data` |
| 208 | in the name, and some abbreviations have been spelled in full. | 253 | in the name, and some abbreviations have been spelled in full. |
| 209 | 254 | ||
| 255 | |||
| 210 | ### No More Separation of Data and Functionality | 256 | ### No More Separation of Data and Functionality |
| 211 | 257 | ||
| 212 | It is no longer necessary to separately create, for example, a | 258 | It is no longer necessary to separately create, for example, a |
| @@ -222,6 +268,7 @@ This would make user structs larger in some cases, while eliminating a | |||
| 222 | pointer chase. If that isn't a desirable trade off for your code, | 268 | pointer chase. If that isn't a desirable trade off for your code, |
| 223 | read on. | 269 | read on. |
| 224 | 270 | ||
| 271 | |||
| 225 | ### All Allocated Data is Unmanaged | 272 | ### All Allocated Data is Unmanaged |
| 226 | 273 | ||
| 227 | Prior to `v0.14`, all structs which need heap allocation no longer | 274 | Prior to `v0.14`, all structs which need heap allocation no longer |
| @@ -235,6 +282,7 @@ Getting up to speed is a matter of passing the allocator to `deinit`. | |||
| 235 | This change comes courtesy of [lch361](https://lch361.net), in his | 282 | This change comes courtesy of [lch361](https://lch361.net), in his |
| 236 | first contribution to the repo. Thanks Lich! | 283 | first contribution to the repo. Thanks Lich! |
| 237 | 284 | ||
| 285 | |||
| 238 | ### `code_point` Now Unicode-Compliant | 286 | ### `code_point` Now Unicode-Compliant |
| 239 | 287 | ||
| 240 | The `v0.15.x` decoder used a simple, fast, but naïve method to decode | 288 | The `v0.15.x` decoder used a simple, fast, but naïve method to decode |
| @@ -267,6 +315,7 @@ is good for the fetch pipeline, and more ergonomic in many cases. | |||
| 267 | [^1]: A bit more than twice as fast as the standard library for | 315 | [^1]: A bit more than twice as fast as the standard library for |
| 268 | decoding, according to my (limited) benchmarks. | 316 | decoding, according to my (limited) benchmarks. |
| 269 | 317 | ||
| 318 | |||
| 270 | ### DisplayWidth and CaseFolding Can Share Data | 319 | ### DisplayWidth and CaseFolding Can Share Data |
| 271 | 320 | ||
| 272 | Both of these modules use another module to get the job done, | 321 | Both of these modules use another module to get the job done, |
| @@ -276,6 +325,7 @@ It is now possible to initialize them with a borrowed copy of those | |||
| 276 | modules, to make it simpler to write code which also needs the base | 325 | modules, to make it simpler to write code which also needs the base |
| 277 | modules. | 326 | modules. |
| 278 | 327 | ||
| 328 | |||
| 279 | ### Grapheme Iterator Creation | 329 | ### Grapheme Iterator Creation |
| 280 | 330 | ||
| 281 | This is a modest streamlining of how a grapheme iterator is created. | 331 | This is a modest streamlining of how a grapheme iterator is created. |
| @@ -304,6 +354,7 @@ var iter = Graphemes.Iterator.init("stri̵̫̗̗̱̳̼̖͚͉͂̌̈́̓̄͋̇̎͠ͅ | |||
| 304 | 354 | ||
| 305 | If one were to prefer doing so. | 355 | If one were to prefer doing so. |
| 306 | 356 | ||
| 357 | |||
| 307 | ### Initialization vs. Setup | 358 | ### Initialization vs. Setup |
| 308 | 359 | ||
| 309 | Every allocating module now has both an `init` function, which | 360 | Every allocating module now has both an `init` function, which |
| @@ -321,6 +372,7 @@ have been turned `unreachable`, leaving only `error.OutOfMemory`. | |||
| 321 | Encountering any of the other errors would indicate an internal problem, | 372 | Encountering any of the other errors would indicate an internal problem, |
| 322 | so we no longer make user code deal with that unlikely event. | 373 | so we no longer make user code deal with that unlikely event. |
| 323 | 374 | ||
| 375 | |||
| 324 | ### New DisplayWidth options | 376 | ### New DisplayWidth options |
| 325 | 377 | ||
| 326 | A `DisplayWidth` can now be compiled to treat `c0` and `c1` control codes | 378 | A `DisplayWidth` can now be compiled to treat `c0` and `c1` control codes |
| @@ -330,18 +382,21 @@ need to escape control codes to make them visible. Setting these | |||
| 330 | options will let `DisplayWidth` return the correct widths when this | 382 | options will let `DisplayWidth` return the correct widths when this |
| 331 | is done. | 383 | is done. |
| 332 | 384 | ||
| 385 | |||
| 333 | ### Unicode 16.0 | 386 | ### Unicode 16.0 |
| 334 | 387 | ||
| 335 | This updates `zg` to use the latest Unicode edition. This should be | 388 | This updates `zg` to use the latest Unicode edition. This should be |
| 336 | the only change which will change behavior of user code, other than | 389 | the only change which will change behavior of user code, other than |
| 337 | through the use of the new `DisplayWidth` options. | 390 | through the use of the new `DisplayWidth` options. |
| 338 | 391 | ||
| 392 | |||
| 339 | ### Tests | 393 | ### Tests |
| 340 | 394 | ||
| 341 | It is now possible to run all the tests, not just the `unicode-test` | 395 | It is now possible to run all the tests, not just the `unicode-test` |
| 342 | subset. Accordingly, that step is removed, and `zig build test` | 396 | subset. Accordingly, that step is removed, and `zig build test` |
| 343 | runs everything. | 397 | runs everything. |
| 344 | 398 | ||
| 399 | |||
| 345 | #### Allocations Tested | 400 | #### Allocations Tested |
| 346 | 401 | ||
| 347 | Every allocate-able now has a `checkAllAllocationFailures` test. This | 402 | Every allocate-able now has a `checkAllAllocationFailures` test. This |
| @@ -349,6 +404,7 @@ process turned up two bugs. Also discovered were 8,663 allocations, | |||
| 349 | which were reduced to two, these were also being individually freed | 404 | which were reduced to two, these were also being individually freed |
| 350 | on deinit. So that's nice. | 405 | on deinit. So that's nice. |
| 351 | 406 | ||
| 407 | |||
| 352 | #### That's It! | 408 | #### That's It! |
| 353 | 409 | ||
| 354 | I hope you find converting over `zg v0.13` code to be fairly painless | 410 | I hope you find converting over `zg v0.13` code to be fairly painless |
| @@ -491,6 +491,24 @@ pub fn build(b: *std.Build) void { | |||
| 491 | properties.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); | 491 | properties.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); |
| 492 | properties.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); | 492 | properties.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); |
| 493 | 493 | ||
| 494 | const zg_module = b.addModule("zg", .{ | ||
| 495 | .root_source_file = b.path("src/zg.zig"), | ||
| 496 | .target = target, | ||
| 497 | .optimize = optimize, | ||
| 498 | }); | ||
| 499 | zg_module.addImport("ascii", ascii); | ||
| 500 | zg_module.addImport("CaseFolding", case_fold); | ||
| 501 | zg_module.addImport("code_point", code_point); | ||
| 502 | zg_module.addImport("DisplawWidth", display_width); | ||
| 503 | zg_module.addImport("Emoji", emoji); | ||
| 504 | zg_module.addImport("GeneralCategories", gencat); | ||
| 505 | zg_module.addImport("Graphemes", graphemes); | ||
| 506 | zg_module.addImport("LetterCasing", letter_case); | ||
| 507 | zg_module.addImport("Normalize", norm); | ||
| 508 | zg_module.addImport("Properties", properties); | ||
| 509 | zg_module.addImport("Scripts", scripts); | ||
| 510 | zg_module.addImport("Words", words); | ||
| 511 | |||
| 494 | const properties_t = b.addTest(.{ | 512 | const properties_t = b.addTest(.{ |
| 495 | .name = "properties", | 513 | .name = "properties", |
| 496 | .root_module = properties, | 514 | .root_module = properties, |
diff --git a/src/CanonData.zig b/src/CanonData.zig index 5c1ffa6..144346c 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig | |||
| @@ -5,6 +5,12 @@ const Data = struct { | |||
| 5 | s2: []const @import("canon").Canonicalization = undefined, | 5 | s2: []const @import("canon").Canonicalization = undefined, |
| 6 | }; | 6 | }; |
| 7 | 7 | ||
| 8 | // Canonicalization looks like this: | ||
| 9 | // const Canonicalization = struct { | ||
| 10 | // len: u3 = 0, | ||
| 11 | // cps: [2]u21 = [_]u21{0} ** 2, | ||
| 12 | // }; | ||
| 13 | |||
| 8 | const canon_data = canon_data: { | 14 | const canon_data = canon_data: { |
| 9 | const canon_ = @import("canon"); | 15 | const canon_ = @import("canon"); |
| 10 | break :canon_data Data{ | 16 | break :canon_data Data{ |
| @@ -17,7 +23,7 @@ const CanonData = @This(); | |||
| 17 | 23 | ||
| 18 | // There's a bug here, which is down to how static u21 vs. runtime are handled, | 24 | // There's a bug here, which is down to how static u21 vs. runtime are handled, |
| 19 | // the "unique representation" claim is not working out. AutoHash casts to bytes, | 25 | // the "unique representation" claim is not working out. AutoHash casts to bytes, |
| 20 | // and that won't fly. So we do this: | 26 | // and that won't fly. So we do a simple custom context which works for both. |
| 21 | 27 | ||
| 22 | const Context = struct { | 28 | const Context = struct { |
| 23 | pub fn hash(_: Context, cps: [2]u21) u64 { | 29 | pub fn hash(_: Context, cps: [2]u21) u64 { |
| @@ -52,3 +58,7 @@ const std = @import("std"); | |||
| 52 | const builtin = @import("builtin"); | 58 | const builtin = @import("builtin"); |
| 53 | const mem = std.mem; | 59 | const mem = std.mem; |
| 54 | const comptime_map = @import("comptime_map.zig"); | 60 | const comptime_map = @import("comptime_map.zig"); |
| 61 | |||
| 62 | test { | ||
| 63 | _ = comptime_map; | ||
| 64 | } | ||
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index d69cddc..b7aa020 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig | |||
| @@ -103,7 +103,16 @@ pub fn compatCaselessMatch( | |||
| 103 | a: []const u8, | 103 | a: []const u8, |
| 104 | b: []const u8, | 104 | b: []const u8, |
| 105 | ) Allocator.Error!bool { | 105 | ) Allocator.Error!bool { |
| 106 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 106 | var a_in = a; |
| 107 | var b_in = b; | ||
| 108 | |||
| 109 | // Ascii short path. Only applies if they're the same length: | ||
| 110 | if (a_in.len == b_in.len) { | ||
| 111 | const prefix = ascii.caselessCmpLen(a_in, b_in); | ||
| 112 | if (prefix == a_in.len) return true; | ||
| 113 | a_in = a_in[prefix..]; | ||
| 114 | b_in = b_in[prefix..]; | ||
| 115 | } | ||
| 107 | 116 | ||
| 108 | // Process a | 117 | // Process a |
| 109 | const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd); | 118 | const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd); |
| @@ -192,10 +201,19 @@ pub fn canonCaselessMatch( | |||
| 192 | a: []const u8, | 201 | a: []const u8, |
| 193 | b: []const u8, | 202 | b: []const u8, |
| 194 | ) Allocator.Error!bool { | 203 | ) Allocator.Error!bool { |
| 195 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 204 | var a_in = a; |
| 205 | var b_in = b; | ||
| 206 | |||
| 207 | // Ascii short path. Only applies if they're the same length: | ||
| 208 | if (a_in.len == b_in.len) { | ||
| 209 | const prefix = ascii.caselessCmpLen(a_in, b_in); | ||
| 210 | if (prefix == a_in.len) return true; | ||
| 211 | a_in = a_in[prefix..]; | ||
| 212 | b_in = b_in[prefix..]; | ||
| 213 | } | ||
| 196 | 214 | ||
| 197 | // Process a | 215 | // Process a |
| 198 | const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd); | 216 | const nfd_a = try Normalize.nfxdCodePoints(allocator, a_in, .nfd); |
| 199 | defer allocator.free(nfd_a); | 217 | defer allocator.free(nfd_a); |
| 200 | 218 | ||
| 201 | var need_free_cf_nfd_a = false; | 219 | var need_free_cf_nfd_a = false; |
| @@ -215,7 +233,7 @@ pub fn canonCaselessMatch( | |||
| 215 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); | 233 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); |
| 216 | 234 | ||
| 217 | // Process b | 235 | // Process b |
| 218 | const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd); | 236 | const nfd_b = try Normalize.nfxdCodePoints(allocator, b_in, .nfd); |
| 219 | defer allocator.free(nfd_b); | 237 | defer allocator.free(nfd_b); |
| 220 | 238 | ||
| 221 | var need_free_cf_nfd_b = false; | 239 | var need_free_cf_nfd_b = false; |
diff --git a/src/ascii.zig b/src/ascii.zig index 6c28f25..5b91348 100644 --- a/src/ascii.zig +++ b/src/ascii.zig | |||
| @@ -25,6 +25,77 @@ pub fn isAsciiOnly(str: []const u8) bool { | |||
| 25 | return true; | 25 | return true; |
| 26 | } | 26 | } |
| 27 | 27 | ||
| 28 | /// Do a caseless comparison, with SIMD if possible. Strings must be of equal | ||
| 29 | /// length. Returns how many bytes are case-fold-matched ASCII, this will be | ||
| 30 | /// equal to the string length if they match. | ||
| 31 | pub fn caselessCmpLen(str_a: []const u8, str_b: []const u8) usize { | ||
| 32 | std.debug.assert(str_a.len == str_b.len); | ||
| 33 | const vec_len = simd.suggestVectorLength(u8) orelse return caselessCmpNoSimd(str_a, str_b); | ||
| 34 | const Vec = @Vector(vec_len, u8); | ||
| 35 | const BVec = @Vector(vec_len, bool); | ||
| 36 | |||
| 37 | const msb: Vec = @splat(@as(u8, 0x80)); | ||
| 38 | const case_bit: Vec = @splat(@as(u8, 0x20)); | ||
| 39 | const low5: Vec = @splat(@as(u8, 0x1f)); | ||
| 40 | const vec0: Vec = @splat(@as(u8, 0)); | ||
| 41 | const vec1: Vec = @splat(@as(u8, 1)); | ||
| 42 | const vec26: Vec = @splat(@as(u8, 26)); | ||
| 43 | |||
| 44 | var rem_a = str_a; | ||
| 45 | var rem_b = str_b; | ||
| 46 | |||
| 47 | while (rem_a.len >= vec_len) { | ||
| 48 | const a: Vec = rem_a[0..vec_len].*; | ||
| 49 | const b: Vec = rem_b[0..vec_len].*; | ||
| 50 | // ASCII gate: MSB must be 0 in both. | ||
| 51 | const is_ascii: BVec = ((a | b) & msb) == vec0; | ||
| 52 | |||
| 53 | const xor: Vec = a ^ b; | ||
| 54 | const exact: BVec = xor == vec0; | ||
| 55 | const case_diff: BVec = xor == case_bit; | ||
| 56 | |||
| 57 | // Letter test (only needed when case_diff). | ||
| 58 | const x: Vec = (a | b) & low5; | ||
| 59 | const is_letter: BVec = | ||
| 60 | (x >= vec1) & (x <= vec26); | ||
| 61 | |||
| 62 | const matched: BVec = is_ascii & (exact | (case_diff & is_letter)); | ||
| 63 | |||
| 64 | if (!@reduce(.And, matched)) break; | ||
| 65 | rem_a = rem_a[vec_len..]; | ||
| 66 | rem_b = rem_b[vec_len..]; | ||
| 67 | } | ||
| 68 | |||
| 69 | // Tail | ||
| 70 | return str_a.len - rem_a.len + caselessCmpNoSimd(rem_a, rem_b); | ||
| 71 | } | ||
| 72 | |||
| 73 | inline fn caselessCmpNoSimd(str_a: []const u8, str_b: []const u8) usize { | ||
| 74 | for (str_a, str_b, 0..) |a, b, i| { | ||
| 75 | // High? | ||
| 76 | if (((a | b) & 0x80) != 0) return i; | ||
| 77 | const xor = a ^ b; | ||
| 78 | if (xor == 0) continue; // Match | ||
| 79 | if (xor != 0x20) return i; // Not the upcase bit. | ||
| 80 | |||
| 81 | const lo = a | b; | ||
| 82 | const x = lo & 0x1f; | ||
| 83 | if (x < 1 or x > 26) return i; // Not a letter | ||
| 84 | } else return str_a.len; | ||
| 85 | } | ||
| 86 | |||
| 87 | test caselessCmpNoSimd { | ||
| 88 | const hi_l = "Hello, World!"; | ||
| 89 | const hi_h = "HeLlO, wOrLd!"; | ||
| 90 | try testing.expectEqual(hi_l.len, caselessCmpNoSimd(hi_l, hi_h)); | ||
| 91 | } | ||
| 92 | |||
| 93 | test caselessCmpLen { | ||
| 94 | const hi_l = "Hello, World!" ** 25; | ||
| 95 | const hi_h = "HeLlO, wOrLd!" ** 25; | ||
| 96 | try testing.expectEqual(hi_l.len, caselessCmpLen(hi_l, hi_h)); | ||
| 97 | } | ||
| 98 | |||
| 28 | test "isAsciiOnly" { | 99 | test "isAsciiOnly" { |
| 29 | const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+"; | 100 | const ascii_only = "Hello, World! 0123456789 !@#$%^&*()_-=+"; |
| 30 | try testing.expect(isAsciiOnly(ascii_only)); | 101 | try testing.expect(isAsciiOnly(ascii_only)); |
diff --git a/src/code_point.zig b/src/code_point.zig index 7a638af..5f6c61c 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -30,13 +30,8 @@ pub const CodePoint = struct { | |||
| 30 | /// This function is deprecated and will be removed in a later release. | 30 | /// This function is deprecated and will be removed in a later release. |
| 31 | /// Use `decodeAtIndex` or `decodeAtCursor`. | 31 | /// Use `decodeAtIndex` or `decodeAtCursor`. |
| 32 | pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { | 32 | pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { |
| 33 | var off: uoffset = 0; | 33 | _ = .{ bytes, offset }; |
| 34 | var maybe_code = decodeAtCursor(bytes, &off); | 34 | @compileError("decode has been removed, use `decodeAtIndex`."); |
| 35 | if (maybe_code) |*code| { | ||
| 36 | code.offset = offset; | ||
| 37 | return code.*; | ||
| 38 | } | ||
| 39 | return null; | ||
| 40 | } | 35 | } |
| 41 | 36 | ||
| 42 | /// Return the codepoint at `index`, even if `index` is in the middle | 37 | /// Return the codepoint at `index`, even if `index` is in the middle |
| @@ -292,7 +287,7 @@ pub const ReverseIterator = struct { | |||
| 292 | else | 287 | else |
| 293 | iter.i = null; | 288 | iter.i = null; |
| 294 | 289 | ||
| 295 | return decode(iter.bytes[i_prev..], i_prev); | 290 | return decodeAtIndex(iter.bytes, i_prev); |
| 296 | } | 291 | } |
| 297 | 292 | ||
| 298 | pub fn peek(iter: *ReverseIterator) ?CodePoint { | 293 | pub fn peek(iter: *ReverseIterator) ?CodePoint { |
| @@ -319,7 +314,7 @@ inline fn followbyte(b: u8) bool { | |||
| 319 | 314 | ||
| 320 | test "decode" { | 315 | test "decode" { |
| 321 | const bytes = "🌩️"; | 316 | const bytes = "🌩️"; |
| 322 | const res = decode(bytes, 0); | 317 | const res = decodeAtIndex(bytes, 0); |
| 323 | 318 | ||
| 324 | if (res) |cp| { | 319 | if (res) |cp| { |
| 325 | try std.testing.expectEqual(@as(u21, 0x1F329), cp.code); | 320 | try std.testing.expectEqual(@as(u21, 0x1F329), cp.code); |
diff --git a/src/zg.zig b/src/zg.zig new file mode 100644 index 0000000..2974320 --- /dev/null +++ b/src/zg.zig | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | //! zg: a Zig-native Unicode Module | ||
| 2 | |||
| 3 | pub const ascii = @import("ascii"); | ||
| 4 | pub const case_folding = @import("CaseFolding"); | ||
| 5 | pub const code_point = @import("code_point"); | ||
| 6 | pub const display_width = @import("DisplawWidth"); | ||
| 7 | pub const emoji = @import("Emoji"); | ||
| 8 | pub const general_categories = @import("GeneralCategories"); | ||
| 9 | pub const graphemes = @import("Graphemes"); | ||
| 10 | pub const letter_casing = @import("LetterCasing"); | ||
| 11 | pub const normalize = @import("Normalize"); | ||
| 12 | pub const properties = @import("Properties"); | ||
| 13 | pub const scripts = @import("Scripts"); | ||
| 14 | pub const words = @import("Words"); | ||