author:    2025-07-08 12:15:32 -0400
committer: 2025-07-08 12:15:32 -0400
commit:    9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch)
tree:      2607c185fd8053b84d60041fadc35c05a0225d34 /README.md
parent:    Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff)
parent:    Add Words.zig example to README (diff)
Diffstat (limited to 'README.md')

 README.md | 230
 1 file changed, 136 insertions, 94 deletions
@@ -2,21 +2,24 @@

zg provides Unicode text processing for Zig projects.

## Unicode Version

The Unicode version supported by zg is `16.0.0`.

## Zig Version

The minimum Zig version required is `0.14`.

## Integrating zg into your Zig Project

You first need to add zg as a dependency in your `build.zig.zon` file. In your
Zig project's root directory, run:

```plain
zig fetch --save https://codeberg.org/atman/zg/archive/v0.14.1.tar.gz
```

Then instantiate the dependency in your `build.zig`:

```zig
const zg = b.dependency("zg", .{});
```

## A Modular Approach

zg is a modular library. This approach minimizes binary file size and memory
requirements by including only the Unicode data required by each module you
use. The following sections describe the various modules and their specific
use cases.

### Init and Setup

The code examples will show the use of `Module.init(allocator)` to create the
module's data, and `Module.deinit(allocator)` to free it.
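As a quick illustration of that pattern (a sketch only: `Graphemes` stands in
for any module here, and its import is assumed to be wired up as shown in its
section below):

```zig
const std = @import("std");
const Graphemes = @import("Graphemes");

test "init/deinit pattern" {
    const allocator = std.testing.allocator;
    // Each zg module is created with init(allocator)...
    const graphemes = try Graphemes.init(allocator);
    // ...and frees its Unicode data with deinit(allocator).
    defer graphemes.deinit(allocator);
}
```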
@@ -67,7 +72,7 @@

```zig
const std = @import("std");
const expect = std.testing.expect;
const expectEqual = std.testing.expectEqual;

const code_point = @import("code_point");

test "Code point iterator" {
    const str = "Hi 😊";
    var iter: code_point.Iterator = .init(str);
    var i: usize = 0;

    while (iter.next()) |cp| : (i += 1) {
        // The `code` field is the code point's scalar value.
        if (i == 0) try expect(cp.code == 'H');
        if (i == 1) try expect(cp.code == 'i');
        if (i == 2) try expect(cp.code == ' ');

        if (i == 3) {
            try expect(cp.code == '😊');
            // The `offset` field is the byte offset in the
            // source string.
            try expect(cp.offset == 3);
            try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?);
            // The `len` field is the length in bytes of the
            // code point in the source string.
            try expect(cp.len == 4);
            // There is also a 'cursor' decode, like so:
            {
                var cursor = cp.offset;
                try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?);
                // This advances the cursor variable to the next possible
                // offset, in this case, `str.len`. Don't forget to account
                // for this possibility!
                try expectEqual(cp.offset + cp.len, cursor);
            }
            // There's also this, for when you aren't sure if you have the
            // correct start for a code point:
            try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?);
        }
        // Reverse iteration is also an option:
        var r_iter: code_point.ReverseIterator = .init(str);
        // Both iterators can be peeked:
        try expectEqual('😊', r_iter.peek().?.code);
        try expectEqual('😊', r_iter.prev().?.code);
        // Both kinds of iterator can be reversed:
        var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator();
        // This will always return the last code point from
        // the prior iterator, _if_ it yielded one:
        try expectEqual('😊', fwd_iter.next().?.code);
    }
}
```

Note that it's safe to call CodePoint functions on invalid
UTF-8. Iterators and decode functions will return the Unicode
Replacement Character `U+FFFD`, according to the Substitution of Maximal
Subparts algorithm, for any invalid code unit sequences encountered.
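For instance, a truncated multi-byte sequence decodes to `U+FFFD` rather than
producing an error (a sketch, reusing the imports from the example above):

```zig
test "invalid UTF-8 yields the replacement character" {
    // 0xF0 opens a four-byte sequence, but the string ends there, so the
    // iterator substitutes U+FFFD instead of failing.
    var iter: code_point.Iterator = .init("Hi \xF0");
    _ = iter.next(); // 'H'
    _ = iter.next(); // 'i'
    _ = iter.next(); // ' '
    try expectEqual(0xFFFD, iter.next().?.code);
    try expect(iter.next() == null);
}
```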

## Grapheme Clusters

Many characters are composed of more than one code point. These
are known as grapheme clusters, and the `Graphemes` module has a
data structure to represent them, `Grapheme`, and an `Iterator` and
`ReverseIterator` to iterate over them in a string.

There is also `graphemeAtIndex`, which returns the grapheme
surrounding the index; the index does not have to be on a valid grapheme
or code point boundary, but calling this on an empty string is illegal.
Lastly, `iterateAfterGrapheme` and `iterateBeforeGrapheme` return
forward and backward grapheme iterators over the string, starting from
the grapheme provided. Thus, given an index, you can begin forward or
backward iteration at that index without needing to slice the string.

In your `build.zig`:
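(The exact `addImport` line is elided from this diff; by analogy with the
`Words` section below, it is presumably:)

```zig
exe.root_module.addImport("Graphemes", zg.module("Graphemes"));
```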
@@ -139,6 +179,56 @@ test "Grapheme cluster iterator" {
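The grapheme iterator example itself is elided from this diff. Here is a
hedged sketch of the index-based functions described above, assuming
`Graphemes.init` follows the usual module pattern and that `Grapheme`
exposes `bytes()` like the `Word` type in the next section:

```zig
const std = @import("std");
const Graphemes = @import("Graphemes");

test "graphemeAtIndex sketch" {
    const allocator = std.testing.allocator;
    const graphemes = try Graphemes.init(allocator);
    defer graphemes.deinit(allocator);

    // "é" encoded as 'e' + U+0301: two code points, one grapheme.
    const str = "cafe\u{301}";
    // Index 4 falls inside the cluster, not on a grapheme boundary,
    // but graphemeAtIndex still returns the surrounding grapheme.
    const g = graphemes.graphemeAtIndex(str, 4);
    try std.testing.expectEqualStrings("e\u{301}", g.bytes(str));
}
```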

## Words

Unicode has a standard word segmentation algorithm, which gives good
results for most languages. Some languages, such as Thai, require a
dictionary to find the boundaries between words; such cases are not
handled by the standard algorithm.

`zg` implements that algorithm in the `Words` module. Note that the
iterators and functions provided here do not yield "words" in the
conventional sense: they yield the segments between word _boundaries_.
Specifically, the iterators in this module return every segment of a
string, ensuring that words are kept whole when encountered. If the
word breaks themselves are of primary interest, use the `.offset` field
of each iterated value, and handle `string.len` as the final case when
the iteration returns `null`.

The API is congruent with `Graphemes`: forward and backward iterators,
`wordAtIndex`, and the `iterateAfter` and `iterateBefore` counterparts.
In your `build.zig`:

```zig
exe.root_module.addImport("Words", zg.module("Words"));
```

In your code:

```zig
const std = @import("std");
const testing = std.testing;

const Words = @import("Words");

test "Words" {
    const wb = try Words.init(testing.allocator);
    defer wb.deinit(testing.allocator);
    const word_str = "Metonym Μετώνυμιο メトニム";
    var w_iter = wb.iterator(word_str);
    try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str));
    // Spaces are "words" too!
    try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str));
    const in_greek = w_iter.next().?;
    // wordAtIndex doesn't care whether the index is valid for a code point:
    for (in_greek.offset..in_greek.offset + in_greek.len) |i| {
        const at_index = wb.wordAtIndex(word_str, i).bytes(word_str);
        try testing.expectEqualStrings("Μετώνυμιο", at_index);
    }
    _ = w_iter.next();
    try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str));
}
```
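If the boundary offsets themselves are what you need, the pattern described
above can be wrapped up like this (a sketch: `Words` as the instance type is
an assumption, and the `u32` offset width follows the Limits section below):

```zig
// Collect every word-boundary offset in the string: each segment's
// start is a boundary, and string.len closes the final segment.
fn wordBoundaries(
    allocator: std.mem.Allocator,
    wb: Words,
    string: []const u8,
) ![]u32 {
    var offsets: std.ArrayListUnmanaged(u32) = .empty;
    errdefer offsets.deinit(allocator);
    var iter = wb.iterator(string);
    while (iter.next()) |word| {
        try offsets.append(allocator, word.offset);
    }
    try offsets.append(allocator, @intCast(string.len));
    return offsets.toOwnedSlice(allocator);
}
```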

## Unicode General Categories

To detect the general category for a code point, use the `GeneralCategories` module.
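The section's example is elided from this diff; as a rough sketch, assuming
the `gc` lookup and category tags carried over from earlier zg releases
(check the module source for the real names):

```zig
const std = @import("std");
const GeneralCategories = @import("GeneralCategories");

test "general category sketch" {
    const allocator = std.testing.allocator;
    const gen_cat = try GeneralCategories.init(allocator);
    defer gen_cat.deinit(allocator);

    // 'A' is an uppercase letter: general category Lu.
    try std.testing.expectEqual(.Lu, gen_cat.gc('A'));
}
```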
@@ -279,24 +369,24 @@

Unicode normalization is the process of converting a string into a uniform
representation that can guarantee a known structure by following a strict set
of rules. There are four normalization forms:

**Canonical Composition (NFC)**
: The most compact representation, obtained by first applying Canonical
Decomposition and then composing.

**Compatibility Composition (NFKC)**
: The most comprehensive composition, obtained by first applying Compatibility
Decomposition and then composing.

**Canonical Decomposition (NFD)**
: Only code points with canonical decompositions are decomposed. This is a
more compact and faster decomposition, but it will not provide the most
comprehensive normalization possible.

**Compatibility Decomposition (NFKD)**
: The most comprehensive decomposition method, in which both canonical and
compatibility decompositions are performed recursively.

`zg` has methods to produce all four normalization forms in the `Normalize`
module.

In your `build.zig`:
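(The `addImport` line is elided from this diff; by analogy with the other
modules, it is presumably:)

```zig
exe.root_module.addImport("Normalize", zg.module("Normalize"));
```

To make the composed/decomposed distinction concrete before the module
examples, here is a tiny std-only illustration (no zg API involved):
canonically equivalent text in composed and decomposed form differs
byte-wise, which is why normalization matters before comparing strings.

```zig
const std = @import("std");

test "NFC and NFD forms differ byte-wise" {
    const nfc = "caf\u{e9}"; // 'é' as a single code point, U+00E9
    const nfd = "cafe\u{301}"; // 'e' followed by a combining acute, U+0301
    // The same text to a reader, but different bytes to memcmp.
    try std.testing.expect(!std.mem.eql(u8, nfc, nfd));
}
```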
@@ -493,7 +583,7 @@

…in the same fashion as shown for `CaseFolding` and `Normalize`.

## Scripts

Unicode categorizes code points by the Script to which they belong. A Script
collects letters and other symbols that belong to a particular writing system.
You can detect the Script for a code point with the `Scripts` module.
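The build and test snippets for `Scripts` are elided from this diff. A hedged
sketch of the lookup follows; the `script` method name, its optional return,
and the `.Latin`/`.Greek` tags are assumptions based on earlier zg releases:

```zig
const std = @import("std");
const Scripts = @import("Scripts");

test "Scripts sketch" {
    const allocator = std.testing.allocator;
    const scripts = try Scripts.init(allocator);
    defer scripts.deinit(allocator);

    // Assumed to return an optional script tag.
    try std.testing.expect(scripts.script('A').? == .Latin);
    try std.testing.expect(scripts.script('λ').? == .Greek);
}
```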
@@ -520,83 +610,35 @@

## Limits

Iterators, and fragment types such as `CodePoint`, `Grapheme`, and `Word`,
use a `u32` to store the offset into a string and the length of the fragment
(`CodePoint` actually uses a `u3` for the length).

4GiB is a lot of string. There are a few reasons to work with that much
string, log files primarily, but fewer to bring it all into memory at
once, and practically no reason at all to do anything to such a string
without breaking it into smaller pieces to work with.

Also, Zig compiles on 32-bit systems, where `usize` is a `u32`. Code
running on such systems has no choice but to handle slices in smaller
pieces. In general, if you want code to perform correctly when it
encounters multi-gigabyte strings, you'll need to plan for that at a
level one or two steps above the one where you, for example, iterate
the graphemes of a string.

That all said, `zg` modules can be passed the Boolean config option
`fat_offset`, which will make all of those data structures use a `u64`
instead. I added this option not because you should use it, which you
should not, but to encourage awareness that code operating on strings
needs to pay attention to the size of those strings, and have a plan for
when sizes get out of specification. What would your code do with a
1MiB region of string with no newline? There are many questions of this
nature, and robust code must detect when data is out of the expected
envelope, so it can respond accordingly.

Code which does pay attention to these questions has no need for
`u64`-sized offsets, and code which does not will not be helped by them.
But perhaps yours is an exception, in which case, by all means, configure
accordingly.
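For completeness, opting in would presumably look like this in `build.zig`
(a sketch; the standard dependency-options mechanism is assumed):

```zig
// You probably shouldn't, per the above, but if you must:
const zg = b.dependency("zg", .{
    .fat_offset = true,
});
exe.root_module.addImport("Graphemes", zg.module("Graphemes"));
```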