From 1e837fe5901a822c2096198e0294541cda743dbe Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 13:11:24 -0400 Subject: Merge stranded README changes from v0.14 release Rebasing my way through that again was just not in the cards. --- README.md | 84 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'README.md') diff --git a/README.md b/README.md index 4af2ca2..bfa8d5e 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ In your code: ```zig const GeneralCategories = @import("GeneralCategories"); -test "General Category" { +test "General Categories" { const gen_cat = try GeneralCategories.init(allocator); defer gen_cat.deinit(allocator); @@ -246,7 +246,7 @@ In your code: ```zig const LetterCasing = @import("LetterCasing"); -test "Case" { +test "LetterCasing" { const case = try LetterCasing.init(allocator); defer case.deinit(allocator); @@ -309,7 +309,7 @@ In your code: ```zig const Normalize = @import("Normalize"); -test "Normalization" { +test "Normalize" { const normalize = try Normalize.init(allocator); defer normalize.deinit(allocator); @@ -377,15 +377,15 @@ test "Caseless matching" { // matching because it decomposes fully to NFKD. const a = "Héllo World! \u{3d3}"; const b = "He\u{301}llo World! \u{3a5}\u{301}"; - try expect(try case_fold.compatCaselessMatch(allocator, &n, a, b)); + try expect(try case_fold.compatCaselessMatch(allocator, a, b)); const c = "He\u{301}llo World! \u{3d2}\u{301}"; - try expect(try case_fold.compatCaselessMatch(allocator, &n, a, c)); + try expect(try case_fold.compatCaselessMatch(allocator, a, c)); // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch` // because it only decomposes to NFD. Naturally, it's faster because of this. 
- try expect(!try case_fold.canonCaselessMatch(allocator, &n, a, b)); - try expect(try case_fold.canonCaselessMatch(allocator, &n, a, c)); + try expect(!try case_fold.canonCaselessMatch(allocator, a, b)); + try expect(try case_fold.canonCaselessMatch(allocator, a, c)); } ``` Case folding needs to use the `Normalize` module in order to produce the compatibility @@ -536,61 +536,61 @@ Benchmarks demonstrate the above stated goals have been met: ```plain Binary sizes ======= -149K ziglyph_case -87K zg_case +172K ziglyph_case +109K zg_case -275K ziglyph_caseless -168K zg_caseless +299K ziglyph_caseless +175K zg_caseless -68K ziglyph_codepoint -68K zg_codepoint +91K ziglyph_codepoint +91K zg_codepoint -101K ziglyph_grapheme -86K zg_grapheme +108K ziglyph_grapheme +109K zg_grapheme -185K ziglyph_normalizer -152K zg_normalize +208K ziglyph_normalizer +175K zg_normalize -101K ziglyph_width -86K zg_width +124K ziglyph_width +109K zg_width Benchmarks ========== -Ziglyph toUpperStr/toLowerStr: result: 7911596, took: 80 +Ziglyph toUpperStr/toLowerStr: result: 7756580, took: 74 Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17 -zg toUpperStr/toLowerStr: result: 7911596, took: 62 -zg isUpperStr/isLowerStr: result: 110959, took: 7 +zg toUpperStr/toLowerStr: result: 7756580, took: 58 +zg isUpperStr/isLowerStr: result: 110959, took: 11 -Ziglyph Normalizer.eqlCaseless: result: 625, took: 500 -zg CaseFold.canonCaselessMatch: result: 625, took: 385 -zg CaseFold.compatCaselessMatch: result: 625, took: 593 +Ziglyph Normalizer.eqlCaseless: result: 626, took: 479 +zg CaseFolding.canonCaselessMatch: result: 626, took: 296 +zg CaseFolding.compatCaselessMatch: result: 626, took: 604 -Ziglyph CodePointIterator: result: 3769314, took: 2 -zg CodePointIterator: result: 3769314, took: 3 +Ziglyph CodePointIterator: result: 3691806, took: 2.5 +zg code_point.Iterator: result: 3691806, took: 3.3 -Ziglyph GraphemeIterator: result: 3691806, took: 48 -zg GraphemeIterator: result: 3691806, took: 16 +Ziglyph GraphemeIterator: result: 3691806, took: 78 +zg Graphemes.Iterator: result: 3691806, took: 31 -Ziglyph Normalizer.nfkc: result: 3934162, took: 416 -zg Normalize.nfkc: result: 3934162, took: 182 +Ziglyph Normalizer.nfkc: result: 3856654, took: 411 +zg Normalize.nfkc: result: 3856654, took: 208 -Ziglyph Normalizer.nfc: result: 3955798, took: 57 -zg Normalize.nfc: result: 3955798, took: 28 +Ziglyph Normalizer.nfc: result: 3878290, took: 56 +zg Normalize.nfc: result: 3878290, took: 31 -Ziglyph Normalizer.nfkd: result: 4006398, took: 172 -zg Normalize.nfkd: result: 4006398, took: 104 +Ziglyph Normalizer.nfkd: result: 3928890, took: 163 +zg Normalize.nfkd: result: 3928890, took: 101 -Ziglyph Normalizer.nfd: result: 4028034, took: 169 -zg Normalize.nfd: result: 4028034, took: 104 +Ziglyph Normalizer.nfd: result: 3950526, took: 160 +zg Normalize.nfd: result: 3950526, took: 101 -Ziglyph Normalizer.eql: result: 625, took: 337 -Zg Normalize.eql: result: 625, took: 53 +Ziglyph Normalizer.eql: result: 626, took: 321 +Zg Normalize.eql: result: 626, took: 60 -Ziglyph display_width.strWidth: result: 3700914, took: 71 -zg DisplayWidth.strWidth: result: 3700914, took: 24 +Ziglyph display_width.strWidth: result: 3700914, took: 89 +zg DisplayWidth.strWidth: result: 3700914, took: 46 ``` -These results were obtained on an M1 Mac with 16 GiB of RAM. +These results were obtained on a MacBook Pro (2021) with M1 Pro and 16 GiB of RAM. 
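The hunk above changes the caseless-matching calls: `compatCaselessMatch` and `canonCaselessMatch` no longer take a `&n` (`Normalize`) pointer. A minimal sketch of the resulting call shape, assuming only the `Module.init(allocator)` convention described in the Init and Setup section (the `CaseFolding` setup itself is not shown in this excerpt, so its `init`/`deinit` here are assumptions):

```zig
const std = @import("std");
const expect = std.testing.expect;
const CaseFolding = @import("CaseFolding");

test "caseless match, post-change call shape" {
    const allocator = std.testing.allocator;
    // Assumption: CaseFolding follows the library's usual
    // Module.init(allocator)/deinit(allocator) pattern.
    const case_fold = try CaseFolding.init(allocator);
    defer case_fold.deinit(allocator);

    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";

    // The Normalize handle is no longer threaded through each call:
    try expect(try case_fold.compatCaselessMatch(allocator, a, b));
}
```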
In contrast to Ziglyph, zg does not have: -- cgit v1.2.3 From 036923515ec05b07c381448402fc256d9c564c10 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 16 May 2025 13:13:01 -0400 Subject: Remove benchmarks, ziglyph references `ziglyph` is no longer maintained and basically abandoned, there's no need to keep the comparison between them active going forward. --- README.md | 81 --------------------------------------------------------------- 1 file changed, 81 deletions(-) (limited to 'README.md') diff --git a/README.md b/README.md index bfa8d5e..1d3899c 100644 --- a/README.md +++ b/README.md @@ -519,84 +519,3 @@ test "Scripts" { try expect(scripts.script('צ') == .Hebrew); } ``` - -## Relation to Ziglyph - -zg is a total re-write of some of the components of Ziglyph. The idea was to -reduce binary size and improve performance. These goals were achieved by using -trie-like data structures (inspired by [Ghostty's implementation](https://mitchellh.com/writing/ghostty-devlog-006)) -instead of generated functions. Where Ziglyph uses a function call, zg uses an -array lookup, which is quite faster. In addition, all these data structures in -zg are loaded at runtime from compressed versions in the binary. This allows -for smaller binary sizes at the expense of increased memory -footprint at runtime. - -Benchmarks demonstrate the above stated goals have been met: - -```plain -Binary sizes ======= - -172K ziglyph_case -109K zg_case - -299K ziglyph_caseless -175K zg_caseless - -91K ziglyph_codepoint -91K zg_codepoint - -108K ziglyph_grapheme -109K zg_grapheme - -208K ziglyph_normalizer -175K zg_normalize - -124K ziglyph_width -109K zg_width - -Benchmarks ========== - -Ziglyph toUpperStr/toLowerStr: result: 7756580, took: 74 -Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17 -zg toUpperStr/toLowerStr: result: 7756580, took: 58 -zg isUpperStr/isLowerStr: result: 110959, took: 11 - -Ziglyph Normalizer.eqlCaseless: result: 626, took: 479 -zg CaseFolding.canonCaselessMatch: result: 626, took: 296 -zg CaseFolding.compatCaselessMatch: result: 626, took: 604 - -Ziglyph CodePointIterator: result: 3691806, took: 2.5 -zg code_point.Iterator: result: 3691806, took: 3.3 - -Ziglyph GraphemeIterator: result: 3691806, took: 78 -zg Graphemes.Iterator: result: 3691806, took: 31 - -Ziglyph Normalizer.nfkc: result: 3856654, took: 411 -zg Normalize.nfkc: result: 3856654, took: 208 - -Ziglyph Normalizer.nfc: result: 3878290, took: 56 -zg Normalize.nfc: result: 3878290, took: 31 - -Ziglyph Normalizer.nfkd: result: 3928890, took: 163 -zg Normalize.nfkd: result: 3928890, took: 101 - -Ziglyph Normalizer.nfd: result: 3950526, took: 160 -zg Normalize.nfd: result: 3950526, took: 101 - -Ziglyph Normalizer.eql: result: 626, took: 321 -Zg Normalize.eql: result: 626, took: 60 - -Ziglyph display_width.strWidth: result: 3700914, took: 89 -zg DisplayWidth.strWidth: result: 3700914, took: 46 -``` - -These results were obtained on a MacBook Pro (2021) with M1 Pro and 16 GiB of RAM. - -In contrast to Ziglyph, zg does not have: - -- Word segmentation -- Sentence segmentation -- Collation - -It's possible that any missing functionality will be added in future versions, -but only if enough demand is present in the community. 
-- cgit v1.2.3 From f4a174e27052e38aec09840e9195981cc2f24c88 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 19:01:57 -0400 Subject: Document "fat_offset" in README --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'README.md') diff --git a/README.md b/README.md index 1d3899c..1da50f3 100644 --- a/README.md +++ b/README.md @@ -519,3 +519,24 @@ test "Scripts" { try expect(scripts.script('צ') == .Hebrew); } ``` + +## Limits + +Iterators, and fragment types such as `CodePoint`, `Grapheme` and `Word`, use a +`u32` to store the offset into a string, and the length of the fragment +(`CodePoint` uses a `u3` for length, actually). + +4GiB is a lot of string. There are a few reasons to work with that much +string, log files primarily, but fewer to bring it all into memory at once, and +practically no reason at all to do anything to such a string without breaking +it into smaller piece to work with. + +Also, Zig compiles on 32 bit systems, where `usize` is 32. Code running on +such systems has no choice but to handle slices in smaller pieces. In general, +if you want code to perform correctly when encountering multi- gigabyte +strings, you'll need to code for that, at a level one or two steps above that +in which you'll want to, for example, iterate some graphemes of that string. + +That all said, `zg` modules can be passed the Boolean config option +`fat_offset`, which will make all of those data structures use a `u64` instead. +You don't actually want to do this. But you can. -- cgit v1.2.3 From e3082e64b3ab8a8aa0777d63be69eb8b6d50a654 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 8 Jul 2025 12:12:20 -0400 Subject: Add Words.zig example to README --- README.md | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 27 deletions(-) (limited to 'README.md') diff --git a/README.md b/README.md index 1da50f3..3abe480 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,24 @@ zg provides Unicode text processing for Zig projects. + ## Unicode Version The Unicode version supported by zg is `16.0.0`. + ## Zig Version The minimum Zig version required is `0.14`. + ## Integrating zg into your Zig Project You first need to add zg as a dependency in your `build.zig.zon` file. In your Zig project's root directory, run: ```plain -zig fetch --save https://codeberg.org/atman/zg/archive/v0.14.0-rc1.tar.gz +zig fetch --save https://codeberg.org/atman/zg/archive/v0.14.1.tar.gz ``` Then instantiate the dependency in your `build.zig`: @@ -25,12 +28,14 @@ Then instantiate the dependency in your `build.zig`: const zg = b.dependency("zg", .{}); ``` + ## A Modular Approach zg is a modular library. This approach minimizes binary file size and memory requirements by only including the Unicode data required for the specified module. The following sections describe the various modules and their specific use case. + ### Init and Setup The code examples will show the use of `Module.init(allocator)` to create the @@ -67,7 +72,7 @@ const code_point = @import("code_point"); test "Code point iterator" { const str = "Hi 😊"; - var iter = code_point.Iterator{ .bytes = str }; + var iter: code_point.Iterator = .init(str); var i: usize = 0; while (iter.next()) |cp| : (i += 1) { @@ -78,25 +83,60 @@ test "Code point iterator" { if (i == 3) { try expect(cp.code == '😊'); - // The `offset` field is the byte offset in the // source string. 
try expect(cp.offset == 3); - try expectEqual(code_point.CodePoint, code_point.decodeAtIndex(str, cp.offset)); - + try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); // The `len` field is the length in bytes of the // code point in the source string. try expect(cp.len == 4); + // There is also a 'cursor' decode, like so: + { + var cursor = cp.offset; + try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); + // This advances the cursor variable to the next possible + // offset, in this case, `str.len`. Don't forget to account + // for this possibility! + try expectEqual(cp.offset + cp.len, cursor); + } + // There's also this, for when you aren't sure if you have the + // correct start for a code point: + try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); } + // Reverse iteration is also an option: + var r_iter: code_point.ReverseIterator = .init(str); + // Both iterators can be peeked: + try expectEqual('😊', r_iter.peek().?.code); + try expectEqual('😊', r_iter.prev().?.code); + // Both kinds of iterators can be reversed: + var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); + // This will always return the last codepoint from + // the prior iterator, _if_ it yielded one: + try expectEqual('😊', fwd_iter.next().?.code); } } ``` +Note that it's safe to call CodePoint functions on invalid +UTF-8. Iterators and decode functions will return the Unicode +Replacement Character `U+FFFD`, according to the Substitution of Maximal +Subparts algorithm, for any invalid code unit sequences encountered. + + ## Grapheme Clusters -Many characters are composed from more than one code point. These are known as -Grapheme Clusters, and the `Graphemes` module has a data structure to represent -them, `Grapheme`, and an `Iterator` to iterate over them in a string. +Many characters are composed of more than one code point. These +are known as Grapheme Clusters, and the `Graphemes` module has a +data structure to represent them, `Grapheme`, and an `Iterator` and +`ReverseIterator` to iterate over them in a string. + +There is also `graphemeAtIndex`, which returns whatever grapheme +belongs to the index; this does not have to be on a valid grapheme +or codepoint boundary, but it is illegal to call on an empty string. +Last, `iterateAfterGrapheme` or `iterateBeforeGrapheme` will provide +forward or backward grapheme iterators of the string, from the grapheme +provided. Thus, given an index, you can begin forward or backward +iteration at that index without needing to slice the string. In your `build.zig`: @@ -139,6 +179,56 @@ test "Grapheme cluster iterator" { } } ``` + +## Words + +Unicode has a standard word segmentation algorithm, which gives good +results for most languages. Some languages, such as Thai, require a +dictionary to find the boundary between words; these cases are not +handled by the standard algorithm. + +`zg` implements that algorithm in the `Words` module. As a note, +the iterators and functions provided here work in terms of word +_boundaries_, so they can yield segments which are not a "word" in the conventional sense. +Specifically, the iterators in this module will return every segment of +a string, ensuring that words are kept whole when encountered. If the +word breaks are of primary interest, you'll want to use the `.offset` +field of each iterated value, and handle `string.len` as the final case +when the iteration returns `null`.
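To make the word-breaks recipe above concrete, here is a minimal sketch that collects every break offset, assuming only the `Words` API shown in the example that follows (`init`, `iterator`, and the `.offset` field); the helper name `wordBreaks` is illustrative, not part of `zg`:

```zig
const std = @import("std");
const Words = @import("Words");

/// Collect every word-break offset in `str`, appending `str.len` as the
/// final break once iteration returns null. Illustrative helper, not
/// part of the zg API. Caller owns the returned slice.
fn wordBreaks(allocator: std.mem.Allocator, words: Words, str: []const u8) ![]u32 {
    var breaks = std.ArrayList(u32).init(allocator);
    errdefer breaks.deinit();
    var iter = words.iterator(str);
    while (iter.next()) |segment| {
        // Every segment begins on a word boundary.
        try breaks.append(segment.offset);
    }
    // The end of the string is the last break.
    try breaks.append(@intCast(str.len));
    return breaks.toOwnedSlice();
}
```

Note the final `@intCast`: offsets are `u32` (see the Limits section), so this sketch inherits the same assumption that `str.len` fits.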
+ +The API is congruent with `Graphemes`: forward and backward iterators, +`wordAtIndex`, and `iterateAfter` and `iterateBefore`. + +In your `build.zig`: + +```zig +exe.root_module.addImport("Words", zg.module("Words")); +``` + +In your code: + +```zig +const Words = @import("Words"); + +test "Words" { + const wb = try Words.init(testing.allocator); + defer wb.deinit(testing.allocator); + const word_str = "Metonym Μετωνύμιο メトニム"; + var w_iter = wb.iterator(word_str); + try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); + // Spaces are "words" too! + try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); + const in_greek = w_iter.next().?; + // wordAtIndex doesn't care if the index is valid for a codepoint: + for (in_greek.offset..in_greek.offset + in_greek.len) |i| { + const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + try testing.expectEqualStrings("Μετωνύμιο", at_index); + } + _ = w_iter.next(); + try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); +} +``` + ## Unicode General Categories To detect the general category for a code point, use the `GeneralCategories` module. @@ -279,24 +369,24 @@ Unicode normalization is the process of converting a string into a uniform representation that can guarantee a known structure by following a strict set of rules. There are four normalization forms: -Canonical Composition (NFC) +**Canonical Composition (NFC)** : The most compact representation obtained by first decomposing to Canonical Decomposition and then composing to NFC. -Compatibility Composition (NFKC) +**Compatibility Composition (NFKC)** : The most comprehensive composition obtained by first decomposing to Compatibility Decomposition and then composing to NFKC. -Canonical Decomposition (NFD) +**Canonical Decomposition (NFD)** : Only code points with canonical decompositions are decomposed. This is a more compact and faster decomposition but will not provide the most comprehensive normalization possible. -Compatibility Decomposition (NFKD) +**Compatibility Decomposition (NFKD)** : The most comprehensive decomposition method where both canonical and compatibility decompositions are performed recursively. -zg has methods to produce all four normalization forms in the `Normalize` module. +`zg` has methods to produce all four normalization forms in the `Normalize` module. In your `build.zig`: @@ -493,7 +583,7 @@ in the same fashion as shown for `CaseFolding` and `Normalize`. ## Scripts -Unicode categorizes code points by the Script in which they belong. A Script +Unicode categorizes code points by the Script to which they belong. A Script collects letters and other symbols that belong to a particular writing system. You can detect the Script for a code point with the `Scripts` module. @@ -522,21 +612,33 @@ test "Scripts" { ## Limits -Iterators, and fragment types such as `CodePoint`, `Grapheme` and `Word`, use a -`u32` to store the offset into a string, and the length of the fragment -(`CodePoint` uses a `u3` for length, actually). +Iterators, and fragment types such as `CodePoint`, `Grapheme` and +`Word`, use a `u32` to store the offset into a string, and the length of +the fragment (`CodePoint` uses a `u3` for length, actually). 4GiB is a lot of string. There are a few reasons to work with that much
string, log files primarily, but fewer to bring it all into memory at +once, and practically no reason at all to do anything to such a string +without breaking it into smaller pieces to work with. -Also, Zig compiles on 32 bit systems, where `usize` is 32. Code running on -such systems has no choice but to handle slices in smaller pieces. In general, -if you want code to perform correctly when encountering multi- gigabyte -strings, you'll need to code for that, at a level one or two steps above that -in which you'll want to, for example, iterate some graphemes of that string. +Also, Zig compiles on 32-bit systems, where `usize` is a `u32`. Code +running on such systems has no choice but to handle slices in smaller +pieces. In general, if you want code to perform correctly when +encountering multi-gigabyte strings, you'll need to code for that, at a +level one or two steps above that in which you'll want to, for example, +iterate some graphemes of that string. That all said, `zg` modules can be passed the Boolean config option -`fat_offset`, which will make all of those data structures use a `u64` instead. -You don't actually want to do this. But you can. +`fat_offset`, which will make all of those data structures use a `u64` +instead. I added this option not because you should use it, which you +should not, but to encourage awareness that code operating on strings +needs to pay attention to the size of those strings, and have a plan for +when sizes get out of specification. What would your code do with a +1MiB region of string with no newline? There are many questions of this +nature, and robust code must detect when data is out of the expected +envelope, so it can respond accordingly. + +Code which does pay attention to these questions has no need for +`u64`-sized offsets, and code which does not will not be helped by them. But +perhaps yours is an exception, in which case, by all means, configure +accordingly. -- cgit v1.2.3
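One final note on `fat_offset`: the patch above documents the option but not how it is passed. Assuming it is exposed as an ordinary dependency option (the standard Zig build mechanism; the option name comes from the README, the wiring here is an assumption), enabling it would look like this in `build.zig`:

```zig
// Sketch only: assumes zg exposes `fat_offset` as a standard build
// option on the dependency, which this excerpt does not confirm.
const zg = b.dependency("zg", .{
    .fat_offset = true, // offsets and lengths widen to u64
});
exe.root_module.addImport("Graphemes", zg.module("Graphemes"));
```

As the Limits section argues, the better default is to leave this off and handle oversized inputs explicitly in the layer that owns the data.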