From e3082e64b3ab8a8aa0777d63be69eb8b6d50a654 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Tue, 8 Jul 2025 12:12:20 -0400 Subject: Add Words.zig example to README --- NEWS.md | 108 +++++++++++++++++++++++++++++++++++++ README.md | 156 +++++++++++++++++++++++++++++++++++++++++++---------- src/Words.zig | 17 ++++++ src/code_point.zig | 3 ++ 4 files changed, 257 insertions(+), 27 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8131878..0ccf151 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,113 @@ # News +## zg v0.14.1 Release Notes + +In a flurry of activity during and after the `v0.14.0` beta, several +features were added (including from a new contributor!), and a bug +fixed. + +Presenting `zg v0.14.1`. As should be expected from a patch release, +there are no breaking changes to the interface, just bug fixes and +features. + +### Grapheme Zalgo Text Bugfix + +Until this release, `zg` was using a `u8` to store the length of a +`Grapheme`. While this is much larger than any "real" grapheme, the +Unicode grapheme segmentation algorithm allows graphemes of arbitrary +size to be constructed, often called [Zalgo text][Zalgo] after a +notorious and funny Stack Overflow answer making use of this affordance. + +Therefore, a crafted string could force an integer overflow, with all that +comes with it. The `.len` field of a `Grapheme` is now a `u32`, like the +`.offset` field. Due to padding, the `Grapheme` is the same size as it +was, just making use of the entire 8 bytes. + +Actually, both fields are now `uoffset`, for reasons described next. + +[Zalgo]: https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 + +### Limits Section Added to README + +The README now clearly documents that some data structures and iterators +in `zg` use a `u32`. I've also made it possible to configure the library +to use a `u64` instead, and have included an explanation of why this is +not the solution to actual problems which it at first might seem. + +My job as maintainer is to provide a useful library to the community, and +comptime makes it easy and pleasant to tailor types to purpose. So for those +who see a need for `u64` values in those structures, just pass `-Dfat_offset` +or its equivalent, and you'll have them. + +I believe this to be neither necessary nor sufficient for handling data of +that size. But I can't anticipate every requirement, and don't want to +preclude it as a solution. + +### Iterators, Back and Forth + +A new contributor, Nemoos, took on the challenge of adding a reverse +iterator to `Graphemes`. Thanks Nemoos! + +I've taken the opportunity to fill in a few bits of functionality to +flesh these out. `code_point` now has a reverse iterator as well, and +either a forward or backward iterator can be reversed in-place. + +Reversing an iterator will always return the last non-`null` result +of calling that iterator. This is the only sane behavior, but +might be a bit unexpected without prior warning. + +There's also `codePointAtIndex` and `graphemeAtIndex`. These can be +given any index which falls within the Grapheme or Codepoint which +is returned. These always return a value, and therefore cannot be +called on an empty string. + +Finally, `Graphemes.iterateAfterGrapheme(string, grapheme)` will +return a forward iterator which will yield the grapheme after +`grapheme` when first called. `iterateBeforeGrapheme` has the +signature and result one might expect from this. + +`code_point` doesn't have an equivalent of those, since it isn't +useful: codepoints are one to four bytes in length, while obtaining +a grapheme reliably, given only an index, involves some pretty tricky +business to get right. The `Graphemes` API just described allows +code to obtain a Grapheme cursor and then begin iterating in either +direction, by calling `graphemeAtIndex` and providing it to either +of those functions. For codepoints, starting an iterator at either +`.offset` or `.offset + .len` will suffice, since the `CodePoint` +iterator is otherwise stateless. + +### Words Module + +The [Unicode annex][tr29] with the canonical grapheme segmentation +algorithm also includes algorithms for word and sentence segmentation. +`v0.14.1` includes an implementation of the word algorithm. + +It works like `Graphemes`. There's forward and reverse iteration, +`wordAtIndex`, and `iterate(Before|After)Word`. + +If anyone is looking for a challenge, there are open issues for sentence +segmentation and [line breaking][tr14]. + +[tr29]: https://www.unicode.org/reports/tr29/ +[tr14]: https://www.unicode.org/reports/tr14/ + +#### Runeset Used + +As a point of interest: + +Most of the rules in the word breaking algorithm come from a distinct +property table, `WordBreakProperties.txt` from the [UCD][UCD]. These +are made into a data structure familiar from the other modules. + +One rule, WB3c, uses the Extended Pictographic property. This is also +used in `Graphemes`, but to avoid a dependency on that library, I used +a [Runeset][Rune]. This is included statically, with only just as much +code as needed to recognize the sequences; `zg` itself remains free of +transitive dependencies. + +[UCD]: https://www.unicode.org/reports/tr44/ +[Rune]: https://github.com/mnemnion/runeset + ## zg v0.14.0 Release Notes This is the first minor point release since Sam Atman (me) took over diff --git a/README.md b/README.md index 1da50f3..3abe480 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,24 @@ zg provides Unicode text processing for Zig projects. + ## Unicode Version The Unicode version supported by zg is `16.0.0`. + ## Zig Version The minimum Zig version required is `0.14`. + ## Integrating zg into your Zig Project You first need to add zg as a dependency in your `build.zig.zon` file. In your Zig project's root directory, run: ```plain -zig fetch --save https://codeberg.org/atman/zg/archive/v0.14.0-rc1.tar.gz +zig fetch --save https://codeberg.org/atman/zg/archive/v0.14.1.tar.gz ``` Then instantiate the dependency in your `build.zig`: @@ -25,12 +28,14 @@ Then instantiate the dependency in your `build.zig`: const zg = b.dependency("zg", .{}); ``` + ## A Modular Approach zg is a modular library. This approach minimizes binary file size and memory requirements by only including the Unicode data required for the specified module. The following sections describe the various modules and their specific use case. + ### Init and Setup The code examples will show the use of `Module.init(allocator)` to create the @@ -67,7 +72,7 @@ const code_point = @import("code_point"); test "Code point iterator" { const str = "Hi 😊"; - var iter = code_point.Iterator{ .bytes = str }; + var iter: code_point.Iterator = .init(str); var i: usize = 0; while (iter.next()) |cp| : (i += 1) { @@ -78,25 +83,60 @@ test "Code point iterator" { if (i == 3) { try expect(cp.code == '😊'); - // The `offset` field is the byte offset in the // source string. try expect(cp.offset == 3); - try expectEqual(code_point.CodePoint, code_point.decodeAtIndex(str, cp.offset)); - + try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); // The `len` field is the length in bytes of the // code point in the source string. try expect(cp.len == 4); + // There is also a 'cursor' decode, like so: + { + var cursor = cp.offset; + try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); + // Which advances the cursor variable to the next possible + // offset, in this case, `str.len`. Don't forget to account + // for this possibility! + try expectEqual(cp.offset + cp.len, cursor); + } + // There's also this, for when you aren't sure if you have the + // correct start for a code point: + try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); } + // Reverse iteration is also an option: + var r_iter: code_point.ReverseIterator = .init(str); + // Both iterators can be peeked: + try expectEqual('😊', r_iter.peek().?.code); + try expectEqual('😊', r_iter.prev().?.code); + // Both kinds of iterators can be reversed: + var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); + // This will always return the last codepoint from + // the prior iterator, _if_ it yielded one: + try expectEqual('😊', fwd_iter.next().?.code); } } ``` +Note that it's safe to call CodePoint functions on invalid +UTF-8. Iterators and decode functions will return the Unicode +Replacement Character `U+FFFD`, according to the Substitution of Maximal +Subparts algorithm, for any invalid code unit sequences encountered. + + ## Grapheme Clusters -Many characters are composed from more than one code point. These are known as -Grapheme Clusters, and the `Graphemes` module has a data structure to represent -them, `Grapheme`, and an `Iterator` to iterate over them in a string. +Many characters are composed from more than one code point. These +are known as Grapheme Clusters, and the `Graphemes` module has a +data structure to represent them, `Grapheme`, and an `Iterator` and +`ReverseIterator` to iterate over them in a string. + +There is also `graphemeAtIndex`, which returns whatever grapheme +belongs to the index; this does not have to be on a valid grapheme +or codepoint boundary, but it is illegal to call on an empty string. +Last, `iterateAfterGrapheme` or `iterateBeforeGrapheme` will provide +forward or backward grapheme iterators of the string, from the grapheme +provided. Thus, given an index, you can begin forward or backward +iteration at that index without needing to slice the string. In your `build.zig`: @@ -139,6 +179,56 @@ test "Grapheme cluster iterator" { } ``` + +## Words + +Unicode has a standard word segmentation algorithm, which gives good +results for most languages. Some languages, such as Thai, require a +dictionary to find the boundary between words; these cases are not +handled by the standard algorithm. + +`zg` implements that algorithm in the `Words` module. As a note, +the iterators and functions provided here will yield segments which +are not a "word" in the conventional sense, but word _boundaries_. +Specifically, the iterators in this module will return every segment of +a string, ensuring that words are kept whole when encountered. If the +word breaks are of primary interest, you'll want to use the `.offset` +field of each iterated value, and handle `string.len` as the final case +when the iteration returns `null`. + +The API is congruent with `Graphemes`: forward and backward iterators, +`wordAtIndex`, and `iterateAfter` and before. + +In your `build.zig`: + +```zig +exe.root_module.addImport("Words", zg.module("Words")); +``` + +In your code: + +```zig +const Words = @import("Words"); + +test "Words" { + const wb = try Words.init(testing.allocator); + defer wb.deinit(testing.allocator); + const word_str = "Metonym Μετωνύμιο メトニム"; + var w_iter = wb.iterator(word_str); + try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); + // Spaces are "words" too! + try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); + const in_greek = w_iter.next().?; + // wordAtIndex doesn't care if the index is valid for a codepoint: + for (in_greek.offset..in_greek.offset + in_greek.len) |i| { + const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + try testing.expectEqualStrings("Μετωνύμιο", at_index); + } + _ = w_iter.next(); + try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); +} +``` + ## Unicode General Categories To detect the general category for a code point, use the `GeneralCategories` module. @@ -279,24 +369,24 @@ Unicode normalization is the process of converting a string into a uniform representation that can guarantee a known structure by following a strict set of rules. There are four normalization forms: -Canonical Composition (NFC) +**Canonical Composition (NFC)** : The most compact representation obtained by first decomposing to Canonical Decomposition and then composing to NFC. -Compatibility Composition (NFKC) +**Compatibility Composition (NFKC)** : The most comprehensive composition obtained by first decomposing to Compatibility Decomposition and then composing to NFKC. -Canonical Decomposition (NFD) +**Canonical Decomposition (NFD)** : Only code points with canonical decompositions are decomposed. This is a more compact and faster decomposition but will not provide the most comprehensive normalization possible. -Compatibility Decomposition (NFKD) +**Compatibility Decomposition (NFKD)** : The most comprehensive decomposition method where both canonical and compatibility decompositions are performed recursively. -zg has methods to produce all four normalization forms in the `Normalize` module. +`zg` has methods to produce all four normalization forms in the `Normalize` module. In your `build.zig`: @@ -493,7 +583,7 @@ in the same fashion as shown for `CaseFolding` and `Normalize`. ## Scripts -Unicode categorizes code points by the Script in which they belong. A Script +Unicode categorizes code points by the Script in which they belong. A Script collects letters and other symbols that belong to a particular writing system. You can detect the Script for a code point with the `Scripts` module. @@ -522,21 +612,33 @@ test "Scripts" { ## Limits -Iterators, and fragment types such as `CodePoint`, `Grapheme` and `Word`, use a -`u32` to store the offset into a string, and the length of the fragment -(`CodePoint` uses a `u3` for length, actually). +Iterators, and fragment types such as `CodePoint`, `Grapheme` and +`Word`, use a `u32` to store the offset into a string, and the length of +the fragment (`CodePoint` uses a `u3` for length, actually). 4GiB is a lot of string. There are a few reasons to work with that much -string, log files primarily, but fewer to bring it all into memory at once, and -practically no reason at all to do anything to such a string without breaking -it into smaller piece to work with. +string, log files primarily, but fewer to bring it all into memory at +once, and practically no reason at all to do anything to such a string +without breaking it into smaller piece to work with. -Also, Zig compiles on 32 bit systems, where `usize` is 32. Code running on -such systems has no choice but to handle slices in smaller pieces. In general, -if you want code to perform correctly when encountering multi- gigabyte -strings, you'll need to code for that, at a level one or two steps above that -in which you'll want to, for example, iterate some graphemes of that string. +Also, Zig compiles on 32 bit systems, where `usize` is a `u32`. Code +running on such systems has no choice but to handle slices in smaller +pieces. In general, if you want code to perform correctly when +encountering multi-gigabyte strings, you'll need to code for that, at a +level one or two steps above that in which you'll want to, for example, +iterate some graphemes of that string. That all said, `zg` modules can be passed the Boolean config option -`fat_offset`, which will make all of those data structures use a `u64` instead. -You don't actually want to do this. But you can. +`fat_offset`, which will make all of those data structures use a `u64` +instead. I added this option not because you should use it, which you +should not, but to encourage awareness that code operating on strings +needs to pay attention to the size of those strings, and have a plan for +when sizes get out of specification. What would your code do with a +1MiB region of string with no newline? There are many questions of this +nature, and robust code must detect when data is out of the expected +envelope, so it can respond accordingly. + +Code which does pay attention to these questions has no need for `u64` +sized offsets, and code which does not will not be helped by them. But +perhaps yours is an exception, in which case, by all means, configure +accordingly. diff --git a/src/Words.zig b/src/Words.zig index af82562..617c34d 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -674,6 +674,23 @@ test "ext_pict" { try testing.expect(ext_pict.isMatch("\u{2701}")); } +test "Words" { + const wb = try Words.init(testing.allocator); + defer wb.deinit(testing.allocator); + const word_str = "Metonym Μετωνύμιο メトニム"; + var w_iter = wb.iterator(word_str); + try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); + // Spaces are "words" too! + try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); + const in_greek = w_iter.next().?; + for (in_greek.offset..in_greek.offset + in_greek.len) |i| { + const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + try testing.expectEqualStrings("Μετωνύμιο", at_index); + } + _ = w_iter.next(); + try testing.expectEqualStrings("メトニム", w_iter.next().?.bytes(word_str)); +} + test wordAtIndex { const wb = try Words.init(testing.allocator); defer wb.deinit(testing.allocator); diff --git a/src/code_point.zig b/src/code_point.zig index 16648af..7a638af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -121,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { } if (st == RUNE_REJECT or cursor.* == bytes.len) { @branchHint(.cold); + // This, and the branch below, detect truncation, the + // only invalid state handled differently by the Maximal + // Subparts algorithm. if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { cursor.* -= 2; // +1 return .{ -- cgit v1.2.3