From 717a6ab80c3c64176d2e1ed29da173ba51ee77b4 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 6 Feb 2026 15:11:24 -0500 Subject: Allocation-free README examples --- NEWS.md | 3 + README.md | 334 +++++++++++++++++++++++++++++--------------------------------- 2 files changed, 157 insertions(+), 180 deletions(-) diff --git a/NEWS.md b/NEWS.md index ada1405..9538017 100644 --- a/NEWS.md +++ b/NEWS.md @@ -45,6 +45,9 @@ But feel free! Pro tip: use LSP superpowers to rename the instance to the name of the module, then just delete the initializer. Couldn't be simpler. +While further breaking changes are almost certain, this is the last +refactor of this total magnitude which `zg` is likely to see. + ### zg: The Module diff --git a/README.md b/README.md index d858eb4..fd46cab 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ The Unicode version supported by zg is `16.0.0`. The minimum Zig version required is `0.15.2`. +The official release of `zg 0.16` will require Zig `0.16.x`, whatever +`x` is official this time. The last beta release will be kept around +for those who don't want to bump Zig versions right away. + ## Integrating zg into your Zig Project @@ -19,7 +23,7 @@ You first need to add zg as a dependency in your `build.zig.zon` file. In your Zig project's root directory, run: ```plain -zig fetch --save https://codeberg.org/atman/zg/archive/v0.15.3.tar.gz +zig fetch --save https://codeberg.org/atman/zg/archive/v0.16.0-beta1.tar.gz ``` Then instantiate the dependency in your `build.zig`: @@ -28,36 +32,41 @@ Then instantiate the dependency in your `build.zig`: const zg = b.dependency("zg", .{}); ``` +## Zig Module -## A Modular Approach +The `zg` package has classically been structured as a collection +of mix-and-match modules. This approach is still available, just +supplemented with a module-of-modules, also called `zg`. -zg is a modular library. 
This approach minimizes binary file size and memory -requirements by only including the Unicode data required for the specified module. -The following sections describe the various modules and their specific use case. +For historical reasons, many of the submodules use `TypeCase`, despite +the fact that they no longer require instantiation. Reflecting this, +the names of the modules in the `zg` scope are all `container_case`. +To use in this fashion, import like so: -### Init and Setup +```zig +exe.root_module.addImport("zg", zg.module("zg")); +``` -The code examples will show the use of `Module.init(allocator)` to create the -various modules. All of the allocating modules have a `setup` variant, which -takes a pointer and allocates in-place. +Rather than trying to split the difference, the README will reflect use +of `zg` on a submodule basis. Note that any configurations discussed can +be passed directly to the `zg` dependency import, and will reach that +submodule accordingly. -Example use: -```zig -test "Setup form" { - var graphemes = try allocator.create(Graphemes); - defer allocator.destroy(graphemes); - try graphemes.setup(allocator); - defer graphemes.deinit(allocator); -} -``` +### The Modular Approach + +`zg` is a modular library. This approach minimizes binary file size and +memory requirements by only including the Unicode data required for the +specified module. The following sections describe the various modules +and their specific use case. ## Code Points -In the `code_point` module, you'll find a data structure representing a single code -point, `CodePoint`, and an `Iterator` to iterate over the code points in a string. +In the `code_point` module, you'll find a data structure representing a +single code point, `CodePoint`, and an `Iterator` to iterate over the +code points in a string. 
In your `build.zig`: @@ -150,11 +159,8 @@ In your code: const Graphemes = @import("Graphemes"); test "Grapheme cluster iterator" { - const graph = try Graphemes.init(allocator); - defer graph.deinit(allocator); - const str = "He\u{301}"; // Hé - var iter = graph.iterator(str); + var iter = Graphemes.iterator(str); var i: usize = 0; @@ -176,8 +182,7 @@ test "Grapheme cluster iterator" { try expectEqualStrings("e\u{301}", gc.bytes(str)); } } -} -``` +}``` ## Words @@ -211,17 +216,15 @@ In your code: const Words = @import("Words"); test "Words" { - const wb = try Words.init(testing.allocator); - defer wb.deinit(testing.allocator); const word_str = "Metonym Μετωνύμιο メトニム"; - var w_iter = wb.iterator(word_str); + var w_iter = Words.iterator(word_str); try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); // Spaces are "words" too! try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); const in_greek = w_iter.next().?; // wordAtIndex doesn't care if the index is valid for a codepoint: for (in_greek.offset..in_greek.offset + in_greek.len) |i| { - const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); + const at_index = Words.wordAtIndex(word_str, i).bytes(word_str); try testing.expectEqualStrings("Μετωνύμιο", at_index); } _ = w_iter.next(); @@ -231,7 +234,8 @@ test "Words" { ## Unicode General Categories -To detect the general category for a code point, use the `GeneralCategories` module. +To detect the general category for a code point, use the +`GeneralCategories` module. In your `build.zig`: @@ -245,31 +249,29 @@ In your code: const GeneralCategories = @import("GeneralCategories"); test "General Categories" { - const gen_cat = try GeneralCategories.init(allocator); - defer gen_cat.deinit(allocator); - // The `gc` method returns the abbreviated General Category. // These abbreviations and descriptive comments can be found // in the source file `src/GenCatData.zig` as en enum. 
- try expect(gen_cat.gc('A') == .Lu); // Lu: uppercase letter - try expect(gen_cat.gc('3') == .Nd); // Nd: decimal number + try expect(GeneralCategories.gc('A') == .Lu); // Lu: uppercase letter + try expect(GeneralCategories.gc('3') == .Nd); // Nd: decimal number // The following are convenience methods for groups of General // Categories. For example, all letter categories start with `L`: // Lu, Ll, Lt, Lo. - try expect(gen_cat.isControl(0)); - try expect(gen_cat.isLetter('z')); - try expect(gen_cat.isMark('\u{301}')); - try expect(gen_cat.isNumber('3')); - try expect(gen_cat.isPunctuation('[')); - try expect(gen_cat.isSeparator(' ')); - try expect(gen_cat.isSymbol('©')); + try expect(GeneralCategories.isControl(0)); + try expect(GeneralCategories.isLetter('z')); + try expect(GeneralCategories.isMark('\u{301}')); + try expect(GeneralCategories.isNumber('3')); + try expect(GeneralCategories.isPunctuation('[')); + try expect(GeneralCategories.isSeparator(' ')); + try expect(GeneralCategories.isSymbol('©')); } ``` ## Unicode Properties -You can detect common properties of a code point with the `Properties` module. +You can detect common properties of a code point with the `Properties` +module. In your `build.zig`: @@ -282,48 +284,46 @@ In your code: ```zig const Properties = @import("Properties"); -test "Properties" { - const props = try Properties.init(allocator); - defer props.deinit(allocator); +const Properties = @import("Properties"); +test "Properties" { // Mathematical symbols and letters. - try expect(props.isMath('+')); + try expect(Properties.isMath('+')); // Alphabetic only code points. - try expect(props.isAlphabetic('Z')); + try expect(Properties.isAlphabetic('Z')); // Space, tab, and other separators. - try expect(props.isWhitespace(' ')); + try expect(Properties.isWhitespace(' ')); // Hexadecimal digits and variations thereof. 
- try expect(props.isHexDigit('f')); - try expect(!props.isHexDigit('z')); + try expect(Properties.isHexDigit('f')); + try expect(!Properties.isHexDigit('z')); // Accents, dieresis, and other combining marks. - try expect(props.isDiacritic('\u{301}')); + try expect(Properties.isDiacritic('\u{301}')); // Unicode has a specification for valid identifiers like // the ones used in programming and regular expressions. - try expect(props.isIdStart('Z')); // Identifier start character - try expect(!props.isIdStart('1')); - try expect(props.isIdContinue('1')); + try expect(Properties.isIdStart('Z')); // Identifier start character + try expect(!Properties.isIdStart('1')); + try expect(Properties.isIdContinue('1')); // The `X` versions add some code points that can appear after // normalizing a string. - try expect(props.isXidStart('\u{b33}')); // Extended identifier start character - try expect(props.isXidContinue('\u{e33}')); - try expect(!props.isXidStart('1')); + try expect(Properties.isXidStart('\u{b33}')); // Extended identifier start character + try expect(Properties.isXidContinue('\u{e33}')); + try expect(!Properties.isXidStart('1')); // Note surprising Unicode numeric type properties! - try expect(props.isNumeric('\u{277f}')); - try expect(!props.isNumeric('3')); // 3 is not numeric! - try expect(props.isDigit('\u{2070}')); - try expect(!props.isDigit('3')); // 3 is not a digit! - try expect(props.isDecimal('3')); // 3 is a decimal digit -} -``` + try expect(Properties.isNumeric('\u{277f}')); + try expect(!Properties.isNumeric('3')); // 3 is not numeric! + try expect(Properties.isDigit('\u{2070}')); + try expect(!Properties.isDigit('3')); // 3 is not a digit! + try expect(Properties.isDecimal('3')); // 3 is a decimal digit +}``` ## Letter Case Detection and Conversion -To detect and convert to and from different letter cases, use the `LetterCasing` -module. +To detect and convert to and from different letter cases, use the +`LetterCasing` module. 
In your `build.zig`: @@ -337,37 +337,35 @@ In your code: const LetterCasing = @import("LetterCasing"); test "LetterCasing" { - const case = try LetterCasing.init(allocator); - defer case.deinit(allocator); - // Upper and lower case. - try expect(case.isUpper('A')); - try expect('A' == case.toUpper('a')); - try expect(case.isLower('a')); - try expect('a' == case.toLower('A')); + try expect(LetterCasing.isUpper('A')); + try expect('A' == LetterCasing.toUpper('a')); + try expect(LetterCasing.isLower('a')); + try expect('a' == LetterCasing.toLower('A')); // Code points that have case. - try expect(case.isCased('É')); - try expect(!case.isCased('3')); + try expect(LetterCasing.isCased('É')); + try expect(!LetterCasing.isCased('3')); // Case detection and conversion for strings. - try expect(case.isUpperStr("HELLO 123!")); - const ucased = try case.toUpperStr(allocator, "hello 123"); + try expect(LetterCasing.isUpperStr("HELLO 123!")); + const ucased = try LetterCasing.toUpperStr(allocator, "hello 123"); defer allocator.free(ucased); try expectEqualStrings("HELLO 123", ucased); - try expect(case.isLowerStr("hello 123!")); - const lcased = try case.toLowerStr(allocator, "HELLO 123"); + try expect(LetterCasing.isLowerStr("hello 123!")); + const lcased = try LetterCasing.toLowerStr(allocator, "HELLO 123"); defer allocator.free(lcased); try expectEqualStrings("hello 123", lcased); } ``` + ## Normalization -Unicode normalization is the process of converting a string into a uniform -representation that can guarantee a known structure by following a strict set -of rules. There are four normalization forms: +Unicode normalization is the process of converting a string into a +uniform representation that can guarantee a known structure by following +a strict set of rules. 
There are four normalization forms: **Canonical Composition (NFC)** : The most compact representation obtained by first @@ -400,52 +398,51 @@ In your code: const Normalize = @import("Normalize"); test "Normalize" { - const normalize = try Normalize.init(allocator); - defer normalize.deinit(allocator); // NFC: Canonical composition - const nfc_result = try normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}"); + const nfc_result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}"); defer nfc_result.deinit(allocator); try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice); // NFKC: Compatibility composition - const nfkc_result = try normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); + const nfkc_result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); defer nfkc_result.deinit(allocator); try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice); // NFD: Canonical decomposition - const nfd_result = try normalize.nfd(allocator, "Héllo World! \u{3d3}"); + const nfd_result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}"); defer nfd_result.deinit(allocator); try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice); // NFKD: Compatibility decomposition - const nfkd_result = try normalize.nfkd(allocator, "Héllo World! \u{3d3}"); + const nfkd_result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}"); defer nfkd_result.deinit(allocator); try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice); // Test for equality of two strings after normalizing to NFC. - try expect(try normalize.eql(allocator, "foé", "foe\u{0301}")); - try expect(try normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); + try expect(try Normalize.eql(allocator, "foé", "foe\u{0301}")); + try expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); } ``` -The `Result` returned by normalization functions may or may not be copied from the -inputs given. 
For example, an all-ASCII input does not need to be a copy, and will -be a view of the original slice. Calling `result.deinit(allocator)` will only free -an allocated `Result`, not one which is a view. Thus it is safe to do -unconditionally. -This does mean that the validity of a `Result` can depend on the original string -staying in memory. To ensure that your `Result` is always a copy, you may call -`try result.toOwned(allocator)`, which will only make a copy if one was not -already made. +The `Result` returned by normalization functions may or may not be +copied from the inputs given. For example, an all-ASCII input does not +need to be a copy, and will be a view of the original slice. Calling +`result.deinit(allocator)` will only free an allocated `Result`, not one +which is a view. Thus it is safe to do unconditionally. + +This does mean that the validity of a `Result` can depend on the +original string staying in memory. To ensure that your `Result` is +always a copy, you may call `try result.toOwned(allocator)`, which will +only make a copy if one was not already made. ## Caseless Matching via Case Folding -Unicode provides a more efficient way of comparing strings while ignoring letter -case differences: case folding. When you case fold a string, it's converted into a -normalized case form suitable for efficient matching. Use the `CaseFold` module -for this. +Unicode provides a more efficient way of comparing strings while +ignoring letter case differences: case folding. When you case fold +a string, it's converted into a normalized case form suitable for +efficient matching. Use the `CaseFolding` module for this. In your `build.zig`: @@ -459,54 +456,32 @@ In your code: const CaseFolding = @import("CaseFolding"); test "Caseless matching" { - // We need Unicode case fold data. 
- const case_fold = try CaseFolding.init(allocator); - defer case_fold.deinit(allocator); - // `compatCaselessMatch` provides the deepest level of caseless // matching because it decomposes fully to NFKD. const a = "Héllo World! \u{3d3}"; const b = "He\u{301}llo World! \u{3a5}\u{301}"; - try expect(try case_fold.compatCaselessMatch(allocator, a, b)); + try expect(try CaseFolding.compatCaselessMatch(allocator, a, b)); const c = "He\u{301}llo World! \u{3d2}\u{301}"; - try expect(try case_fold.compatCaselessMatch(allocator, a, c)); + try expect(try CaseFolding.compatCaselessMatch(allocator, a, c)); // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch` // because it only decomposes to NFD. Naturally, it's faster because of this. - try expect(!try case_fold.canonCaselessMatch(allocator, a, b)); - try expect(try case_fold.canonCaselessMatch(allocator, a, c)); -} -``` -Case folding needs to use the `Normalize` module in order to produce the compatibility -forms for comparison. If you are already using a `Normalize` for other purposes, -`CaseFolding` can borrow it: - -```zig -const CaseFolding = @import("CaseFolding"); -const Normalize = @import("Normalize"); - -test "Initialize With a Normalize" { - const normalize = try Normalize.init(allocator); - // You're responsible for freeing this: - defer normalize.deinit(allocator); - const case_fold = try CaseFolding.initWithNormalize(allocator, normalize); - // This will not free your normalize when it runs first. - defer case_fold.deinit(allocator); + try expect(!try CaseFolding.canonCaselessMatch(allocator, a, b)); + try expect(try CaseFolding.canonCaselessMatch(allocator, a, c)); } ``` -This has a `setupWithNormalize` variant as well, note that this also takes -a `Normalize` struct, and not a pointer to it. ## Display Width of Characters and Strings -When displaying text with a fixed-width font on a terminal screen, it's very -important to know exactly how many columns or cells each character should take. 
-Most characters will use one column, but there are many, like emoji and East- -Asian ideographs that need more space. The `DisplayWidth` module provides -methods for this purpose. It also has methods that use the display width calculation -to `center`, `padLeft`, `padRight`, and `wrap` text. +When displaying text with a fixed-width font on a terminal screen, it's +very important to know exactly how many columns or cells each character +should take. Most characters will use one column, but there are +many, like emoji and East- Asian ideographs that need more space. The +`DisplayWidth` module provides methods for this purpose. It also has +methods that use the display width calculation to `center`, `padLeft`, +`padRight`, and `wrap` text. In your `build.zig`: @@ -520,34 +495,31 @@ In your code: const DisplayWidth = @import("DisplayWidth"); test "Display width" { - const dw = try DisplayWidth.init(allocator); - defer dw.deinit(allocator); - // String display width - try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n")); - try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊")); - try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊")); - try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); - try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나")); + try expectEqual(@as(usize, 5), DisplayWidth.strWidth("Hello\r\n")); + try expectEqual(@as(usize, 8), DisplayWidth.strWidth("Hello 😊")); + try expectEqual(@as(usize, 8), DisplayWidth.strWidth("Héllo 😊")); + try expectEqual(@as(usize, 9), DisplayWidth.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); + try expectEqual(@as(usize, 17), DisplayWidth.strWidth("슬라바 우크라이나")); // Centering text - const centered = try dw.center(allocator, "w😊w", 10, "-"); + const centered = try DisplayWidth.center(allocator, "w😊w", 10, "-"); defer allocator.free(centered); try expectEqualStrings("---w😊w---", centered); // Pad left - const right_aligned = try dw.padLeft(allocator, "abc", 9, "*"); + 
const right_aligned = try DisplayWidth.padLeft(allocator, "abc", 9, "*"); defer allocator.free(right_aligned); try expectEqualStrings("******abc", right_aligned); // Pad right - const left_aligned = try dw.padRight(allocator, "abc", 9, "*"); + const left_aligned = try DisplayWidth.padRight(allocator, "abc", 9, "*"); defer allocator.free(left_aligned); try expectEqualStrings("abc******", left_aligned); // Wrap text const input = "The quick brown fox\r\njumped over the lazy dog!"; - const wrapped = try dw.wrap(allocator, input, 10, 3); + const wrapped = try DisplayWidth.wrap(allocator, input, 10, 3); defer allocator.free(wrapped); const want = \\The quick @@ -560,7 +532,8 @@ test "Display width" { } ``` -This module has build options. The first is `cjk`, which will consider [ambiguous characters](https://www.unicode.org/reports/tr11/tr11-6.html) as double-width. +This module has build options. The first is `cjk`, which will consider +[ambiguous characters][ambig] as double-width. To choose this option, add it to the dependency like so: @@ -570,22 +543,28 @@ const zg = b.dependency("zg", .{ }); ``` -The other options are `c0_width` and `c1_width`. The standard behavior is to treat -C0 and C1 control codes as zero-width, except for delete and backspace, which are --1 (the logic ensures that a `strWidth` is always at least 0). If printing -control codes with replacement characters, it's necessary to assign these a width, -hence the options. When provided these values must fit in an `i4`, this allows -for C1s to be printed as `\u{80}` if desired. +The other options are `c0_width` and `c1_width`. The standard behavior +is to treat C0 and C1 control codes as zero-width, except for delete and +backspace, which are -1 (the logic ensures that a `strWidth` is always +at least 0). If printing control codes with replacement characters, +it's necessary to assign these a width, hence the options. 
When +provided, these values must fit in an `i4`; this allows for C1s to be +printed as `\u{80}` if desired. + +`DisplayWidth` uses the `Graphemes` module internally. +If you already have one, it can be borrowed using +`DisplayWidth.initWithGraphemes(allocator, graphemes)` in the same +fashion as shown for `CaseFolding` and `Normalize`. -`DisplayWidth` uses the `Graphemes` module internally. If you already have one, -it can be borrowed using `DisplayWidth.initWithGraphemes(allocator, graphemes)` -in the same fashion as shown for `CaseFolding` and `Normalize`. + +[ambig]: https://www.unicode.org/reports/tr11/tr11-6.html ## Scripts -Unicode categorizes code points by the Script in which they belong. A Script -collects letters and other symbols that belong to a particular writing system. -You can detect the Script for a code point with the `Scripts` module. +Unicode categorizes code points by the Script in which they belong. A +Script collects letters and other symbols that belong to a particular +writing system. You can detect the Script for a code point with the +`Scripts` module. In your `build.zig`: @@ -596,23 +575,21 @@ exe.root_module.addImport("Scripts", zg.module("Scripts")); In your code: ```zig -const Scripts= @import("Scripts"); +const Scripts = @import("Scripts"); test "Scripts" { - const scripts = try Scripts.init(allocator); - defer scripts.deinit(allocator); - // To see the full list of Scripts, look at the // `src/Scripts.zig` file. They are list in an enum. - try expect(scripts.script('A') == .Latin); - try expect(scripts.script('Ω') == .Greek); - try expect(scripts.script('צ') == .Hebrew); + try expect(Scripts.script('A') == .Latin); + try expect(Scripts.script('Ω') == .Greek); + try expect(Scripts.script('צ') == .Hebrew); } ``` ## Emoji -To get information about emoji and emoji-like characters, use the `Emoji` module. +To get information about emoji and emoji-like characters, use the +`Emoji` module. 
In your `build.zig`: @@ -626,15 +603,12 @@ In your code: const Emoji = @import("Emoji"); test "Emoji" { - const emoji = try Emoji.init(allocator); - defer emoji.deinit(allocator); - - try expect(emoji.isEmoji(0x1F415)); // 🐕 - try expect(emoji.isEmojiPresentation(0x1F408)); // 🐈 - try expect(emoji.isEmojiModifier(0x1F3FF)); // 🏿 - try expect(emoji.isEmojiModifierBase(0x1F977)); // 🥷 - try expect(emoji.isEmojiComponent(0x1F9B0)); // 🦰 - try expect(emoji.isExtendedPictographic(0x1F005)); // 🀅 + try expect(Emoji.isEmoji(0x1F415)); // 🐕 + try expect(Emoji.isEmojiPresentation(0x1F408)); // 🐈 + try expect(Emoji.isEmojiModifier(0x1F3FF)); // 🏿 + try expect(Emoji.isEmojiModifierBase(0x1F977)); // 🥷 + try expect(Emoji.isEmojiComponent(0x1F9B0)); // 🦰 + try expect(Emoji.isExtendedPictographic(0x1F005)); // 🀅 } ``` -- cgit v1.2.3