diff options
Diffstat (limited to 'README.md')
| -rw-r--r-- | README.md | 334 |
1 files changed, 154 insertions, 180 deletions
| @@ -12,6 +12,10 @@ The Unicode version supported by zg is `16.0.0`. | |||
| 12 | 12 | ||
| 13 | The minimum Zig version required is `0.15.2`. | 13 | The minimum Zig version required is `0.15.2`. |
| 14 | 14 | ||
| 15 | The official release of `zg 0.16` will require Zig `0.16.x`, whatever | ||
| 16 | `x` is official this time. The last beta release will be kept around | ||
| 17 | for those who don't want to bump Zig versions right away. | ||
| 18 | |||
| 15 | 19 | ||
| 16 | ## Integrating zg into your Zig Project | 20 | ## Integrating zg into your Zig Project |
| 17 | 21 | ||
| @@ -19,7 +23,7 @@ You first need to add zg as a dependency in your `build.zig.zon` file. In your | |||
| 19 | Zig project's root directory, run: | 23 | Zig project's root directory, run: |
| 20 | 24 | ||
| 21 | ```plain | 25 | ```plain |
| 22 | zig fetch --save https://codeberg.org/atman/zg/archive/v0.15.3.tar.gz | 26 | zig fetch --save https://codeberg.org/atman/zg/archive/v0.16.0-beta1.tar.gz |
| 23 | ``` | 27 | ``` |
| 24 | 28 | ||
| 25 | Then instantiate the dependency in your `build.zig`: | 29 | Then instantiate the dependency in your `build.zig`: |
| @@ -28,36 +32,41 @@ Then instantiate the dependency in your `build.zig`: | |||
| 28 | const zg = b.dependency("zg", .{}); | 32 | const zg = b.dependency("zg", .{}); |
| 29 | ``` | 33 | ``` |
| 30 | 34 | ||
| 35 | ## Zig Module | ||
| 31 | 36 | ||
| 32 | ## A Modular Approach | 37 | The `zg` package has classically been structured as a collection |
| 38 | of mix-and-match modules. This approach is still available, just | ||
| 39 | supplemented with a module-of-modules, also called `zg`. | ||
| 33 | 40 | ||
| 34 | zg is a modular library. This approach minimizes binary file size and memory | 41 | For historical reasons, many of the submodules use `TypeCase`, despite |
| 35 | requirements by only including the Unicode data required for the specified module. | 42 | the fact that they no longer require instantiation. Reflecting this, |
| 36 | The following sections describe the various modules and their specific use case. | 43 | the names of the modules in the `zg` scope are all `container_case`. |
| 37 | 44 | ||
| 45 | To use in this fashion, import like so: | ||
| 38 | 46 | ||
| 39 | ### Init and Setup | 47 | ```zig |
| 48 | exe.root_module.addImport("zg", zg.module("zg")); | ||
| 49 | ``` | ||
| 40 | 50 | ||
| 41 | The code examples will show the use of `Module.init(allocator)` to create the | 51 | Rather than trying to split the difference, the README will reflect use |
| 42 | various modules. All of the allocating modules have a `setup` variant, which | 52 | of `zg` on a submodule basis. Note that any configurations discussed can |
| 43 | takes a pointer and allocates in-place. | 53 | be passed directly to the `zg` dependency import, and will reach that |
| 54 | submodule accordingly. | ||
| 44 | 55 | ||
| 45 | Example use: | ||
| 46 | 56 | ||
| 47 | ```zig | 57 | ### The Modular Approach |
| 48 | test "Setup form" { | 58 | |
| 49 | var graphemes = try allocator.create(Graphemes); | 59 | `zg` is a modular library. This approach minimizes binary file size and |
| 50 | defer allocator.destroy(graphemes); | 60 | memory requirements by only including the Unicode data required for the |
| 51 | try graphemes.setup(allocator); | 61 | specified module. The following sections describe the various modules |
| 52 | defer graphemes.deinit(allocator); | 62 | and their specific use case. |
| 53 | } | ||
| 54 | ``` | ||
| 55 | 63 | ||
| 56 | 64 | ||
| 57 | ## Code Points | 65 | ## Code Points |
| 58 | 66 | ||
| 59 | In the `code_point` module, you'll find a data structure representing a single code | 67 | In the `code_point` module, you'll find a data structure representing a |
| 60 | point, `CodePoint`, and an `Iterator` to iterate over the code points in a string. | 68 | single code point, `CodePoint`, and an `Iterator` to iterate over the |
| 69 | code points in a string. | ||
| 61 | 70 | ||
| 62 | In your `build.zig`: | 71 | In your `build.zig`: |
| 63 | 72 | ||
| @@ -150,11 +159,8 @@ In your code: | |||
| 150 | const Graphemes = @import("Graphemes"); | 159 | const Graphemes = @import("Graphemes"); |
| 151 | 160 | ||
| 152 | test "Grapheme cluster iterator" { | 161 | test "Grapheme cluster iterator" { |
| 153 | const graph = try Graphemes.init(allocator); | ||
| 154 | defer graph.deinit(allocator); | ||
| 155 | |||
| 156 | const str = "He\u{301}"; // Hé | 162 | const str = "He\u{301}"; // Hé |
| 157 | var iter = graph.iterator(str); | 163 | var iter = Graphemes.iterator(str); |
| 158 | 164 | ||
| 159 | var i: usize = 0; | 165 | var i: usize = 0; |
| 160 | 166 | ||
| @@ -176,8 +182,7 @@ test "Grapheme cluster iterator" { | |||
| 176 | try expectEqualStrings("e\u{301}", gc.bytes(str)); | 182 | try expectEqualStrings("e\u{301}", gc.bytes(str)); |
| 177 | } | 183 | } |
| 178 | } | 184 | } |
| 179 | } | 185 | } |
| 180 | ``` | 186 | ``` |
| 181 | 186 | ||
| 182 | 187 | ||
| 183 | ## Words | 188 | ## Words |
| @@ -211,17 +216,15 @@ In your code: | |||
| 211 | const Words = @import("Words"); | 216 | const Words = @import("Words"); |
| 212 | 217 | ||
| 213 | test "Words" { | 218 | test "Words" { |
| 214 | const wb = try Words.init(testing.allocator); | ||
| 215 | defer wb.deinit(testing.allocator); | ||
| 216 | const word_str = "Metonym Μετωνύμιο メトニム"; | 219 | const word_str = "Metonym Μετωνύμιο メトニム"; |
| 217 | var w_iter = wb.iterator(word_str); | 220 | var w_iter = Words.iterator(word_str); |
| 218 | try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); | 221 | try testing.expectEqualStrings("Metonym", w_iter.next().?.bytes(word_str)); |
| 219 | // Spaces are "words" too! | 222 | // Spaces are "words" too! |
| 220 | try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); | 223 | try testing.expectEqualStrings(" ", w_iter.next().?.bytes(word_str)); |
| 221 | const in_greek = w_iter.next().?; | 224 | const in_greek = w_iter.next().?; |
| 222 | // wordAtIndex doesn't care if the index is valid for a codepoint: | 225 | // wordAtIndex doesn't care if the index is valid for a codepoint: |
| 223 | for (in_greek.offset..in_greek.offset + in_greek.len) |i| { | 226 | for (in_greek.offset..in_greek.offset + in_greek.len) |i| { |
| 224 | const at_index = wb.wordAtIndex(word_str, i).bytes(word_str); | 227 | const at_index = Words.wordAtIndex(word_str, i).bytes(word_str); |
| 225 | try testing.expectEqualStrings("Μετωνύμιο", at_index); | 228 | try testing.expectEqualStrings("Μετωνύμιο", at_index); |
| 226 | } | 229 | } |
| 227 | _ = w_iter.next(); | 230 | _ = w_iter.next(); |
| @@ -231,7 +234,8 @@ test "Words" { | |||
| 231 | 234 | ||
| 232 | ## Unicode General Categories | 235 | ## Unicode General Categories |
| 233 | 236 | ||
| 234 | To detect the general category for a code point, use the `GeneralCategories` module. | 237 | To detect the general category for a code point, use the |
| 238 | `GeneralCategories` module. | ||
| 235 | 239 | ||
| 236 | In your `build.zig`: | 240 | In your `build.zig`: |
| 237 | 241 | ||
| @@ -245,31 +249,29 @@ In your code: | |||
| 245 | const GeneralCategories = @import("GeneralCategories"); | 249 | const GeneralCategories = @import("GeneralCategories"); |
| 246 | 250 | ||
| 247 | test "General Categories" { | 251 | test "General Categories" { |
| 248 | const gen_cat = try GeneralCategories.init(allocator); | ||
| 249 | defer gen_cat.deinit(allocator); | ||
| 250 | |||
| 251 | // The `gc` method returns the abbreviated General Category. | 252 | // The `gc` method returns the abbreviated General Category. |
| 252 | // These abbreviations and descriptive comments can be found | 253 | // These abbreviations and descriptive comments can be found |
| 253 | // in the source file `src/GenCatData.zig` as an enum. | 254 | // in the source file `src/GenCatData.zig` as an enum. |
| 254 | try expect(gen_cat.gc('A') == .Lu); // Lu: uppercase letter | 255 | try expect(GeneralCategories.gc('A') == .Lu); // Lu: uppercase letter |
| 255 | try expect(gen_cat.gc('3') == .Nd); // Nd: decimal number | 256 | try expect(GeneralCategories.gc('3') == .Nd); // Nd: decimal number |
| 256 | 257 | ||
| 257 | // The following are convenience methods for groups of General | 258 | // The following are convenience methods for groups of General |
| 258 | // Categories. For example, all letter categories start with `L`: | 259 | // Categories. For example, all letter categories start with `L`: |
| 259 | // Lu, Ll, Lt, Lo. | 260 | // Lu, Ll, Lt, Lo. |
| 260 | try expect(gen_cat.isControl(0)); | 261 | try expect(GeneralCategories.isControl(0)); |
| 261 | try expect(gen_cat.isLetter('z')); | 262 | try expect(GeneralCategories.isLetter('z')); |
| 262 | try expect(gen_cat.isMark('\u{301}')); | 263 | try expect(GeneralCategories.isMark('\u{301}')); |
| 263 | try expect(gen_cat.isNumber('3')); | 264 | try expect(GeneralCategories.isNumber('3')); |
| 264 | try expect(gen_cat.isPunctuation('[')); | 265 | try expect(GeneralCategories.isPunctuation('[')); |
| 265 | try expect(gen_cat.isSeparator(' ')); | 266 | try expect(GeneralCategories.isSeparator(' ')); |
| 266 | try expect(gen_cat.isSymbol('©')); | 267 | try expect(GeneralCategories.isSymbol('©')); |
| 267 | } | 268 | } |
| 268 | ``` | 269 | ``` |
| 269 | 270 | ||
| 270 | ## Unicode Properties | 271 | ## Unicode Properties |
| 271 | 272 | ||
| 272 | You can detect common properties of a code point with the `Properties` module. | 273 | You can detect common properties of a code point with the `Properties` |
| 274 | module. | ||
| 273 | 275 | ||
| 274 | In your `build.zig`: | 276 | In your `build.zig`: |
| 275 | 277 | ||
| @@ -282,48 +284,46 @@ In your code: | |||
| 282 | ```zig | 284 | ```zig |
| 283 | const Properties = @import("Properties"); | 285 | const Properties = @import("Properties"); |
| 284 | 286 | ||
| 285 | test "Properties" { | 287 | const Properties = @import("Properties"); |
| 286 | const props = try Properties.init(allocator); | ||
| 287 | defer props.deinit(allocator); | ||
| 288 | 288 | ||
| 289 | test "Properties" { | ||
| 289 | // Mathematical symbols and letters. | 290 | // Mathematical symbols and letters. |
| 290 | try expect(props.isMath('+')); | 291 | try expect(Properties.isMath('+')); |
| 291 | // Alphabetic only code points. | 292 | // Alphabetic only code points. |
| 292 | try expect(props.isAlphabetic('Z')); | 293 | try expect(Properties.isAlphabetic('Z')); |
| 293 | // Space, tab, and other separators. | 294 | // Space, tab, and other separators. |
| 294 | try expect(props.isWhitespace(' ')); | 295 | try expect(Properties.isWhitespace(' ')); |
| 295 | // Hexadecimal digits and variations thereof. | 296 | // Hexadecimal digits and variations thereof. |
| 296 | try expect(props.isHexDigit('f')); | 297 | try expect(Properties.isHexDigit('f')); |
| 297 | try expect(!props.isHexDigit('z')); | 298 | try expect(!Properties.isHexDigit('z')); |
| 298 | 299 | ||
| 299 | // Accents, dieresis, and other combining marks. | 300 | // Accents, dieresis, and other combining marks. |
| 300 | try expect(props.isDiacritic('\u{301}')); | 301 | try expect(Properties.isDiacritic('\u{301}')); |
| 301 | 302 | ||
| 302 | // Unicode has a specification for valid identifiers like | 303 | // Unicode has a specification for valid identifiers like |
| 303 | // the ones used in programming and regular expressions. | 304 | // the ones used in programming and regular expressions. |
| 304 | try expect(props.isIdStart('Z')); // Identifier start character | 305 | try expect(Properties.isIdStart('Z')); // Identifier start character |
| 305 | try expect(!props.isIdStart('1')); | 306 | try expect(!Properties.isIdStart('1')); |
| 306 | try expect(props.isIdContinue('1')); | 307 | try expect(Properties.isIdContinue('1')); |
| 307 | 308 | ||
| 308 | // The `X` versions add some code points that can appear after | 309 | // The `X` versions add some code points that can appear after |
| 309 | // normalizing a string. | 310 | // normalizing a string. |
| 310 | try expect(props.isXidStart('\u{b33}')); // Extended identifier start character | 311 | try expect(Properties.isXidStart('\u{b33}')); // Extended identifier start character |
| 311 | try expect(props.isXidContinue('\u{e33}')); | 312 | try expect(Properties.isXidContinue('\u{e33}')); |
| 312 | try expect(!props.isXidStart('1')); | 313 | try expect(!Properties.isXidStart('1')); |
| 313 | 314 | ||
| 314 | // Note surprising Unicode numeric type properties! | 315 | // Note surprising Unicode numeric type properties! |
| 315 | try expect(props.isNumeric('\u{277f}')); | 316 | try expect(Properties.isNumeric('\u{277f}')); |
| 316 | try expect(!props.isNumeric('3')); // 3 is not numeric! | 317 | try expect(!Properties.isNumeric('3')); // 3 is not numeric! |
| 317 | try expect(props.isDigit('\u{2070}')); | 318 | try expect(Properties.isDigit('\u{2070}')); |
| 318 | try expect(!props.isDigit('3')); // 3 is not a digit! | 319 | try expect(!Properties.isDigit('3')); // 3 is not a digit! |
| 319 | try expect(props.isDecimal('3')); // 3 is a decimal digit | 320 | try expect(Properties.isDecimal('3')); // 3 is a decimal digit |
| 320 | } | 321 | } |
| 321 | ``` | 322 | ``` |
| 322 | 322 | ||
| 323 | ## Letter Case Detection and Conversion | 323 | ## Letter Case Detection and Conversion |
| 324 | 324 | ||
| 325 | To detect and convert to and from different letter cases, use the `LetterCasing` | 325 | To detect and convert to and from different letter cases, use the |
| 326 | module. | 326 | `LetterCasing` module. |
| 327 | 327 | ||
| 328 | In your `build.zig`: | 328 | In your `build.zig`: |
| 329 | 329 | ||
| @@ -337,37 +337,35 @@ In your code: | |||
| 337 | const LetterCasing = @import("LetterCasing"); | 337 | const LetterCasing = @import("LetterCasing"); |
| 338 | 338 | ||
| 339 | test "LetterCasing" { | 339 | test "LetterCasing" { |
| 340 | const case = try LetterCasing.init(allocator); | ||
| 341 | defer case.deinit(allocator); | ||
| 342 | |||
| 343 | // Upper and lower case. | 340 | // Upper and lower case. |
| 344 | try expect(case.isUpper('A')); | 341 | try expect(LetterCasing.isUpper('A')); |
| 345 | try expect('A' == case.toUpper('a')); | 342 | try expect('A' == LetterCasing.toUpper('a')); |
| 346 | try expect(case.isLower('a')); | 343 | try expect(LetterCasing.isLower('a')); |
| 347 | try expect('a' == case.toLower('A')); | 344 | try expect('a' == LetterCasing.toLower('A')); |
| 348 | 345 | ||
| 349 | // Code points that have case. | 346 | // Code points that have case. |
| 350 | try expect(case.isCased('É')); | 347 | try expect(LetterCasing.isCased('É')); |
| 351 | try expect(!case.isCased('3')); | 348 | try expect(!LetterCasing.isCased('3')); |
| 352 | 349 | ||
| 353 | // Case detection and conversion for strings. | 350 | // Case detection and conversion for strings. |
| 354 | try expect(case.isUpperStr("HELLO 123!")); | 351 | try expect(LetterCasing.isUpperStr("HELLO 123!")); |
| 355 | const ucased = try case.toUpperStr(allocator, "hello 123"); | 352 | const ucased = try LetterCasing.toUpperStr(allocator, "hello 123"); |
| 356 | defer allocator.free(ucased); | 353 | defer allocator.free(ucased); |
| 357 | try expectEqualStrings("HELLO 123", ucased); | 354 | try expectEqualStrings("HELLO 123", ucased); |
| 358 | 355 | ||
| 359 | try expect(case.isLowerStr("hello 123!")); | 356 | try expect(LetterCasing.isLowerStr("hello 123!")); |
| 360 | const lcased = try case.toLowerStr(allocator, "HELLO 123"); | 357 | const lcased = try LetterCasing.toLowerStr(allocator, "HELLO 123"); |
| 361 | defer allocator.free(lcased); | 358 | defer allocator.free(lcased); |
| 362 | try expectEqualStrings("hello 123", lcased); | 359 | try expectEqualStrings("hello 123", lcased); |
| 363 | } | 360 | } |
| 364 | ``` | 361 | ``` |
| 365 | 362 | ||
| 363 | |||
| 366 | ## Normalization | 364 | ## Normalization |
| 367 | 365 | ||
| 368 | Unicode normalization is the process of converting a string into a uniform | 366 | Unicode normalization is the process of converting a string into a |
| 369 | representation that can guarantee a known structure by following a strict set | 367 | uniform representation that can guarantee a known structure by following |
| 370 | of rules. There are four normalization forms: | 368 | a strict set of rules. There are four normalization forms: |
| 371 | 369 | ||
| 372 | **Canonical Composition (NFC)** | 370 | **Canonical Composition (NFC)** |
| 373 | : The most compact representation obtained by first | 371 | : The most compact representation obtained by first |
| @@ -400,52 +398,51 @@ In your code: | |||
| 400 | const Normalize = @import("Normalize"); | 398 | const Normalize = @import("Normalize"); |
| 401 | 399 | ||
| 402 | test "Normalize" { | 400 | test "Normalize" { |
| 403 | const normalize = try Normalize.init(allocator); | ||
| 404 | defer normalize.deinit(allocator); | ||
| 405 | 401 | ||
| 406 | // NFC: Canonical composition | 402 | // NFC: Canonical composition |
| 407 | const nfc_result = try normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 403 | const nfc_result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| 408 | defer nfc_result.deinit(allocator); | 404 | defer nfc_result.deinit(allocator); |
| 409 | try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice); | 405 | try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice); |
| 410 | 406 | ||
| 411 | // NFKC: Compatibility composition | 407 | // NFKC: Compatibility composition |
| 412 | const nfkc_result = try normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 408 | const nfkc_result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| 413 | defer nfkc_result.deinit(allocator); | 409 | defer nfkc_result.deinit(allocator); |
| 414 | try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice); | 410 | try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice); |
| 415 | 411 | ||
| 416 | // NFD: Canonical decomposition | 412 | // NFD: Canonical decomposition |
| 417 | const nfd_result = try normalize.nfd(allocator, "Héllo World! \u{3d3}"); | 413 | const nfd_result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}"); |
| 418 | defer nfd_result.deinit(allocator); | 414 | defer nfd_result.deinit(allocator); |
| 419 | try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice); | 415 | try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice); |
| 420 | 416 | ||
| 421 | // NFKD: Compatibility decomposition | 417 | // NFKD: Compatibility decomposition |
| 422 | const nfkd_result = try normalize.nfkd(allocator, "Héllo World! \u{3d3}"); | 418 | const nfkd_result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}"); |
| 423 | defer nfkd_result.deinit(allocator); | 419 | defer nfkd_result.deinit(allocator); |
| 424 | try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice); | 420 | try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice); |
| 425 | 421 | ||
| 426 | // Test for equality of two strings after normalizing to NFC. | 422 | // Test for equality of two strings after normalizing to NFC. |
| 427 | try expect(try normalize.eql(allocator, "foé", "foe\u{0301}")); | 423 | try expect(try Normalize.eql(allocator, "foé", "foe\u{0301}")); |
| 428 | try expect(try normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 424 | try expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| 429 | } | 425 | } |
| 430 | ``` | 426 | ``` |
| 431 | The `Result` returned by normalization functions may or may not be copied from the | ||
| 432 | inputs given. For example, an all-ASCII input does not need to be a copy, and will | ||
| 433 | be a view of the original slice. Calling `result.deinit(allocator)` will only free | ||
| 434 | an allocated `Result`, not one which is a view. Thus it is safe to do | ||
| 435 | unconditionally. | ||
| 436 | 427 | ||
| 437 | This does mean that the validity of a `Result` can depend on the original string | 428 | The `Result` returned by normalization functions may or may not be |
| 438 | staying in memory. To ensure that your `Result` is always a copy, you may call | 429 | copied from the inputs given. For example, an all-ASCII input does not |
| 439 | `try result.toOwned(allocator)`, which will only make a copy if one was not | 430 | need to be a copy, and will be a view of the original slice. Calling |
| 440 | already made. | 431 | `result.deinit(allocator)` will only free an allocated `Result`, not one |
| 432 | which is a view. Thus it is safe to do unconditionally. | ||
| 433 | |||
| 434 | This does mean that the validity of a `Result` can depend on the | ||
| 435 | original string staying in memory. To ensure that your `Result` is | ||
| 436 | always a copy, you may call `try result.toOwned(allocator)`, which will | ||
| 437 | only make a copy if one was not already made. | ||
| 441 | 438 | ||
| 442 | 439 | ||
| 443 | ## Caseless Matching via Case Folding | 440 | ## Caseless Matching via Case Folding |
| 444 | 441 | ||
| 445 | Unicode provides a more efficient way of comparing strings while ignoring letter | 442 | Unicode provides a more efficient way of comparing strings while |
| 446 | case differences: case folding. When you case fold a string, it's converted into a | 443 | ignoring letter case differences: case folding. When you case fold |
| 447 | normalized case form suitable for efficient matching. Use the `CaseFolding` module | 444 | a string, it's converted into a normalized case form suitable for |
| 448 | for this. | 445 | efficient matching. Use the `CaseFolding` module for this. |
| 449 | 446 | ||
| 450 | In your `build.zig`: | 447 | In your `build.zig`: |
| 451 | 448 | ||
| @@ -459,54 +456,32 @@ In your code: | |||
| 459 | const CaseFolding = @import("CaseFolding"); | 456 | const CaseFolding = @import("CaseFolding"); |
| 460 | 457 | ||
| 461 | test "Caseless matching" { | 458 | test "Caseless matching" { |
| 462 | // We need Unicode case fold data. | ||
| 463 | const case_fold = try CaseFolding.init(allocator); | ||
| 464 | defer case_fold.deinit(allocator); | ||
| 465 | |||
| 466 | // `compatCaselessMatch` provides the deepest level of caseless | 459 | // `compatCaselessMatch` provides the deepest level of caseless |
| 467 | // matching because it decomposes fully to NFKD. | 460 | // matching because it decomposes fully to NFKD. |
| 468 | const a = "Héllo World! \u{3d3}"; | 461 | const a = "Héllo World! \u{3d3}"; |
| 469 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | 462 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; |
| 470 | try expect(try case_fold.compatCaselessMatch(allocator, a, b)); | 463 | try expect(try CaseFolding.compatCaselessMatch(allocator, a, b)); |
| 471 | 464 | ||
| 472 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | 465 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; |
| 473 | try expect(try case_fold.compatCaselessMatch(allocator, a, c)); | 466 | try expect(try CaseFolding.compatCaselessMatch(allocator, a, c)); |
| 474 | 467 | ||
| 475 | // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch` | 468 | // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch` |
| 476 | // because it only decomposes to NFD. Naturally, it's faster because of this. | 469 | // because it only decomposes to NFD. Naturally, it's faster because of this. |
| 477 | try expect(!try case_fold.canonCaselessMatch(allocator, a, b)); | 470 | try expect(!try CaseFolding.canonCaselessMatch(allocator, a, b)); |
| 478 | try expect(try case_fold.canonCaselessMatch(allocator, a, c)); | 471 | try expect(try CaseFolding.canonCaselessMatch(allocator, a, c)); |
| 479 | } | ||
| 480 | ``` | ||
| 481 | Case folding needs to use the `Normalize` module in order to produce the compatibility | ||
| 482 | forms for comparison. If you are already using a `Normalize` for other purposes, | ||
| 483 | `CaseFolding` can borrow it: | ||
| 484 | |||
| 485 | ```zig | ||
| 486 | const CaseFolding = @import("CaseFolding"); | ||
| 487 | const Normalize = @import("Normalize"); | ||
| 488 | |||
| 489 | test "Initialize With a Normalize" { | ||
| 490 | const normalize = try Normalize.init(allocator); | ||
| 491 | // You're responsible for freeing this: | ||
| 492 | defer normalize.deinit(allocator); | ||
| 493 | const case_fold = try CaseFolding.initWithNormalize(allocator, normalize); | ||
| 494 | // This will not free your normalize when it runs first. | ||
| 495 | defer case_fold.deinit(allocator); | ||
| 496 | } | 472 | } |
| 497 | ``` | 473 | ``` |
| 498 | This has a `setupWithNormalize` variant as well, note that this also takes | ||
| 499 | a `Normalize` struct, and not a pointer to it. | ||
| 500 | 474 | ||
| 501 | 475 | ||
| 502 | ## Display Width of Characters and Strings | 476 | ## Display Width of Characters and Strings |
| 503 | 477 | ||
| 504 | When displaying text with a fixed-width font on a terminal screen, it's very | 478 | When displaying text with a fixed-width font on a terminal screen, it's |
| 505 | important to know exactly how many columns or cells each character should take. | 479 | very important to know exactly how many columns or cells each character |
| 506 | Most characters will use one column, but there are many, like emoji and East- | 480 | should take. Most characters will use one column, but there are |
| 507 | Asian ideographs that need more space. The `DisplayWidth` module provides | 481 | many, like emoji and East-Asian ideographs that need more space. The |
| 508 | methods for this purpose. It also has methods that use the display width calculation | 482 | `DisplayWidth` module provides methods for this purpose. It also has |
| 509 | to `center`, `padLeft`, `padRight`, and `wrap` text. | 483 | methods that use the display width calculation to `center`, `padLeft`, |
| 484 | `padRight`, and `wrap` text. | ||
| 510 | 485 | ||
| 511 | In your `build.zig`: | 486 | In your `build.zig`: |
| 512 | 487 | ||
| @@ -520,34 +495,31 @@ In your code: | |||
| 520 | const DisplayWidth = @import("DisplayWidth"); | 495 | const DisplayWidth = @import("DisplayWidth"); |
| 521 | 496 | ||
| 522 | test "Display width" { | 497 | test "Display width" { |
| 523 | const dw = try DisplayWidth.init(allocator); | ||
| 524 | defer dw.deinit(allocator); | ||
| 525 | |||
| 526 | // String display width | 498 | // String display width |
| 527 | try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n")); | 499 | try expectEqual(@as(usize, 5), DisplayWidth.strWidth("Hello\r\n")); |
| 528 | try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊")); | 500 | try expectEqual(@as(usize, 8), DisplayWidth.strWidth("Hello 😊")); |
| 529 | try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊")); | 501 | try expectEqual(@as(usize, 8), DisplayWidth.strWidth("Héllo 😊")); |
| 530 | try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); | 502 | try expectEqual(@as(usize, 9), DisplayWidth.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); |
| 531 | try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나")); | 503 | try expectEqual(@as(usize, 17), DisplayWidth.strWidth("슬라바 우크라이나")); |
| 532 | 504 | ||
| 533 | // Centering text | 505 | // Centering text |
| 534 | const centered = try dw.center(allocator, "w😊w", 10, "-"); | 506 | const centered = try DisplayWidth.center(allocator, "w😊w", 10, "-"); |
| 535 | defer allocator.free(centered); | 507 | defer allocator.free(centered); |
| 536 | try expectEqualStrings("---w😊w---", centered); | 508 | try expectEqualStrings("---w😊w---", centered); |
| 537 | 509 | ||
| 538 | // Pad left | 510 | // Pad left |
| 539 | const right_aligned = try dw.padLeft(allocator, "abc", 9, "*"); | 511 | const right_aligned = try DisplayWidth.padLeft(allocator, "abc", 9, "*"); |
| 540 | defer allocator.free(right_aligned); | 512 | defer allocator.free(right_aligned); |
| 541 | try expectEqualStrings("******abc", right_aligned); | 513 | try expectEqualStrings("******abc", right_aligned); |
| 542 | 514 | ||
| 543 | // Pad right | 515 | // Pad right |
| 544 | const left_aligned = try dw.padRight(allocator, "abc", 9, "*"); | 516 | const left_aligned = try DisplayWidth.padRight(allocator, "abc", 9, "*"); |
| 545 | defer allocator.free(left_aligned); | 517 | defer allocator.free(left_aligned); |
| 546 | try expectEqualStrings("abc******", left_aligned); | 518 | try expectEqualStrings("abc******", left_aligned); |
| 547 | 519 | ||
| 548 | // Wrap text | 520 | // Wrap text |
| 549 | const input = "The quick brown fox\r\njumped over the lazy dog!"; | 521 | const input = "The quick brown fox\r\njumped over the lazy dog!"; |
| 550 | const wrapped = try dw.wrap(allocator, input, 10, 3); | 522 | const wrapped = try DisplayWidth.wrap(allocator, input, 10, 3); |
| 551 | defer allocator.free(wrapped); | 523 | defer allocator.free(wrapped); |
| 552 | const want = | 524 | const want = |
| 553 | \\The quick | 525 | \\The quick |
| @@ -560,7 +532,8 @@ test "Display width" { | |||
| 560 | } | 532 | } |
| 561 | ``` | 533 | ``` |
| 562 | 534 | ||
| 563 | This module has build options. The first is `cjk`, which will consider [ambiguous characters](https://www.unicode.org/reports/tr11/tr11-6.html) as double-width. | 535 | This module has build options. The first is `cjk`, which will consider |
| 536 | [ambiguous characters][ambig] as double-width. | ||
| 564 | 537 | ||
| 565 | To choose this option, add it to the dependency like so: | 538 | To choose this option, add it to the dependency like so: |
| 566 | 539 | ||
| @@ -570,22 +543,28 @@ const zg = b.dependency("zg", .{ | |||
| 570 | }); | 543 | }); |
| 571 | ``` | 544 | ``` |
| 572 | 545 | ||
| 573 | The other options are `c0_width` and `c1_width`. The standard behavior is to treat | 546 | The other options are `c0_width` and `c1_width`. The standard behavior |
| 574 | C0 and C1 control codes as zero-width, except for delete and backspace, which are | 547 | is to treat C0 and C1 control codes as zero-width, except for delete and |
| 575 | -1 (the logic ensures that a `strWidth` is always at least 0). If printing | 548 | backspace, which are -1 (the logic ensures that a `strWidth` is always |
| 576 | control codes with replacement characters, it's necessary to assign these a width, | 549 | at least 0). If printing control codes with replacement characters, |
| 577 | hence the options. When provided, these values must fit in an `i4`; this allows | 550 | it's necessary to assign these a width, hence the options. When |
| 578 | for C1s to be printed as `\u{80}` if desired. | 551 | provided, these values must fit in an `i4`; this allows for C1s to be |
| 552 | printed as `\u{80}` if desired. | ||
| 553 | |||
| 554 | `DisplayWidth` uses the `Graphemes` module internally. | ||
| 555 | If you already have one, it can be borrowed using | ||
| 556 | `DisplayWidth.initWithGraphemes(allocator, graphemes)` in the same | ||
| 557 | fashion as shown for `CaseFolding` and `Normalize`. | ||
| 579 | 558 | ||
| 580 | `DisplayWidth` uses the `Graphemes` module internally. If you already have one, | 559 | |
| 581 | it can be borrowed using `DisplayWidth.initWithGraphemes(allocator, graphemes)` | 560 | [ambig]: https://www.unicode.org/reports/tr11/tr11-6.html |
| 582 | in the same fashion as shown for `CaseFolding` and `Normalize`. | ||
| 583 | 561 | ||
| 584 | ## Scripts | 562 | ## Scripts |
| 585 | 563 | ||
| 586 | Unicode categorizes code points by the Script in which they belong. A Script | 564 | Unicode categorizes code points by the Script in which they belong. A |
| 587 | collects letters and other symbols that belong to a particular writing system. | 565 | Script collects letters and other symbols that belong to a particular |
| 588 | You can detect the Script for a code point with the `Scripts` module. | 566 | writing system. You can detect the Script for a code point with the |
| 567 | `Scripts` module. | ||
| 589 | 568 | ||
| 590 | In your `build.zig`: | 569 | In your `build.zig`: |
| 591 | 570 | ||
| @@ -596,23 +575,21 @@ exe.root_module.addImport("Scripts", zg.module("Scripts")); | |||
| 596 | In your code: | 575 | In your code: |
| 597 | 576 | ||
| 598 | ```zig | 577 | ```zig |
| 599 | const Scripts= @import("Scripts"); | 578 | const Scripts = @import("Scripts"); |
| 600 | 579 | ||
| 601 | test "Scripts" { | 580 | test "Scripts" { |
| 602 | const scripts = try Scripts.init(allocator); | ||
| 603 | defer scripts.deinit(allocator); | ||
| 604 | |||
| 605 | // To see the full list of Scripts, look at the | 581 | // To see the full list of Scripts, look at the |
| 606 | // `src/Scripts.zig` file. They are listed in an enum. | 582 | // `src/Scripts.zig` file. They are listed in an enum. |
| 607 | try expect(scripts.script('A') == .Latin); | 583 | try expect(Scripts.script('A') == .Latin); |
| 608 | try expect(scripts.script('Ω') == .Greek); | 584 | try expect(Scripts.script('Ω') == .Greek); |
| 609 | try expect(scripts.script('צ') == .Hebrew); | 585 | try expect(Scripts.script('צ') == .Hebrew); |
| 610 | } | 586 | } |
| 611 | ``` | 587 | ``` |
| 612 | 588 | ||
| 613 | ## Emoji | 589 | ## Emoji |
| 614 | 590 | ||
| 615 | To get information about emoji and emoji-like characters, use the `Emoji` module. | 591 | To get information about emoji and emoji-like characters, use the |
| 592 | `Emoji` module. | ||
| 616 | 593 | ||
| 617 | In your `build.zig`: | 594 | In your `build.zig`: |
| 618 | 595 | ||
| @@ -626,15 +603,12 @@ In your code: | |||
| 626 | const Emoji = @import("Emoji"); | 603 | const Emoji = @import("Emoji"); |
| 627 | 604 | ||
| 628 | test "Emoji" { | 605 | test "Emoji" { |
| 629 | const emoji = try Emoji.init(allocator); | 606 | try expect(Emoji.isEmoji(0x1F415)); // 🐕 |
| 630 | defer emoji.deinit(allocator); | 607 | try expect(Emoji.isEmojiPresentation(0x1F408)); // 🐈 |
| 631 | 608 | try expect(Emoji.isEmojiModifier(0x1F3FF)); // 🏿 |
| 632 | try expect(emoji.isEmoji(0x1F415)); // 🐕 | 609 | try expect(Emoji.isEmojiModifierBase(0x1F977)); // 🥷 |
| 633 | try expect(emoji.isEmojiPresentation(0x1F408)); // 🐈 | 610 | try expect(Emoji.isEmojiComponent(0x1F9B0)); // 🦰 |
| 634 | try expect(emoji.isEmojiModifier(0x1F3FF)); // 🏿 | 611 | try expect(Emoji.isExtendedPictographic(0x1F005)); // 🀅 |
| 635 | try expect(emoji.isEmojiModifierBase(0x1F977)); // 🥷 | ||
| 636 | try expect(emoji.isEmojiComponent(0x1F9B0)); // 🦰 | ||
| 637 | try expect(emoji.isExtendedPictographic(0x1F005)); // 🀅 | ||
| 638 | } | 612 | } |
| 639 | ``` | 613 | ``` |
| 640 | 614 | ||