diff options
| author | 2024-03-31 09:59:51 -0400 | |
|---|---|---|
| committer | 2024-03-31 09:59:51 -0400 | |
| commit | 200c617c865a5952f0bd12378802cc06ea3eb1c2 (patch) | |
| tree | 2af456d4c62a08330cf961e7237f083fc4566370 /README.md | |
| parent | Split out Unicode tests to separate file (diff) | |
| download | zg-200c617c865a5952f0bd12378802cc06ea3eb1c2.tar.gz zg-200c617c865a5952f0bd12378802cc06ea3eb1c2.tar.xz zg-200c617c865a5952f0bd12378802cc06ea3eb1c2.zip | |
Updated README
Diffstat (limited to 'README.md')
| -rw-r--r-- | README.md | 537 |
1 files changed, 537 insertions, 0 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..d4fc8f6 --- /dev/null +++ b/README.md | |||
| @@ -0,0 +1,537 @@ | |||
| 1 | # zg | ||
| 2 | zg provides Unicode text processing for Zig projects. | ||
| 3 | |||
| 4 | ## Unicode Version | ||
| 5 | The Unicode version supported by zg is 15.1.0. | ||
| 6 | |||
| 7 | ## Zig Version | ||
| 8 | The minimum Zig version required is 0.12.0-dev.3496+a2df84d0. | ||
| 9 | |||
| 10 | ## Integrating zg into your Zig Project | ||
| 11 | You first need to add zg as a dependency in your `build.zig.zon` file: | ||
| 12 | |||
| 13 | ```zig | ||
| 14 | .zg = .{ | ||
| 15 | .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz", | ||
| 16 | } | ||
| 17 | ``` | ||
| 18 | |||
| 19 | Then instantiate the dependency in your `build.zig`: | ||
| 20 | |||
| 21 | |||
| 22 | ```zig | ||
| 23 | const zg = b.dependency("zg", .{}); | ||
| 24 | ``` | ||
| 25 | |||
| 26 | ## A Modular Approach | ||
| 27 | zg is a modular library. This approach minimizes binary file size and memory | ||
| 28 | requirements by only including the Unicode data required for the specified module. | ||
| 29 | The following sections describe the various modules and their specific use case. | ||
| 30 | |||
| 31 | ## Code Points | ||
| 32 | In the `code_point` module, you'll find a data structure representing a single code | ||
| 33 | point, `CodePoint`, and an `Iterator` to iterate over the code points in a string. | ||
| 34 | |||
| 35 | In your `build.zig`: | ||
| 36 | |||
| 37 | ```zig | ||
| 38 | exe.root_module.addImport("code_point", zg.module("code_point")); | ||
| 39 | ``` | ||
| 40 | |||
| 41 | In your code: | ||
| 42 | |||
| 43 | ```zig | ||
| 44 | const code_point = @import("code_point"); | ||
| 45 | |||
| 46 | test "Code point iterator" { | ||
| 47 | const str = "Hi 😊"; | ||
| 48 | var iter = code_point.Iterator{ .bytes = str }; | ||
| 49 | var i: usize = 0; | ||
| 50 | |||
| 51 | while (iter.next()) |cp| : (i += 1) { | ||
| 52 | // The `code` field is the actual code point scalar as a `u21`. | ||
| 53 | if (i == 0) try expect(cp.code == 'H'); | ||
| 54 | if (i == 1) try expect(cp.code == 'i'); | ||
| 55 | if (i == 2) try expect(cp.code == ' '); | ||
| 56 | |||
| 57 | if (i == 3) { | ||
| 58 | try expect(cp.code == '😊'); | ||
| 59 | |||
| 60 | // The `offset` field is the byte offset in the | ||
| 61 | // source string. | ||
| 62 | try expect(cp.offset == 3); | ||
| 63 | |||
| 64 | // The `len` field is the length in bytes of the | ||
| 65 | // code point in the source string. | ||
| 66 | try expect(cp.len == 4); | ||
| 67 | } | ||
| 68 | } | ||
| 69 | } | ||
| 70 | ``` | ||
| 71 | |||
| 72 | ## Grapheme Clusters | ||
| 73 | Many characters are composed from more than one code point. These are known as | ||
| 74 | Grapheme Clusters and the `grapheme` module has a data structure to represent | ||
| 75 | them, `Grapheme`, and an `Iterator` to iterate over them in a string. | ||
| 76 | |||
| 77 | In your `build.zig`: | ||
| 78 | |||
| 79 | ```zig | ||
| 80 | exe.root_module.addImport("grapheme", zg.module("grapheme")); | ||
| 81 | ``` | ||
| 82 | |||
| 83 | In your code: | ||
| 84 | |||
| 85 | ```zig | ||
| 86 | const grapheme = @import("grapheme"); | ||
| 87 | |||
| 88 | test "Grapheme cluster iterator" { | ||
| 89 | // We need some Unicode data to process Grapheme Clusters. | ||
| 90 | const gd = try grapheme.GraphemeData.init(allocator); | ||
| 91 | defer gd.deinit(); | ||
| 92 | |||
| 93 | const str = "He\u{301}"; // Hé | ||
| 94 | var iter = grapheme.Iterator.init(str, &gd); | ||
| 95 | |||
| 96 | var i: usize = 0; | ||
| 97 | |||
| 98 | while (iter.next()) |gc| : (i += 1) { | ||
| 99 | // The `len` field is the length in bytes of the | ||
| 100 | // grapheme cluster in the source string. | ||
| 101 | if (i == 0) try expect(gc.len == 1); | ||
| 102 | |||
| 103 | if (i == 1) { | ||
| 104 | try expect(gc.len == 3); | ||
| 105 | |||
| 106 | // The `offset` in bytes of the grapheme cluster | ||
| 107 | // in the source string. | ||
| 108 | try expect(gc.offset == 1); | ||
| 109 | |||
| 110 | // The `bytes` method returns the slice of bytes | ||
| 111 | // that comprise this grapheme cluster in the | ||
| 112 | // source string `str`. | ||
| 113 | try expectEqualStrings("e\u{301}", gc.bytes(str)); | ||
| 114 | } | ||
| 115 | } | ||
| 116 | } | ||
| 117 | ``` | ||
| 118 | |||
| 119 | ## Unicode General Categories | ||
| 120 | To detect the general category for a code point, use the `GenCatData` module. | ||
| 121 | |||
| 122 | In your `build.zig`: | ||
| 123 | |||
| 124 | ```zig | ||
| 125 | exe.root_module.addImport("GenCatData", zg.module("GenCatData")); | ||
| 126 | ``` | ||
| 127 | |||
| 128 | In your code: | ||
| 129 | |||
| 130 | ```zig | ||
| 131 | const GenCatData = @import("GenCatData"); | ||
| 132 | |||
| 133 | test "General Category" { | ||
| 134 | const gcd = try GenCatData.init(allocator); | ||
| 135 | defer gcd.deinit(); | ||
| 136 | |||
| 137 | // The `gc` method returns the abbreviated General Category. | ||
| 138 | // These abbreviations and descriptive comments can be found | ||
| 139 | // in the source file `src/GenCatData.zig` as an enum. | ||
| 140 | try expect(gcd.gc('A') == .Lu); // Lu: uppercase letter | ||
| 141 | try expect(gcd.gc('3') == .Nd); // Nd: decimal number | ||
| 142 | |||
| 143 | // The following are convenience methods for groups of General | ||
| 144 | // Categories. For example, all letter categories start with `L`: | ||
| 145 | // Lu, Ll, Lt, Lm, Lo. | ||
| 146 | try expect(gcd.isControl(0)); | ||
| 147 | try expect(gcd.isLetter('z')); | ||
| 148 | try expect(gcd.isMark('\u{301}')); | ||
| 149 | try expect(gcd.isNumber('3')); | ||
| 150 | try expect(gcd.isPunctuation('[')); | ||
| 151 | try expect(gcd.isSeparator(' ')); | ||
| 152 | try expect(gcd.isSymbol('©')); | ||
| 153 | } | ||
| 154 | ``` | ||
| 155 | |||
| 156 | ## Unicode Properties | ||
| 157 | You can detect common properties of a code point with the `PropsData` module. | ||
| 158 | |||
| 159 | In your `build.zig`: | ||
| 160 | |||
| 161 | ```zig | ||
| 162 | exe.root_module.addImport("PropsData", zg.module("PropsData")); | ||
| 163 | ``` | ||
| 164 | |||
| 165 | In your code: | ||
| 166 | |||
| 167 | ```zig | ||
| 168 | const PropsData = @import("PropsData"); | ||
| 169 | |||
| 170 | test "Properties" { | ||
| 171 | const pd = try PropsData.init(allocator); | ||
| 172 | defer pd.deinit(); | ||
| 173 | |||
| 174 | // Mathematical symbols and letters. | ||
| 175 | try expect(pd.isMath('+')); | ||
| 176 | // Alphabetic only code points. | ||
| 177 | try expect(pd.isAlphabetic('Z')); | ||
| 178 | // Space, tab, and other separators. | ||
| 179 | try expect(pd.isWhitespace(' ')); | ||
| 180 | // Hexadecimal digits and variations thereof. | ||
| 181 | try expect(pd.isHexDigit('f')); | ||
| 182 | try expect(!pd.isHexDigit('z')); | ||
| 183 | |||
| 184 | // Accents, dieresis, and other combining marks. | ||
| 185 | try expect(pd.isDiacritic('\u{301}')); | ||
| 186 | |||
| 187 | // Unicode has a specification for valid identifiers like | ||
| 188 | // the ones used in programming and regular expressions. | ||
| 189 | try expect(pd.isIdStart('Z')); // Identifier start character | ||
| 190 | try expect(!pd.isIdStart('1')); | ||
| 191 | try expect(pd.isIdContinue('1')); | ||
| 192 | |||
| 193 | // The `X` versions add some code points that can appear after | ||
| 194 | // normalizing a string. | ||
| 195 | try expect(pd.isXidStart('\u{b33}')); // Extended identifier start character | ||
| 196 | try expect(pd.isXidContinue('\u{e33}')); | ||
| 197 | try expect(!pd.isXidStart('1')); | ||
| 198 | |||
| 199 | // Note surprising Unicode numeric type properties! | ||
| 200 | try expect(pd.isNumeric('\u{277f}')); | ||
| 201 | try expect(!pd.isNumeric('3')); // 3 is not numeric! | ||
| 202 | try expect(pd.isDigit('\u{2070}')); | ||
| 203 | try expect(!pd.isDigit('3')); // 3 is not a digit! | ||
| 204 | try expect(pd.isDecimal('3')); // 3 is a decimal digit | ||
| 205 | } | ||
| 206 | ``` | ||
| 207 | |||
| 208 | ## Letter Case Detection and Conversion | ||
| 209 | To detect and convert to and from different letter cases, use the `CaseData` | ||
| 210 | module. | ||
| 211 | |||
| 212 | In your `build.zig`: | ||
| 213 | |||
| 214 | ```zig | ||
| 215 | exe.root_module.addImport("CaseData", zg.module("CaseData")); | ||
| 216 | ``` | ||
| 217 | |||
| 218 | In your code: | ||
| 219 | |||
| 220 | ```zig | ||
| 221 | const CaseData = @import("CaseData"); | ||
| 222 | |||
| 223 | test "Case" { | ||
| 224 | const cd = try CaseData.init(allocator); | ||
| 225 | defer cd.deinit(); | ||
| 226 | |||
| 227 | // Upper and lower case. | ||
| 228 | try expect(cd.isUpper('A')); | ||
| 229 | try expect('A' == cd.toUpper('a')); | ||
| 230 | try expect(cd.isLower('a')); | ||
| 231 | try expect('a' == cd.toLower('A')); | ||
| 232 | |||
| 233 | // Code points that have case. | ||
| 234 | try expect(cd.isCased('É')); | ||
| 235 | try expect(!cd.isCased('3')); | ||
| 236 | |||
| 237 | // Case detection and conversion for strings. | ||
| 238 | try expect(cd.isUpperStr("HELLO 123!")); | ||
| 239 | const ucased = try cd.toUpperStr(allocator, "hello 123"); | ||
| 240 | defer allocator.free(ucased); | ||
| 241 | try expectEqualStrings("HELLO 123", ucased); | ||
| 242 | |||
| 243 | try expect(cd.isLowerStr("hello 123!")); | ||
| 244 | const lcased = try cd.toLowerStr(allocator, "HELLO 123"); | ||
| 245 | defer allocator.free(lcased); | ||
| 246 | try expectEqualStrings("hello 123", lcased); | ||
| 247 | } | ||
| 248 | ``` | ||
| 249 | |||
| 250 | ## Normalization | ||
| 251 | Unicode normalization is the process of converting a string into a uniform | ||
| 252 | representation that can guarantee a known structure by following a strict set | ||
| 253 | of rules. There are four normalization forms: | ||
| 254 | |||
| 255 | Canonical Composition (NFC) | ||
| 256 | : The most compact representation obtained by first | ||
| 257 | decomposing to Canonical Decomposition and then composing to NFC. | ||
| 258 | |||
| 259 | Compatibility Composition (NFKC) | ||
| 260 | : The most comprehensive composition obtained | ||
| 261 | by first decomposing to Compatibility Decomposition and then composing to NFKC. | ||
| 262 | |||
| 263 | Canonical Decomposition (NFD) | ||
| 264 | : Only code points with canonical decompositions | ||
| 265 | are decomposed. This is a more compact and faster decomposition but will not | ||
| 266 | provide the most comprehensive normalization possible. | ||
| 267 | |||
| 268 | Compatibility Decomposition (NFKD) | ||
| 269 | : The most comprehensive decomposition method | ||
| 270 | where both canonical and compatibility decompositions are performed recursively. | ||
| 271 | |||
| 272 | zg has methods to produce all four normalization forms in the `Normalize` module. | ||
| 273 | |||
| 274 | In your `build.zig`: | ||
| 275 | |||
| 276 | ```zig | ||
| 277 | exe.root_module.addImport("Normalize", zg.module("Normalize")); | ||
| 278 | ``` | ||
| 279 | |||
| 280 | In your code: | ||
| 281 | |||
| 282 | ```zig | ||
| 283 | const Normalize = @import("Normalize"); | ||
| 284 | |||
| 285 | test "Normalization" { | ||
| 286 | // We need lots of Unicode data for normalization. | ||
| 287 | var norm_data = try Normalize.NormData.init(allocator); | ||
| 288 | defer norm_data.deinit(); | ||
| 289 | |||
| 290 | // The `Normalize` structure takes a pointer to the data. | ||
| 291 | const n = Normalize{ .norm_data = &norm_data }; | ||
| 292 | |||
| 293 | // NFC: Canonical composition | ||
| 294 | const nfc_result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | ||
| 295 | defer nfc_result.deinit(); | ||
| 296 | try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice); | ||
| 297 | |||
| 298 | // NFKC: Compatibility composition | ||
| 299 | const nfkc_result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | ||
| 300 | defer nfkc_result.deinit(); | ||
| 301 | try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice); | ||
| 302 | |||
| 303 | // NFD: Canonical decomposition | ||
| 304 | const nfd_result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | ||
| 305 | defer nfd_result.deinit(); | ||
| 306 | try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice); | ||
| 307 | |||
| 308 | // NFKD: Compatibility decomposition | ||
| 309 | const nfkd_result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | ||
| 310 | defer nfkd_result.deinit(); | ||
| 311 | try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice); | ||
| 312 | |||
| 313 | // Test for equality of two strings after normalizing to NFC. | ||
| 314 | try expect(try n.eql(allocator, "foé", "foe\u{0301}")); | ||
| 315 | try expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | ||
| 316 | } | ||
| 317 | ``` | ||
| 318 | |||
| 319 | ## Caseless Matching via Case Folding | ||
| 320 | Unicode provides a more efficient way of comparing strings while ignoring letter | ||
| 321 | case differences: case folding. When you case fold a string, it's converted into a | ||
| 322 | normalized case form suitable for efficient matching. Use the `CaseFold` module | ||
| 323 | for this. | ||
| 324 | |||
| 325 | In your `build.zig`: | ||
| 326 | |||
| 327 | ```zig | ||
| 328 | exe.root_module.addImport("Normalize", zg.module("Normalize")); | ||
| 329 | exe.root_module.addImport("CaseFold", zg.module("CaseFold")); | ||
| 330 | ``` | ||
| 331 | |||
| 332 | In your code: | ||
| 333 | |||
| 334 | ```zig | ||
| 335 | const Normalize = @import("Normalize"); | ||
| 336 | const CaseFold = @import("CaseFold"); | ||
| 337 | |||
| 338 | test "Caseless matching" { | ||
| 339 | // We need to normalize during the matching process. | ||
| 340 | var norm_data = try Normalize.NormData.init(allocator); | ||
| 341 | defer norm_data.deinit(); | ||
| 342 | const n = Normalize{ .norm_data = &norm_data }; | ||
| 343 | |||
| 344 | // We need Unicode case fold data. | ||
| 345 | const cfd = try CaseFold.FoldData.init(allocator); | ||
| 346 | defer cfd.deinit(); | ||
| 347 | |||
| 348 | // The `CaseFold` structure takes a pointer to the data. | ||
| 349 | const cf = CaseFold{ .fold_data = &cfd }; | ||
| 350 | |||
| 351 | // `compatCaselessMatch` provides the deepest level of caseless | ||
| 352 | // matching because it decomposes fully to NFKD. | ||
| 353 | const a = "Héllo World! \u{3d3}"; | ||
| 354 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | ||
| 355 | try expect(try cf.compatCaselessMatch(allocator, &n, a, b)); | ||
| 356 | |||
| 357 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | ||
| 358 | try expect(try cf.compatCaselessMatch(allocator, &n, a, c)); | ||
| 359 | |||
| 360 | // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch` | ||
| 361 | // because it only decomposes to NFD. Naturally, it's faster because of this. | ||
| 362 | try expect(!try cf.canonCaselessMatch(allocator, &n, a, b)); | ||
| 363 | try expect(try cf.canonCaselessMatch(allocator, &n, a, c)); | ||
| 364 | } | ||
| 365 | ``` | ||
| 366 | |||
| 367 | ## Display Width of Characters and Strings | ||
| 368 | When displaying text with a fixed-width font on a terminal screen, it's very | ||
| 369 | important to know exactly how many columns or cells each character should take. | ||
| 370 | Most characters will use one column, but there are many, like emoji and East | ||
| 371 | Asian ideographs, that need more space. The `DisplayWidth` module provides | ||
| 372 | methods for this purpose. It also has methods that use the display width calculation | ||
| 373 | to `center`, `padLeft`, `padRight`, and `wrap` text. | ||
| 374 | |||
| 375 | In your `build.zig`: | ||
| 376 | |||
| 377 | ```zig | ||
| 378 | exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth")); | ||
| 379 | ``` | ||
| 380 | |||
| 381 | In your code: | ||
| 382 | |||
| 383 | ```zig | ||
| 384 | const DisplayWidth = @import("DisplayWidth"); | ||
| 385 | |||
| 386 | test "Display width" { | ||
| 387 | // We need Unicode data for display width calculation. | ||
| 388 | const dwd = try DisplayWidth.DisplayWidthData.init(allocator); | ||
| 389 | defer dwd.deinit(); | ||
| 390 | |||
| 391 | // The `DisplayWidth` structure takes a pointer to the data. | ||
| 392 | const dw = DisplayWidth{ .data = &dwd }; | ||
| 393 | |||
| 394 | // String display width | ||
| 395 | try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n")); | ||
| 396 | try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊")); | ||
| 397 | try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊")); | ||
| 398 | try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); | ||
| 399 | try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나")); | ||
| 400 | |||
| 401 | // Centering text | ||
| 402 | const centered = try dw.center(allocator, "w😊w", 10, "-"); | ||
| 403 | defer allocator.free(centered); | ||
| 404 | try expectEqualStrings("---w😊w---", centered); | ||
| 405 | |||
| 406 | // Pad left | ||
| 407 | const right_aligned = try dw.padLeft(allocator, "abc", 9, "*"); | ||
| 408 | defer allocator.free(right_aligned); | ||
| 409 | try expectEqualStrings("******abc", right_aligned); | ||
| 410 | |||
| 411 | // Pad right | ||
| 412 | const left_aligned = try dw.padRight(allocator, "abc", 9, "*"); | ||
| 413 | defer allocator.free(left_aligned); | ||
| 414 | try expectEqualStrings("abc******", left_aligned); | ||
| 415 | |||
| 416 | // Wrap text | ||
| 417 | const input = "The quick brown fox\r\njumped over the lazy dog!"; | ||
| 418 | const wrapped = try dw.wrap(allocator, input, 10, 3); | ||
| 419 | defer allocator.free(wrapped); | ||
| 420 | const want = | ||
| 421 | \\The quick | ||
| 422 | \\brown fox | ||
| 423 | \\jumped | ||
| 424 | \\over the | ||
| 425 | \\lazy dog! | ||
| 426 | ; | ||
| 427 | try expectEqualStrings(want, wrapped); | ||
| 428 | } | ||
| 429 | ``` | ||
| 430 | |||
| 431 | ## Scripts | ||
| 432 | Unicode categorizes code points by the Script to which they belong. A Script | ||
| 433 | collects letters and other symbols that belong to a particular writing system. | ||
| 434 | You can detect the Script for a code point with the `ScriptsData` module. | ||
| 435 | |||
| 436 | In your `build.zig`: | ||
| 437 | |||
| 438 | ```zig | ||
| 439 | exe.root_module.addImport("ScriptsData", zg.module("ScriptsData")); | ||
| 440 | ``` | ||
| 441 | |||
| 442 | In your code: | ||
| 443 | |||
| 444 | ```zig | ||
| 445 | const ScriptsData = @import("ScriptsData"); | ||
| 446 | |||
| 447 | test "Scripts" { | ||
| 448 | const sd = try ScriptsData.init(allocator); | ||
| 449 | defer sd.deinit(); | ||
| 450 | |||
| 451 | // To see the full list of Scripts, look at the | ||
| 452 | // `src/ScriptsData.zig` file. They are listed in an enum. | ||
| 453 | try expect(sd.script('A') == .Latin); | ||
| 454 | try expect(sd.script('Ω') == .Greek); | ||
| 455 | try expect(sd.script('צ') == .Hebrew); | ||
| 456 | } | ||
| 457 | ``` | ||
| 458 | |||
| 459 | ## Relation to Ziglyph | ||
| 460 | zg is a total re-write of some of the components of Ziglyph. The idea was to | ||
| 461 | reduce binary size and improve performance. These goals were achieved by using | ||
| 462 | trie-like data structures instead of generated functions. Where Ziglyph uses a | ||
| 463 | function call, zg uses an array lookup, which is considerably faster. In addition, all | ||
| 464 | these data structures in zg are loaded at runtime from compressed versions in the | ||
| 465 | binary. This allows for smaller binary sizes at the expense of increased memory | ||
| 466 | footprint at runtime. | ||
| 467 | |||
| 468 | Benchmarks demonstrate the above stated goals have been met: | ||
| 469 | |||
| 470 | ```plain | ||
| 471 | Binary sizes ======= | ||
| 472 | |||
| 473 | 149K ziglyph_case | ||
| 474 | 87K zg_case | ||
| 475 | |||
| 476 | 275K ziglyph_caseless | ||
| 477 | 168K zg_caseless | ||
| 478 | |||
| 479 | 68K ziglyph_codepoint | ||
| 480 | 68K zg_codepoint | ||
| 481 | |||
| 482 | 101K ziglyph_grapheme | ||
| 483 | 86K zg_grapheme | ||
| 484 | |||
| 485 | 185K ziglyph_normalizer | ||
| 486 | 152K zg_normalize | ||
| 487 | |||
| 488 | 101K ziglyph_width | ||
| 489 | 86K zg_width | ||
| 490 | |||
| 491 | Benchmarks ========== | ||
| 492 | |||
| 493 | Ziglyph toUpperStr/toLowerStr: result: 7911596, took: 80 | ||
| 494 | Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17 | ||
| 495 | zg toUpperStr/toLowerStr: result: 7911596, took: 62 | ||
| 496 | zg isUpperStr/isLowerStr: result: 110959, took: 7 | ||
| 497 | |||
| 498 | Ziglyph Normalizer.eqlCaseless: result: 625, took: 500 | ||
| 499 | zg CaseFold.canonCaselessMatch: result: 625, took: 385 | ||
| 500 | zg CaseFold.compatCaselessMatch: result: 625, took: 593 | ||
| 501 | |||
| 502 | Ziglyph CodePointIterator: result: 3769314, took: 2 | ||
| 503 | zg CodePointIterator: result: 3769314, took: 3 | ||
| 504 | |||
| 505 | Ziglyph GraphemeIterator: result: 3691806, took: 48 | ||
| 506 | zg GraphemeIterator: result: 3691806, took: 16 | ||
| 507 | |||
| 508 | Ziglyph Normalizer.nfkc: result: 3934162, took: 416 | ||
| 509 | zg Normalize.nfkc: result: 3934162, took: 182 | ||
| 510 | |||
| 511 | Ziglyph Normalizer.nfc: result: 3955798, took: 57 | ||
| 512 | zg Normalize.nfc: result: 3955798, took: 28 | ||
| 513 | |||
| 514 | Ziglyph Normalizer.nfkd: result: 4006398, took: 172 | ||
| 515 | zg Normalize.nfkd: result: 4006398, took: 104 | ||
| 516 | |||
| 517 | Ziglyph Normalizer.nfd: result: 4028034, took: 169 | ||
| 518 | zg Normalize.nfd: result: 4028034, took: 104 | ||
| 519 | |||
| 520 | Ziglyph Normalizer.eql: result: 625, took: 337 | ||
| 521 | Zg Normalize.eql: result: 625, took: 53 | ||
| 522 | |||
| 523 | Ziglyph display_width.strWidth: result: 3700914, took: 71 | ||
| 524 | zg DisplayWidth.strWidth: result: 3700914, took: 24 | ||
| 525 | ``` | ||
| 526 | |||
| 527 | These results were obtained on an M1 Mac with 16 GiB of RAM. | ||
| 528 | |||
| 529 | In contrast to Ziglyph, zg does not have: | ||
| 530 | |||
| 531 | - Word segmentation | ||
| 532 | - Sentence segmentation | ||
| 533 | - Collation | ||
| 534 | |||
| 535 | It's possible that any missing functionality will be added in future versions, | ||
| 536 | but only if enough demand is present in the community. | ||
| 537 | |||