# zg
zg provides Unicode text processing for Zig projects.

## Unicode Version
The Unicode version supported by zg is 15.1.0.

## Zig Version
The minimum Zig version required is 0.12.0-dev.3496+a2df84d0.

## Integrating zg into your Zig Project
You first need to add zg as a dependency in your `build.zig.zon` file:

```zig
.zg = .{
    .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz",
}
```
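
This entry lives inside the `.dependencies` field of `build.zig.zon`. As a rough
sketch of the surrounding file (the project name, version, and paths are
placeholders for your own project; Zig also requires a `.hash` field and reports
the expected value if you run `zig build` without it):

```zig
.{
    .name = "my-project",
    .version = "0.1.0",
    .dependencies = .{
        .zg = .{
            .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz",
            // Add the .hash value that `zig build` reports on the first run.
        },
    },
    .paths = .{""},
}
```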

Then instantiate the dependency in your `build.zig`:

```zig
const zg = b.dependency("zg", .{});
```

## A Modular Approach
zg is a modular library. This approach minimizes binary size and memory
requirements by including only the Unicode data required by the modules you use.
The following sections describe the various modules and their specific use cases.
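
For example, a project that only needs grapheme segmentation and display width
would wire up just those two modules in its `build.zig` (a sketch assuming the
`exe` and `zg` variables from the snippets above; the individual modules are
described in the sections below):

```zig
// Only the data tables for these two modules end up in the binary.
exe.root_module.addImport("grapheme", zg.module("grapheme"));
exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth"));
```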

## Code Points
In the `code_point` module, you'll find a data structure representing a single code
point, `CodePoint`, and an `Iterator` to iterate over the code points in a string.

In your `build.zig`:

```zig
exe.root_module.addImport("code_point", zg.module("code_point"));
```

In your code:

```zig
const std = @import("std");
const expect = std.testing.expect;

const code_point = @import("code_point");

test "Code point iterator" {
    const str = "Hi 😊";
    var iter = code_point.Iterator{ .bytes = str };
    var i: usize = 0;

    while (iter.next()) |cp| : (i += 1) {
        // The `code` field is the actual code point scalar as a `u21`.
        if (i == 0) try expect(cp.code == 'H');
        if (i == 1) try expect(cp.code == 'i');
        if (i == 2) try expect(cp.code == ' ');

        if (i == 3) {
            try expect(cp.code == '😊');

            // The `offset` field is the byte offset in the
            // source string.
            try expect(cp.offset == 3);

            // The `len` field is the length in bytes of the
            // code point in the source string.
            try expect(cp.len == 4);
        }
    }
}
```

## Grapheme Clusters
Many characters are composed of more than one code point. These are known as
grapheme clusters, and the `grapheme` module has a data structure to represent
them, `Grapheme`, and an `Iterator` to iterate over them in a string.

In your `build.zig`:

```zig
exe.root_module.addImport("grapheme", zg.module("grapheme"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;
const expectEqualStrings = std.testing.expectEqualStrings;

const grapheme = @import("grapheme");

test "Grapheme cluster iterator" {
    // We need some Unicode data to process grapheme clusters.
    const gd = try grapheme.GraphemeData.init(allocator);
    defer gd.deinit();

    const str = "He\u{301}"; // Hé
    var iter = grapheme.Iterator.init(str, &gd);

    var i: usize = 0;

    while (iter.next()) |gc| : (i += 1) {
        // The `len` field is the length in bytes of the
        // grapheme cluster in the source string.
        if (i == 0) try expect(gc.len == 1);

        if (i == 1) {
            try expect(gc.len == 3);

            // The `offset` field is the byte offset of the
            // grapheme cluster in the source string.
            try expect(gc.offset == 1);

            // The `bytes` method returns the slice of bytes
            // that comprise this grapheme cluster in the
            // source string `str`.
            try expectEqualStrings("e\u{301}", gc.bytes(str));
        }
    }
}
```

## Unicode General Categories
To detect the general category for a code point, use the `GenCatData` module.

In your `build.zig`:

```zig
exe.root_module.addImport("GenCatData", zg.module("GenCatData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const GenCatData = @import("GenCatData");

test "General Category" {
    const gcd = try GenCatData.init(allocator);
    defer gcd.deinit();

    // The `gc` method returns the abbreviated General Category.
    // These abbreviations and descriptive comments can be found
    // in the source file `src/GenCatData.zig` as an enum.
    try expect(gcd.gc('A') == .Lu); // Lu: uppercase letter
    try expect(gcd.gc('3') == .Nd); // Nd: decimal number

    // The following are convenience methods for groups of General
    // Categories. For example, all letter categories start with `L`:
    // Lu, Ll, Lt, Lm, Lo.
    try expect(gcd.isControl(0));
    try expect(gcd.isLetter('z'));
    try expect(gcd.isMark('\u{301}'));
    try expect(gcd.isNumber('3'));
    try expect(gcd.isPunctuation('['));
    try expect(gcd.isSeparator(' '));
    try expect(gcd.isSymbol('©'));
}
```

## Unicode Properties
You can detect common properties of a code point with the `PropsData` module.

In your `build.zig`:

```zig
exe.root_module.addImport("PropsData", zg.module("PropsData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const PropsData = @import("PropsData");

test "Properties" {
    const pd = try PropsData.init(allocator);
    defer pd.deinit();

    // Mathematical symbols and letters.
    try expect(pd.isMath('+'));
    // Alphabetic-only code points.
    try expect(pd.isAlphabetic('Z'));
    // Space, tab, and other separators.
    try expect(pd.isWhitespace(' '));
    // Hexadecimal digits and variations thereof.
    try expect(pd.isHexDigit('f'));
    try expect(!pd.isHexDigit('z'));

    // Accents, diaereses, and other combining marks.
    try expect(pd.isDiacritic('\u{301}'));

    // Unicode has a specification for valid identifiers like
    // the ones used in programming languages and regular expressions.
    try expect(pd.isIdStart('Z')); // Identifier start character
    try expect(!pd.isIdStart('1'));
    try expect(pd.isIdContinue('1'));

    // The `X` versions add some code points that can appear after
    // normalizing a string.
    try expect(pd.isXidStart('\u{b33}')); // Extended identifier start character
    try expect(pd.isXidContinue('\u{e33}'));
    try expect(!pd.isXidStart('1'));

    // Note the surprising Unicode numeric type properties!
    try expect(pd.isNumeric('\u{277f}'));
    try expect(!pd.isNumeric('3')); // 3 is not numeric!
    try expect(pd.isDigit('\u{2070}'));
    try expect(!pd.isDigit('3')); // 3 is not a digit!
    try expect(pd.isDecimal('3')); // 3 is a decimal digit
}
```

## Letter Case Detection and Conversion
To detect and convert to and from different letter cases, use the `CaseData`
module.

In your `build.zig`:

```zig
exe.root_module.addImport("CaseData", zg.module("CaseData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;
const expectEqualStrings = std.testing.expectEqualStrings;

const CaseData = @import("CaseData");

test "Case" {
    const cd = try CaseData.init(allocator);
    defer cd.deinit();

    // Upper and lower case.
    try expect(cd.isUpper('A'));
    try expect('A' == cd.toUpper('a'));
    try expect(cd.isLower('a'));
    try expect('a' == cd.toLower('A'));

    // Code points that have case.
    try expect(cd.isCased('É'));
    try expect(!cd.isCased('3'));

    // Case detection and conversion for strings.
    try expect(cd.isUpperStr("HELLO 123!"));
    const ucased = try cd.toUpperStr(allocator, "hello 123");
    defer allocator.free(ucased);
    try expectEqualStrings("HELLO 123", ucased);

    try expect(cd.isLowerStr("hello 123!"));
    const lcased = try cd.toLowerStr(allocator, "HELLO 123");
    defer allocator.free(lcased);
    try expectEqualStrings("hello 123", lcased);
}
```

## Normalization
Unicode normalization is the process of converting a string into a uniform
representation that guarantees a known structure by following a strict set
of rules. There are four normalization forms:

Canonical Composition (NFC)
: The most compact representation, obtained by first applying Canonical
Decomposition and then composing the result.

Compatibility Composition (NFKC)
: The most comprehensive composition, obtained by first applying Compatibility
Decomposition and then composing the result.

Canonical Decomposition (NFD)
: Only code points with canonical decompositions are decomposed. This is a more
compact and faster decomposition, but it does not provide the most comprehensive
normalization possible.

Compatibility Decomposition (NFKD)
: The most comprehensive decomposition, where both canonical and compatibility
decompositions are performed recursively.

For example, é composes to the single code point U+00E9 under NFC and decomposes
to `e` followed by the combining accent U+0301 under NFD, while the ligature ﬁ
(U+FB01) is left untouched by NFC and NFD but becomes the two code points `fi`
under the compatibility forms NFKC and NFKD.

zg has methods to produce all four normalization forms in the `Normalize` module.

In your `build.zig`:

```zig
exe.root_module.addImport("Normalize", zg.module("Normalize"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;
const expectEqualStrings = std.testing.expectEqualStrings;

const Normalize = @import("Normalize");

test "Normalization" {
    // We need lots of Unicode data for normalization.
    var norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();

    // The `Normalize` structure takes a pointer to the data.
    const n = Normalize{ .norm_data = &norm_data };

    // NFC: Canonical composition
    const nfc_result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer nfc_result.deinit();
    try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice);

    // NFKC: Compatibility composition
    const nfkc_result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer nfkc_result.deinit();
    try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice);

    // NFD: Canonical decomposition
    const nfd_result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer nfd_result.deinit();
    try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice);

    // NFKD: Compatibility decomposition
    const nfkd_result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer nfkd_result.deinit();
    try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice);

    // Test for equality of two strings after normalizing to NFC.
    try expect(try n.eql(allocator, "foé", "foe\u{0301}"));
    try expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}
```

## Caseless Matching via Case Folding
Unicode case folding provides an efficient way of comparing strings while
ignoring letter case differences. When you case fold a string, it's converted
into a normalized, caseless form suitable for matching. Use the `CaseFold`
module for this.

In your `build.zig`:

```zig
exe.root_module.addImport("Normalize", zg.module("Normalize"));
exe.root_module.addImport("CaseFold", zg.module("CaseFold"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const Normalize = @import("Normalize");
const CaseFold = @import("CaseFold");

test "Caseless matching" {
    // We need to normalize during the matching process.
    var norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();
    const n = Normalize{ .norm_data = &norm_data };

    // We need Unicode case fold data.
    const cfd = try CaseFold.FoldData.init(allocator);
    defer cfd.deinit();

    // The `CaseFold` structure takes a pointer to the data.
    const cf = CaseFold{ .fold_data = &cfd };

    // `compatCaselessMatch` provides the deepest level of caseless
    // matching because it decomposes fully to NFKD.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try expect(try cf.compatCaselessMatch(allocator, &n, a, b));

    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try expect(try cf.compatCaselessMatch(allocator, &n, a, c));

    // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch`
    // because it only decomposes to NFD. Naturally, it's faster because of this.
    try expect(!try cf.canonCaselessMatch(allocator, &n, a, b));
    try expect(try cf.canonCaselessMatch(allocator, &n, a, c));
}
```

## Display Width of Characters and Strings
When displaying text with a fixed-width font on a terminal screen, it's very
important to know exactly how many columns or cells each character should take.
Most characters use one column, but many, like emoji and East Asian ideographs,
need more space. The `DisplayWidth` module provides methods for this purpose.
It also has methods that use the display width calculation to `center`,
`padLeft`, `padRight`, and `wrap` text.

In your `build.zig`:

```zig
exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expectEqual = std.testing.expectEqual;
const expectEqualStrings = std.testing.expectEqualStrings;

const DisplayWidth = @import("DisplayWidth");

test "Display width" {
    // We need Unicode data for display width calculation.
    const dwd = try DisplayWidth.DisplayWidthData.init(allocator);
    defer dwd.deinit();

    // The `DisplayWidth` structure takes a pointer to the data.
    const dw = DisplayWidth{ .data = &dwd };

    // String display width
    try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n"));
    try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊"));
    try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊"));
    try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
    try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나"));

    // Centering text
    const centered = try dw.center(allocator, "w😊w", 10, "-");
    defer allocator.free(centered);
    try expectEqualStrings("---w😊w---", centered);

    // Pad left
    const right_aligned = try dw.padLeft(allocator, "abc", 9, "*");
    defer allocator.free(right_aligned);
    try expectEqualStrings("******abc", right_aligned);

    // Pad right
    const left_aligned = try dw.padRight(allocator, "abc", 9, "*");
    defer allocator.free(left_aligned);
    try expectEqualStrings("abc******", left_aligned);

    // Wrap text
    const input = "The quick brown fox\r\njumped over the lazy dog!";
    const wrapped = try dw.wrap(allocator, input, 10, 3);
    defer allocator.free(wrapped);
    const want =
        \\The quick
        \\brown fox
        \\jumped
        \\over the
        \\lazy dog!
    ;
    try expectEqualStrings(want, wrapped);
}
```

## Scripts
Unicode categorizes code points by the Script to which they belong. A Script
collects the letters and other symbols that belong to a particular writing system.
You can detect the Script for a code point with the `ScriptsData` module.

In your `build.zig`:

```zig
exe.root_module.addImport("ScriptsData", zg.module("ScriptsData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const ScriptsData = @import("ScriptsData");

test "Scripts" {
    const sd = try ScriptsData.init(allocator);
    defer sd.deinit();

    // To see the full list of Scripts, look at the
    // `src/ScriptsData.zig` file. They are listed in an enum.
    try expect(sd.script('A') == .Latin);
    try expect(sd.script('Ω') == .Greek);
    try expect(sd.script('צ') == .Hebrew);
}
```

## Relation to Ziglyph
zg is a complete rewrite of some of the components of Ziglyph. The idea was to
reduce binary size and improve performance. These goals were achieved by using
trie-like data structures instead of generated functions. Where Ziglyph uses a
function call, zg uses an array lookup, which is considerably faster. In addition,
all of these data structures in zg are loaded at runtime from compressed versions
in the binary. This allows for smaller binary sizes at the expense of an increased
memory footprint at runtime.
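
To illustrate the difference, here is a rough conceptual sketch (not the actual
implementation of either library) contrasting a generated-function lookup with a
two-stage table lookup of the kind described above:

```zig
// Generated-function style: each query walks a large switch emitted by a code
// generator, so the code point ranges live in executable code.
fn isLetterGenerated(cp: u21) bool {
    return switch (cp) {
        'A'...'Z', 'a'...'z' => true, // ...generated code continues for many more ranges
        else => false,
    };
}

// Table style: each query is two array indexes into precomputed stages, so the
// ranges live in data that can be shipped compressed and expanded at runtime.
fn isLetterTable(stage1: []const u16, stage2: []const bool, cp: u21) bool {
    const block = stage1[cp >> 8]; // which 256-code-point block
    return stage2[@as(usize, block) * 256 + (cp & 0xff)];
}
```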

Benchmarks demonstrate that these goals have been met:

```plain
Binary sizes =======

149K ziglyph_case
87K zg_case

275K ziglyph_caseless
168K zg_caseless

68K ziglyph_codepoint
68K zg_codepoint

101K ziglyph_grapheme
86K zg_grapheme

185K ziglyph_normalizer
152K zg_normalize

101K ziglyph_width
86K zg_width

Benchmarks ==========

Ziglyph toUpperStr/toLowerStr: result: 7911596, took: 80
Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17
zg toUpperStr/toLowerStr: result: 7911596, took: 62
zg isUpperStr/isLowerStr: result: 110959, took: 7

Ziglyph Normalizer.eqlCaseless: result: 625, took: 500
zg CaseFold.canonCaselessMatch: result: 625, took: 385
zg CaseFold.compatCaselessMatch: result: 625, took: 593

Ziglyph CodePointIterator: result: 3769314, took: 2
zg CodePointIterator: result: 3769314, took: 3

Ziglyph GraphemeIterator: result: 3691806, took: 48
zg GraphemeIterator: result: 3691806, took: 16

Ziglyph Normalizer.nfkc: result: 3934162, took: 416
zg Normalize.nfkc: result: 3934162, took: 182

Ziglyph Normalizer.nfc: result: 3955798, took: 57
zg Normalize.nfc: result: 3955798, took: 28

Ziglyph Normalizer.nfkd: result: 4006398, took: 172
zg Normalize.nfkd: result: 4006398, took: 104

Ziglyph Normalizer.nfd: result: 4028034, took: 169
zg Normalize.nfd: result: 4028034, took: 104

Ziglyph Normalizer.eql: result: 625, took: 337
zg Normalize.eql: result: 625, took: 53

Ziglyph display_width.strWidth: result: 3700914, took: 71
zg DisplayWidth.strWidth: result: 3700914, took: 24
```

These results were obtained on an M1 Mac with 16 GiB of RAM.

In contrast to Ziglyph, zg does not have:

- Word segmentation
- Sentence segmentation
- Collation

It's possible that this missing functionality will be added in future versions,
but only if there is enough demand from the community.