# zg
zg provides Unicode text processing for Zig projects.

## Unicode Version
The Unicode version supported by zg is 15.1.0.

## Zig Version
The minimum Zig version required is 0.12.0-dev.3496+a2df84d0.

## Integrating zg into your Zig Project
You first need to add zg as a dependency in your `build.zig.zon` file:

```zig
.zg = .{
    .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz",
}
```
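
This entry lives inside the `.dependencies` field of `build.zig.zon`. As a rough
sketch of the surrounding file (the project name, version, and paths are
placeholders for your own project; Zig also requires a `.hash` field and reports
the expected value if you run `zig build` without it):

```zig
.{
    .name = "my-project",
    .version = "0.1.0",
    .dependencies = .{
        .zg = .{
            .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz",
            // Add the .hash value that `zig build` reports on the first run.
        },
    },
    .paths = .{""},
}
```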

Then instantiate the dependency in your `build.zig`:

```zig
const zg = b.dependency("zg", .{});
```

## A Modular Approach
zg is a modular library. This approach minimizes binary size and memory
requirements by including only the Unicode data required by the modules you use.
The following sections describe the various modules and their specific use cases.
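
For example, a project that only needs grapheme segmentation and display width
would wire up just those two modules in its `build.zig` (a sketch assuming the
`exe` and `zg` variables from the snippets above; the individual modules are
described in the sections below):

```zig
// Only the data tables for these two modules end up in the binary.
exe.root_module.addImport("grapheme", zg.module("grapheme"));
exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth"));
```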

## Code Points
In the `code_point` module, you'll find a data structure representing a single code
point, `CodePoint`, and an `Iterator` to iterate over the code points in a string.

In your `build.zig`:

```zig
exe.root_module.addImport("code_point", zg.module("code_point"));
```

In your code:

```zig
const std = @import("std");
const expect = std.testing.expect;

const code_point = @import("code_point");

test "Code point iterator" {
    const str = "Hi 😊";
    var iter = code_point.Iterator{ .bytes = str };
    var i: usize = 0;

    while (iter.next()) |cp| : (i += 1) {
        // The `code` field is the actual code point scalar as a `u21`.
        if (i == 0) try expect(cp.code == 'H');
        if (i == 1) try expect(cp.code == 'i');
        if (i == 2) try expect(cp.code == ' ');

        if (i == 3) {
            try expect(cp.code == '😊');

            // The `offset` field is the byte offset in the
            // source string.
            try expect(cp.offset == 3);

            // The `len` field is the length in bytes of the
            // code point in the source string.
            try expect(cp.len == 4);
        }
    }
}
```

## Grapheme Clusters
Many characters are composed of more than one code point. These are known as
grapheme clusters, and the `grapheme` module has a data structure to represent
them, `Grapheme`, and an `Iterator` to iterate over them in a string.

In your `build.zig`:

```zig
exe.root_module.addImport("grapheme", zg.module("grapheme"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;
const expectEqualStrings = std.testing.expectEqualStrings;

const grapheme = @import("grapheme");

test "Grapheme cluster iterator" {
    // We need some Unicode data to process grapheme clusters.
    const gd = try grapheme.GraphemeData.init(allocator);
    defer gd.deinit();

    const str = "He\u{301}"; // Hé
    var iter = grapheme.Iterator.init(str, &gd);

    var i: usize = 0;

    while (iter.next()) |gc| : (i += 1) {
        // The `len` field is the length in bytes of the
        // grapheme cluster in the source string.
        if (i == 0) try expect(gc.len == 1);

        if (i == 1) {
            try expect(gc.len == 3);

            // The `offset` field is the byte offset of the
            // grapheme cluster in the source string.
            try expect(gc.offset == 1);

            // The `bytes` method returns the slice of bytes
            // that comprise this grapheme cluster in the
            // source string `str`.
            try expectEqualStrings("e\u{301}", gc.bytes(str));
        }
    }
}
```

## Unicode General Categories
To detect the general category for a code point, use the `GenCatData` module.

In your `build.zig`:

```zig
exe.root_module.addImport("GenCatData", zg.module("GenCatData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const GenCatData = @import("GenCatData");

test "General Category" {
    const gcd = try GenCatData.init(allocator);
    defer gcd.deinit();

    // The `gc` method returns the abbreviated General Category.
    // These abbreviations and descriptive comments can be found
    // in the source file `src/GenCatData.zig` as an enum.
    try expect(gcd.gc('A') == .Lu); // Lu: uppercase letter
    try expect(gcd.gc('3') == .Nd); // Nd: decimal number

    // The following are convenience methods for groups of General
    // Categories. For example, all letter categories start with `L`:
    // Lu, Ll, Lt, Lm, Lo.
    try expect(gcd.isControl(0));
    try expect(gcd.isLetter('z'));
    try expect(gcd.isMark('\u{301}'));
    try expect(gcd.isNumber('3'));
    try expect(gcd.isPunctuation('['));
    try expect(gcd.isSeparator(' '));
    try expect(gcd.isSymbol('©'));
}
```

## Unicode Properties
You can detect common properties of a code point with the `PropsData` module.

In your `build.zig`:

```zig
exe.root_module.addImport("PropsData", zg.module("PropsData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const PropsData = @import("PropsData");

test "Properties" {
    const pd = try PropsData.init(allocator);
    defer pd.deinit();

    // Mathematical symbols and letters.
    try expect(pd.isMath('+'));
    // Alphabetic-only code points.
    try expect(pd.isAlphabetic('Z'));
    // Space, tab, and other separators.
    try expect(pd.isWhitespace(' '));
    // Hexadecimal digits and variations thereof.
    try expect(pd.isHexDigit('f'));
    try expect(!pd.isHexDigit('z'));

    // Accents, diaereses, and other combining marks.
    try expect(pd.isDiacritic('\u{301}'));

    // Unicode has a specification for valid identifiers like
    // the ones used in programming languages and regular expressions.
    try expect(pd.isIdStart('Z')); // Identifier start character
    try expect(!pd.isIdStart('1'));
    try expect(pd.isIdContinue('1'));

    // The `X` versions add some code points that can appear after
    // normalizing a string.
    try expect(pd.isXidStart('\u{b33}')); // Extended identifier start character
    try expect(pd.isXidContinue('\u{e33}'));
    try expect(!pd.isXidStart('1'));

    // Note the surprising Unicode numeric type properties!
    try expect(pd.isNumeric('\u{277f}'));
    try expect(!pd.isNumeric('3')); // 3 is not numeric!
    try expect(pd.isDigit('\u{2070}'));
    try expect(!pd.isDigit('3')); // 3 is not a digit!
    try expect(pd.isDecimal('3')); // 3 is a decimal digit
}
```

## Letter Case Detection and Conversion
To detect and convert to and from different letter cases, use the `CaseData`
module.

In your `build.zig`:

```zig
exe.root_module.addImport("CaseData", zg.module("CaseData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;
const expectEqualStrings = std.testing.expectEqualStrings;

const CaseData = @import("CaseData");

test "Case" {
    const cd = try CaseData.init(allocator);
    defer cd.deinit();

    // Upper and lower case.
    try expect(cd.isUpper('A'));
    try expect('A' == cd.toUpper('a'));
    try expect(cd.isLower('a'));
    try expect('a' == cd.toLower('A'));

    // Code points that have case.
    try expect(cd.isCased('É'));
    try expect(!cd.isCased('3'));

    // Case detection and conversion for strings.
    try expect(cd.isUpperStr("HELLO 123!"));
    const ucased = try cd.toUpperStr(allocator, "hello 123");
    defer allocator.free(ucased);
    try expectEqualStrings("HELLO 123", ucased);

    try expect(cd.isLowerStr("hello 123!"));
    const lcased = try cd.toLowerStr(allocator, "HELLO 123");
    defer allocator.free(lcased);
    try expectEqualStrings("hello 123", lcased);
}
```

## Normalization
Unicode normalization is the process of converting a string into a uniform
representation that guarantees a known structure by following a strict set
of rules. There are four normalization forms:

Canonical Composition (NFC)
: The most compact representation, obtained by first applying Canonical
Decomposition and then composing the result.

Compatibility Composition (NFKC)
: The most comprehensive composition, obtained by first applying Compatibility
Decomposition and then composing the result.

Canonical Decomposition (NFD)
: Only code points with canonical decompositions are decomposed. This is a more
compact and faster decomposition, but it does not provide the most comprehensive
normalization possible.

Compatibility Decomposition (NFKD)
: The most comprehensive decomposition, where both canonical and compatibility
decompositions are performed recursively.

For example, é composes to the single code point U+00E9 under NFC and decomposes
to `e` followed by the combining accent U+0301 under NFD, while the ligature ﬁ
(U+FB01) is left untouched by NFC and NFD but becomes the two code points `fi`
under the compatibility forms NFKC and NFKD.

zg has methods to produce all four normalization forms in the `Normalize` module.

In your `build.zig`:

```zig
exe.root_module.addImport("Normalize", zg.module("Normalize"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;
const expectEqualStrings = std.testing.expectEqualStrings;

const Normalize = @import("Normalize");

test "Normalization" {
    // We need lots of Unicode data for normalization.
    var norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();

    // The `Normalize` structure takes a pointer to the data.
    const n = Normalize{ .norm_data = &norm_data };

    // NFC: Canonical composition
    const nfc_result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer nfc_result.deinit();
    try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice);

    // NFKC: Compatibility composition
    const nfkc_result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer nfkc_result.deinit();
    try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice);

    // NFD: Canonical decomposition
    const nfd_result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer nfd_result.deinit();
    try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice);

    // NFKD: Compatibility decomposition
    const nfkd_result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer nfkd_result.deinit();
    try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice);

    // Test for equality of two strings after normalizing to NFC.
    try expect(try n.eql(allocator, "foé", "foe\u{0301}"));
    try expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}
```

## Caseless Matching via Case Folding
Unicode case folding provides an efficient way of comparing strings while
ignoring letter case differences. When you case fold a string, it's converted
into a normalized, caseless form suitable for matching. Use the `CaseFold`
module for this.

In your `build.zig`:

```zig
exe.root_module.addImport("Normalize", zg.module("Normalize"));
exe.root_module.addImport("CaseFold", zg.module("CaseFold"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const Normalize = @import("Normalize");
const CaseFold = @import("CaseFold");

test "Caseless matching" {
    // We need to normalize during the matching process.
    var norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();
    const n = Normalize{ .norm_data = &norm_data };

    // We need Unicode case fold data.
    const cfd = try CaseFold.FoldData.init(allocator);
    defer cfd.deinit();

    // The `CaseFold` structure takes a pointer to the data.
    const cf = CaseFold{ .fold_data = &cfd };

    // `compatCaselessMatch` provides the deepest level of caseless
    // matching because it decomposes fully to NFKD.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try expect(try cf.compatCaselessMatch(allocator, &n, a, b));

    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try expect(try cf.compatCaselessMatch(allocator, &n, a, c));

    // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch`
    // because it only decomposes to NFD. Naturally, it's faster because of this.
    try expect(!try cf.canonCaselessMatch(allocator, &n, a, b));
    try expect(try cf.canonCaselessMatch(allocator, &n, a, c));
}
```

## Display Width of Characters and Strings
When displaying text with a fixed-width font on a terminal screen, it's very
important to know exactly how many columns or cells each character should take.
Most characters use one column, but many, like emoji and East Asian ideographs,
need more space. The `DisplayWidth` module provides methods for this purpose.
It also has methods that use the display width calculation to `center`,
`padLeft`, `padRight`, and `wrap` text.

In your `build.zig`:

```zig
exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expectEqual = std.testing.expectEqual;
const expectEqualStrings = std.testing.expectEqualStrings;

const DisplayWidth = @import("DisplayWidth");

test "Display width" {
    // We need Unicode data for display width calculation.
    const dwd = try DisplayWidth.DisplayWidthData.init(allocator);
    defer dwd.deinit();

    // The `DisplayWidth` structure takes a pointer to the data.
    const dw = DisplayWidth{ .data = &dwd };

    // String display width
    try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n"));
    try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊"));
    try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊"));
    try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
    try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나"));

    // Centering text
    const centered = try dw.center(allocator, "w😊w", 10, "-");
    defer allocator.free(centered);
    try expectEqualStrings("---w😊w---", centered);

    // Pad left
    const right_aligned = try dw.padLeft(allocator, "abc", 9, "*");
    defer allocator.free(right_aligned);
    try expectEqualStrings("******abc", right_aligned);

    // Pad right
    const left_aligned = try dw.padRight(allocator, "abc", 9, "*");
    defer allocator.free(left_aligned);
    try expectEqualStrings("abc******", left_aligned);

    // Wrap text
    const input = "The quick brown fox\r\njumped over the lazy dog!";
    const wrapped = try dw.wrap(allocator, input, 10, 3);
    defer allocator.free(wrapped);
    const want =
        \\The quick
        \\brown fox
        \\jumped
        \\over the
        \\lazy dog!
    ;
    try expectEqualStrings(want, wrapped);
}
```

## Scripts
Unicode categorizes code points by the Script to which they belong. A Script
collects the letters and other symbols that belong to a particular writing system.
You can detect the Script for a code point with the `ScriptsData` module.

In your `build.zig`:

```zig
exe.root_module.addImport("ScriptsData", zg.module("ScriptsData"));
```

In your code:

```zig
const std = @import("std");
const allocator = std.testing.allocator;
const expect = std.testing.expect;

const ScriptsData = @import("ScriptsData");

test "Scripts" {
    const sd = try ScriptsData.init(allocator);
    defer sd.deinit();

    // To see the full list of Scripts, look at the
    // `src/ScriptsData.zig` file. They are listed in an enum.
    try expect(sd.script('A') == .Latin);
    try expect(sd.script('Ω') == .Greek);
    try expect(sd.script('צ') == .Hebrew);
}
```

## Relation to Ziglyph
zg is a complete rewrite of some of the components of Ziglyph. The idea was to
reduce binary size and improve performance. These goals were achieved by using
trie-like data structures instead of generated functions. Where Ziglyph uses a
function call, zg uses an array lookup, which is considerably faster. In addition,
all of these data structures in zg are loaded at runtime from compressed versions
in the binary. This allows for smaller binary sizes at the expense of an increased
memory footprint at runtime.
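
To illustrate the difference, here is a rough conceptual sketch (not the actual
implementation of either library) contrasting a generated-function lookup with a
two-stage table lookup of the kind described above:

```zig
// Generated-function style: each query walks a large switch emitted by a code
// generator, so the code point ranges live in executable code.
fn isLetterGenerated(cp: u21) bool {
    return switch (cp) {
        'A'...'Z', 'a'...'z' => true, // ...generated code continues for many more ranges
        else => false,
    };
}

// Table style: each query is two array indexes into precomputed stages, so the
// ranges live in data that can be shipped compressed and expanded at runtime.
fn isLetterTable(stage1: []const u16, stage2: []const bool, cp: u21) bool {
    const block = stage1[cp >> 8]; // which 256-code-point block
    return stage2[@as(usize, block) * 256 + (cp & 0xff)];
}
```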

Benchmarks demonstrate that these goals have been met:

```plain
Binary sizes =======

149K ziglyph_case
87K zg_case

275K ziglyph_caseless
168K zg_caseless

68K ziglyph_codepoint
68K zg_codepoint

101K ziglyph_grapheme
86K zg_grapheme

185K ziglyph_normalizer
152K zg_normalize

101K ziglyph_width
86K zg_width

Benchmarks ==========

Ziglyph toUpperStr/toLowerStr: result: 7911596, took: 80
Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17
zg toUpperStr/toLowerStr: result: 7911596, took: 62
zg isUpperStr/isLowerStr: result: 110959, took: 7

Ziglyph Normalizer.eqlCaseless: result: 625, took: 500
zg CaseFold.canonCaselessMatch: result: 625, took: 385
zg CaseFold.compatCaselessMatch: result: 625, took: 593

Ziglyph CodePointIterator: result: 3769314, took: 2
zg CodePointIterator: result: 3769314, took: 3

Ziglyph GraphemeIterator: result: 3691806, took: 48
zg GraphemeIterator: result: 3691806, took: 16

Ziglyph Normalizer.nfkc: result: 3934162, took: 416
zg Normalize.nfkc: result: 3934162, took: 182

Ziglyph Normalizer.nfc: result: 3955798, took: 57
zg Normalize.nfc: result: 3955798, took: 28

Ziglyph Normalizer.nfkd: result: 4006398, took: 172
zg Normalize.nfkd: result: 4006398, took: 104

Ziglyph Normalizer.nfd: result: 4028034, took: 169
zg Normalize.nfd: result: 4028034, took: 104

Ziglyph Normalizer.eql: result: 625, took: 337
zg Normalize.eql: result: 625, took: 53

Ziglyph display_width.strWidth: result: 3700914, took: 71
zg DisplayWidth.strWidth: result: 3700914, took: 24
```

These results were obtained on an M1 Mac with 16 GiB of RAM.

In contrast to Ziglyph, zg does not have:

- Word segmentation
- Sentence segmentation
- Collation

It's possible that this missing functionality will be added in future versions,
but only if there is enough demand from the community.