summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md537
-rw-r--r--build.zig2
-rw-r--r--codegen/canon.zig5
-rw-r--r--codegen/case_prop.zig5
-rw-r--r--codegen/ccc.zig5
-rw-r--r--codegen/compat.zig5
-rw-r--r--codegen/core_props.zig5
-rw-r--r--codegen/dwp.zig5
-rw-r--r--codegen/fold.zig5
-rw-r--r--codegen/gbp.zig5
-rw-r--r--codegen/gencat.zig5
-rw-r--r--codegen/hangul.zig5
-rw-r--r--codegen/lower.zig5
-rw-r--r--codegen/normp.zig5
-rw-r--r--codegen/numeric.zig5
-rw-r--r--codegen/props.zig5
-rw-r--r--codegen/scripts.zig5
-rw-r--r--codegen/upper.zig5
-rw-r--r--src/CanonData.zig5
-rw-r--r--src/CaseData.zig11
-rw-r--r--src/CaseFold.zig8
-rw-r--r--src/CombiningData.zig5
-rw-r--r--src/CompatData.zig5
-rw-r--r--src/FoldData.zig5
-rw-r--r--src/GenCatData.zig5
-rw-r--r--src/GraphemeData.zig5
-rw-r--r--src/HangulData.zig5
-rw-r--r--src/NormPropsData.zig5
-rw-r--r--src/Normalize.zig41
-rw-r--r--src/PropsData.zig11
-rw-r--r--src/ScriptsData.zig7
-rw-r--r--src/WidthData.zig5
32 files changed, 606 insertions, 136 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d4fc8f6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,537 @@
1# zg
2zg provides Unicode text processing for Zig projects.
3
4## Unicode Version
5The Unicode version supported by zg is 15.1.0.
6
7## Zig Version
8The minimum Zig version required is 0.12.0-dev.3496+a2df84d0.
9
10## Integrating zg into your Zig Project
11You first need to add zg as a dependency in your `build.zig.zon` file:
12
13```zig
14.zg = .{
15 .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz",
16}
17```
18
19Then instantiate the dependency in your `build.zig`:
20
21
22```zig
23const zg = b.dependency("zg", .{});
24```
25
26## A Modular Approach
27zg is a modular library. This approach minimizes binary file size and memory
28requirements by only including the Unicode data required for the specified module.
29The following sections describe the various modules and their specific use case.
30
31## Code Points
32In the `code_point` module, you'll find a data structure representing a single code
33point, `CodePoint`, and an `Iterator` to iterate over the code points in a string.
34
35In your `build.zig`:
36
37```zig
38exe.root_module.addImport("code_point", zg.module("code_point"));
39```
40
41In your code:
42
43```zig
44const code_point = @import("code_point");
45
46test "Code point iterator" {
47 const str = "Hi 😊";
48 var iter = code_point.Iterator{ .bytes = str };
49 var i: usize = 0;
50
51 while (iter.next()) |cp| : (i += 1) {
52 // The `code` field is the actual code point scalar as a `u21`.
53 if (i == 0) try expect(cp.code == 'H');
54 if (i == 1) try expect(cp.code == 'i');
55 if (i == 2) try expect(cp.code == ' ');
56
57 if (i == 3) {
58 try expect(cp.code == '😊');
59
60 // The `offset` field is the byte offset in the
61 // source string.
62 try expect(cp.offset == 3);
63
64 // The `len` field is the length in bytes of the
65 // code point in the source string.
66 try expect(cp.len == 4);
67 }
68 }
69}
70```
71
72## Grapheme Clusters
73Many characters are composed from more than one code point. These are known as
74Grapheme Clusters and the `grapheme` module has a data structure to represent
75them, `Grapheme`, and an `Iterator` to iterate over them in a string.
76
77In your `build.zig`:
78
79```zig
80exe.root_module.addImport("grapheme", zg.module("grapheme"));
81```
82
83In your code:
84
85```zig
86const grapheme = @import("grapheme");
87
88test "Grapheme cluster iterator" {
89 // We need some Unicode data to process Grapheme Clusters.
90 const gd = try grapheme.GraphemeData.init(allocator);
91 defer gd.deinit();
92
93 const str = "He\u{301}"; // Hé
94 var iter = grapheme.Iterator.init(str, &gd);
95
96 var i: usize = 0;
97
98 while (iter.next()) |gc| : (i += 1) {
99 // The `len` field is the length in bytes of the
100 // grapheme cluster in the source string.
101 if (i == 0) try expect(gc.len == 1);
102
103 if (i == 1) {
104 try expect(gc.len == 3);
105
106 // The `offset` in bytes of the grapheme cluster
107 // in the source string.
108 try expect(gc.offset == 1);
109
110 // The `bytes` method returns the slice of bytes
111 // that comprise this grapheme cluster in the
112 // source string `str`.
113 try expectEqualStrings("e\u{301}", gc.bytes(str));
114 }
115 }
116}
117```
118
119## Unicode General Categories
120To detect the general category for a code point, use the `GenCatData` module.
121
122In your `build.zig`:
123
124```zig
125exe.root_module.addImport("GenCatData", zg.module("GenCatData"));
126```
127
128In your code:
129
130```zig
131const GenCatData = @import("GenCatData");
132
133test "General Category" {
134 const gcd = try GenCatData.init(allocator);
135 defer gcd.deinit();
136
137 // The `gc` method returns the abbreviated General Category.
138 // These abbreviations and descriptive comments can be found
139 // in the source file `src/GenCatData.zig` as an enum.
140 try expect(gcd.gc('A') == .Lu); // Lu: uppercase letter
141 try expect(gcd.gc('3') == .Nd); // Nd: decimal number
142
143 // The following are convenience methods for groups of General
144 // Categories. For example, all letter categories start with `L`:
145 // Lu, Ll, Lt, Lo.
146 try expect(gcd.isControl(0));
147 try expect(gcd.isLetter('z'));
148 try expect(gcd.isMark('\u{301}'));
149 try expect(gcd.isNumber('3'));
150 try expect(gcd.isPunctuation('['));
151 try expect(gcd.isSeparator(' '));
152 try expect(gcd.isSymbol('©'));
153}
154```
155
156## Unicode Properties
157You can detect common properties of a code point with the `PropsData` module.
158
159In your `build.zig`:
160
161```zig
162exe.root_module.addImport("PropsData", zg.module("PropsData"));
163```
164
165In your code:
166
167```zig
168const PropsData = @import("PropsData");
169
170test "Properties" {
171 const pd = try PropsData.init(allocator);
172 defer pd.deinit();
173
174 // Mathematical symbols and letters.
175 try expect(pd.isMath('+'));
176 // Alphabetic only code points.
177 try expect(pd.isAlphabetic('Z'));
178 // Space, tab, and other separators.
179 try expect(pd.isWhitespace(' '));
180 // Hexadecimal digits and variations thereof.
181 try expect(pd.isHexDigit('f'));
182 try expect(!pd.isHexDigit('z'));
183
184 // Accents, dieresis, and other combining marks.
185 try expect(pd.isDiacritic('\u{301}'));
186
187 // Unicode has a specification for valid identifiers like
188 // the ones used in programming and regular expressions.
189 try expect(pd.isIdStart('Z')); // Identifier start character
190 try expect(!pd.isIdStart('1'));
191 try expect(pd.isIdContinue('1'));
192
193 // The `X` versions add some code points that can appear after
194 // normalizing a string.
195 try expect(pd.isXidStart('\u{b33}')); // Extended identifier start character
196 try expect(pd.isXidContinue('\u{e33}'));
197 try expect(!pd.isXidStart('1'));
198
199 // Note surprising Unicode numeric type properties!
200 try expect(pd.isNumeric('\u{277f}'));
201 try expect(!pd.isNumeric('3')); // 3 is not numeric!
202 try expect(pd.isDigit('\u{2070}'));
203 try expect(!pd.isDigit('3')); // 3 is not a digit!
204 try expect(pd.isDecimal('3')); // 3 is a decimal digit
205}
206```
207
208## Letter Case Detection and Conversion
209To detect and convert to and from different letter cases, use the `CaseData`
210module.
211
212In your `build.zig`:
213
214```zig
215exe.root_module.addImport("CaseData", zg.module("CaseData"));
216```
217
218In your code:
219
220```zig
221const CaseData = @import("CaseData");
222
223test "Case" {
224 const cd = try CaseData.init(allocator);
225 defer cd.deinit();
226
227 // Upper and lower case.
228 try expect(cd.isUpper('A'));
229 try expect('A' == cd.toUpper('a'));
230 try expect(cd.isLower('a'));
231 try expect('a' == cd.toLower('A'));
232
233 // Code points that have case.
234 try expect(cd.isCased('É'));
235 try expect(!cd.isCased('3'));
236
237 // Case detection and conversion for strings.
238 try expect(cd.isUpperStr("HELLO 123!"));
239 const ucased = try cd.toUpperStr(allocator, "hello 123");
240 defer allocator.free(ucased);
241 try expectEqualStrings("HELLO 123", ucased);
242
243 try expect(cd.isLowerStr("hello 123!"));
244 const lcased = try cd.toLowerStr(allocator, "HELLO 123");
245 defer allocator.free(lcased);
246 try expectEqualStrings("hello 123", lcased);
247}
248```
249
250## Normalization
251Unicode normalization is the process of converting a string into a uniform
252representation that can guarantee a known structure by following a strict set
253of rules. There are four normalization forms:
254
255Canonical Composition (NFC)
256: The most compact representation obtained by first
257decomposing to Canonical Decomposition and then composing to NFC.
258
259Compatibility Composition (NFKC)
260: The most comprehensive composition obtained
261by first decomposing to Compatibility Decomposition and then composing to NFKC.
262
263Canonical Decomposition (NFD)
264: Only code points with canonical decompositions
265are decomposed. This is a more compact and faster decomposition but will not
266provide the most comprehensive normalization possible.
267
268Compatibility Decomposition (NFKD)
269: The most comprehensive decomposition method
270where both canonical and compatibility decompositions are performed recursively.
271
272zg has methods to produce all four normalization forms in the `Normalize` module.
273
274In your `build.zig`:
275
276```zig
277exe.root_module.addImport("Normalize", zg.module("Normalize"));
278```
279
280In your code:
281
282```zig
283const Normalize = @import("Normalize");
284
285test "Normalization" {
286 // We need lots of Unicode data for normalization.
287 var norm_data = try Normalize.NormData.init(allocator);
288 defer norm_data.deinit();
289
290 // The `Normalize` structure takes a pointer to the data.
291 const n = Normalize{ .norm_data = &norm_data };
292
293 // NFC: Canonical composition
294 const nfc_result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
295 defer nfc_result.deinit();
296 try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice);
297
298 // NFKC: Compatibility composition
299 const nfkc_result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
300 defer nfkc_result.deinit();
301 try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice);
302
303 // NFD: Canonical decomposition
304 const nfd_result = try n.nfd(allocator, "Héllo World! \u{3d3}");
305 defer nfd_result.deinit();
306 try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice);
307
308 // NFKD: Compatibility decomposition
309 const nfkd_result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
310 defer nfkd_result.deinit();
311 try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice);
312
313 // Test for equality of two strings after normalizing to NFC.
314 try expect(try n.eql(allocator, "foé", "foe\u{0301}"));
315 try expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
316}
317```
318
319## Caseless Matching via Case Folding
320Unicode provides a more efficient way of comparing strings while ignoring letter
321case differences: case folding. When you case fold a string, it's converted into a
322normalized case form suitable for efficient matching. Use the `CaseFold` module
323for this.
324
325In your `build.zig`:
326
327```zig
328exe.root_module.addImport("Normalize", zg.module("Normalize"));
329exe.root_module.addImport("CaseFold", zg.module("CaseFold"));
330```
331
332In your code:
333
334```zig
335const Normalize = @import("Normalize");
336const CaseFold = @import("CaseFold");
337
338test "Caseless matching" {
339 // We need to normalize during the matching process.
340 var norm_data = try Normalize.NormData.init(allocator);
341 defer norm_data.deinit();
342 const n = Normalize{ .norm_data = &norm_data };
343
344 // We need Unicode case fold data.
345 const cfd = try CaseFold.FoldData.init(allocator);
346 defer cfd.deinit();
347
348 // The `CaseFold` structure takes a pointer to the data.
349 const cf = CaseFold{ .fold_data = &cfd };
350
351 // `compatCaselessMatch` provides the deepest level of caseless
352 // matching because it decomposes fully to NFKD.
353 const a = "Héllo World! \u{3d3}";
354 const b = "He\u{301}llo World! \u{3a5}\u{301}";
355 try expect(try cf.compatCaselessMatch(allocator, &n, a, b));
356
357 const c = "He\u{301}llo World! \u{3d2}\u{301}";
358 try expect(try cf.compatCaselessMatch(allocator, &n, a, c));
359
360 // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch`
361 // because it only decomposes to NFD. Naturally, it's faster because of this.
362 try expect(!try cf.canonCaselessMatch(allocator, &n, a, b));
363 try expect(try cf.canonCaselessMatch(allocator, &n, a, c));
364}
365```
366
367## Display Width of Characters and Strings
368When displaying text with a fixed-width font on a terminal screen, it's very
369important to know exactly how many columns or cells each character should take.
370Most characters will use one column, but there are many, like emoji and East-
371Asian ideographs that need more space. The `DisplayWidth` module provides
372methods for this purpose. It also has methods that use the display width calculation
373to `center`, `padLeft`, `padRight`, and `wrap` text.
374
375In your `build.zig`:
376
377```zig
378exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth"));
379```
380
381In your code:
382
383```zig
384const DisplayWidth = @import("DisplayWidth");
385
386test "Display width" {
387 // We need Unicode data for display width calculation.
388 const dwd = try DisplayWidth.DisplayWidthData.init(allocator);
389 defer dwd.deinit();
390
391 // The `DisplayWidth` structure takes a pointer to the data.
392 const dw = DisplayWidth{ .data = &dwd };
393
394 // String display width
395 try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n"));
396 try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊"));
397 try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊"));
398 try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
399 try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나"));
400
401 // Centering text
402 const centered = try dw.center(allocator, "w😊w", 10, "-");
403 defer allocator.free(centered);
404 try expectEqualStrings("---w😊w---", centered);
405
406 // Pad left
407 const right_aligned = try dw.padLeft(allocator, "abc", 9, "*");
408 defer allocator.free(right_aligned);
409 try expectEqualStrings("******abc", right_aligned);
410
411 // Pad right
412 const left_aligned = try dw.padRight(allocator, "abc", 9, "*");
413 defer allocator.free(left_aligned);
414 try expectEqualStrings("abc******", left_aligned);
415
416 // Wrap text
417 const input = "The quick brown fox\r\njumped over the lazy dog!";
418 const wrapped = try dw.wrap(allocator, input, 10, 3);
419 defer allocator.free(wrapped);
420 const want =
421 \\The quick
422 \\brown fox
423 \\jumped
424 \\over the
425 \\lazy dog!
426 ;
427 try expectEqualStrings(want, wrapped);
428}
429```
430
431## Scripts
432Unicode categorizes code points by the Script in which they belong. A Script
433collects letters and other symbols that belong to a particular writing system.
434You can detect the Script for a code point with the `ScriptsData` module.
435
436In your `build.zig`:
437
438```zig
439exe.root_module.addImport("ScriptsData", zg.module("ScriptsData"));
440```
441
442In your code:
443
444```zig
445const ScriptsData = @import("ScriptsData");
446
447test "Scripts" {
448 const sd = try ScriptsData.init(allocator);
449 defer sd.deinit();
450
451 // To see the full list of Scripts, look at the
452 // `src/ScriptsData.zig` file. They are listed in an enum.
453 try expect(sd.script('A') == .Latin);
454 try expect(sd.script('Ω') == .Greek);
455 try expect(sd.script('צ') == .Hebrew);
456}
457```
458
459## Relation to Ziglyph
460zg is a total re-write of some of the components of Ziglyph. The idea was to
461reduce binary size and improve performance. These goals were achieved by using
462trie-like data structures instead of generated functions. Where Ziglyph uses a
463function call, zg uses an array lookup, which is considerably faster. In addition, all
464these data structures in zg are loaded at runtime from compressed versions in the
465binary. This allows for smaller binary sizes at the expense of increased memory
466footprint at runtime.
467
468Benchmarks demonstrate the above stated goals have been met:
469
470```plain
471Binary sizes =======
472
473149K ziglyph_case
47487K zg_case
475
476275K ziglyph_caseless
477168K zg_caseless
478
47968K ziglyph_codepoint
48068K zg_codepoint
481
482101K ziglyph_grapheme
48386K zg_grapheme
484
485185K ziglyph_normalizer
486152K zg_normalize
487
488101K ziglyph_width
48986K zg_width
490
491Benchmarks ==========
492
493Ziglyph toUpperStr/toLowerStr: result: 7911596, took: 80
494Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17
495zg toUpperStr/toLowerStr: result: 7911596, took: 62
496zg isUpperStr/isLowerStr: result: 110959, took: 7
497
498Ziglyph Normalizer.eqlCaseless: result: 625, took: 500
499zg CaseFold.canonCaselessMatch: result: 625, took: 385
500zg CaseFold.compatCaselessMatch: result: 625, took: 593
501
502Ziglyph CodePointIterator: result: 3769314, took: 2
503zg CodePointIterator: result: 3769314, took: 3
504
505Ziglyph GraphemeIterator: result: 3691806, took: 48
506zg GraphemeIterator: result: 3691806, took: 16
507
508Ziglyph Normalizer.nfkc: result: 3934162, took: 416
509zg Normalize.nfkc: result: 3934162, took: 182
510
511Ziglyph Normalizer.nfc: result: 3955798, took: 57
512zg Normalize.nfc: result: 3955798, took: 28
513
514Ziglyph Normalizer.nfkd: result: 4006398, took: 172
515zg Normalize.nfkd: result: 4006398, took: 104
516
517Ziglyph Normalizer.nfd: result: 4028034, took: 169
518zg Normalize.nfd: result: 4028034, took: 104
519
520Ziglyph Normalizer.eql: result: 625, took: 337
521Zg Normalize.eql: result: 625, took: 53
522
523Ziglyph display_width.strWidth: result: 3700914, took: 71
524zg DisplayWidth.strWidth: result: 3700914, took: 24
525```
526
527These results were obtained on an M1 Mac with 16 GiB of RAM.
528
529In contrast to Ziglyph, zg does not have:
530
531- Word segmentation
532- Sentence segmentation
533- Collation
534
535It's possible that any missing functionality will be added in future versions,
536but only if enough demand is present in the community.
537
diff --git a/build.zig b/build.zig
index c05b4a1..9f7f518 100644
--- a/build.zig
+++ b/build.zig
@@ -315,7 +315,7 @@ pub fn build(b: *std.Build) void {
315 scripts_data.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); 315 scripts_data.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out });
316 316
317 // Properties 317 // Properties
318 const props_data = b.addModule("ScriptsData", .{ 318 const props_data = b.addModule("PropsData", .{
319 .root_source_file = .{ .path = "src/PropsData.zig" }, 319 .root_source_file = .{ .path = "src/PropsData.zig" },
320 .target = target, 320 .target = target,
321 .optimize = optimize, 321 .optimize = optimize,
diff --git a/codegen/canon.zig b/codegen/canon.zig
index 9c84bfc..28b7f28 100644
--- a/codegen/canon.zig
+++ b/codegen/canon.zig
@@ -17,11 +17,10 @@ pub fn main() !void {
17 _ = args_iter.skip(); 17 _ = args_iter.skip();
18 const output_path = args_iter.next() orelse @panic("No output file arg!"); 18 const output_path = args_iter.next() orelse @panic("No output file arg!");
19 19
20 const compressor = std.compress.deflate.compressor; 20 const compressor = std.compress.flate.deflate.compressor;
21 var out_file = try std.fs.cwd().createFile(output_path, .{}); 21 var out_file = try std.fs.cwd().createFile(output_path, .{});
22 defer out_file.close(); 22 defer out_file.close();
23 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 23 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
24 defer out_comp.deinit();
25 const writer = out_comp.writer(); 24 const writer = out_comp.writer();
26 25
27 const endian = builtin.cpu.arch.endian(); 26 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/case_prop.zig b/codegen/case_prop.zig
index ce7ee0d..6c912a8 100644
--- a/codegen/case_prop.zig
+++ b/codegen/case_prop.zig
@@ -118,11 +118,10 @@ pub fn main() !void {
118 _ = args_iter.skip(); 118 _ = args_iter.skip();
119 const output_path = args_iter.next() orelse @panic("No output file arg!"); 119 const output_path = args_iter.next() orelse @panic("No output file arg!");
120 120
121 const compressor = std.compress.deflate.compressor; 121 const compressor = std.compress.flate.deflate.compressor;
122 var out_file = try std.fs.cwd().createFile(output_path, .{}); 122 var out_file = try std.fs.cwd().createFile(output_path, .{});
123 defer out_file.close(); 123 defer out_file.close();
124 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 124 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
125 defer out_comp.deinit();
126 const writer = out_comp.writer(); 125 const writer = out_comp.writer();
127 126
128 const endian = builtin.cpu.arch.endian(); 127 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/ccc.zig b/codegen/ccc.zig
index fd278ea..a01c8d2 100644
--- a/codegen/ccc.zig
+++ b/codegen/ccc.zig
@@ -107,11 +107,10 @@ pub fn main() !void {
107 _ = args_iter.skip(); 107 _ = args_iter.skip();
108 const output_path = args_iter.next() orelse @panic("No output file arg!"); 108 const output_path = args_iter.next() orelse @panic("No output file arg!");
109 109
110 const compressor = std.compress.deflate.compressor; 110 const compressor = std.compress.flate.deflate.compressor;
111 var out_file = try std.fs.cwd().createFile(output_path, .{}); 111 var out_file = try std.fs.cwd().createFile(output_path, .{});
112 defer out_file.close(); 112 defer out_file.close();
113 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 113 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
114 defer out_comp.deinit();
115 const writer = out_comp.writer(); 114 const writer = out_comp.writer();
116 115
117 const endian = builtin.cpu.arch.endian(); 116 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/compat.zig b/codegen/compat.zig
index d0a108a..07616fc 100644
--- a/codegen/compat.zig
+++ b/codegen/compat.zig
@@ -17,11 +17,10 @@ pub fn main() !void {
17 _ = args_iter.skip(); 17 _ = args_iter.skip();
18 const output_path = args_iter.next() orelse @panic("No output file arg!"); 18 const output_path = args_iter.next() orelse @panic("No output file arg!");
19 19
20 const compressor = std.compress.deflate.compressor; 20 const compressor = std.compress.flate.deflate.compressor;
21 var out_file = try std.fs.cwd().createFile(output_path, .{}); 21 var out_file = try std.fs.cwd().createFile(output_path, .{});
22 defer out_file.close(); 22 defer out_file.close();
23 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 23 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
24 defer out_comp.deinit();
25 const writer = out_comp.writer(); 24 const writer = out_comp.writer();
26 25
27 const endian = builtin.cpu.arch.endian(); 26 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/core_props.zig b/codegen/core_props.zig
index 1f46f9e..f60c7a9 100644
--- a/codegen/core_props.zig
+++ b/codegen/core_props.zig
@@ -121,11 +121,10 @@ pub fn main() !void {
121 _ = args_iter.skip(); 121 _ = args_iter.skip();
122 const output_path = args_iter.next() orelse @panic("No output file arg!"); 122 const output_path = args_iter.next() orelse @panic("No output file arg!");
123 123
124 const compressor = std.compress.deflate.compressor; 124 const compressor = std.compress.flate.deflate.compressor;
125 var out_file = try std.fs.cwd().createFile(output_path, .{}); 125 var out_file = try std.fs.cwd().createFile(output_path, .{});
126 defer out_file.close(); 126 defer out_file.close();
127 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 127 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
128 defer out_comp.deinit();
129 const writer = out_comp.writer(); 128 const writer = out_comp.writer();
130 129
131 const endian = builtin.cpu.arch.endian(); 130 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/dwp.zig b/codegen/dwp.zig
index 76a14d3..b36b2c9 100644
--- a/codegen/dwp.zig
+++ b/codegen/dwp.zig
@@ -230,11 +230,10 @@ pub fn main() !void {
230 _ = args_iter.skip(); 230 _ = args_iter.skip();
231 const output_path = args_iter.next() orelse @panic("No output file arg!"); 231 const output_path = args_iter.next() orelse @panic("No output file arg!");
232 232
233 const compressor = std.compress.deflate.compressor; 233 const compressor = std.compress.flate.deflate.compressor;
234 var out_file = try std.fs.cwd().createFile(output_path, .{}); 234 var out_file = try std.fs.cwd().createFile(output_path, .{});
235 defer out_file.close(); 235 defer out_file.close();
236 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 236 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
237 defer out_comp.deinit();
238 const writer = out_comp.writer(); 237 const writer = out_comp.writer();
239 238
240 const endian = builtin.cpu.arch.endian(); 239 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/fold.zig b/codegen/fold.zig
index b3192e7..6dc51ac 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -63,11 +63,10 @@ pub fn main() !void {
63 _ = args_iter.skip(); 63 _ = args_iter.skip();
64 const output_path = args_iter.next() orelse @panic("No output file arg!"); 64 const output_path = args_iter.next() orelse @panic("No output file arg!");
65 65
66 const compressor = std.compress.deflate.compressor; 66 const compressor = std.compress.flate.deflate.compressor;
67 var out_file = try std.fs.cwd().createFile(output_path, .{}); 67 var out_file = try std.fs.cwd().createFile(output_path, .{});
68 defer out_file.close(); 68 defer out_file.close();
69 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 69 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
70 defer out_comp.deinit();
71 const writer = out_comp.writer(); 70 const writer = out_comp.writer();
72 71
73 const endian = builtin.cpu.arch.endian(); 72 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/gbp.zig b/codegen/gbp.zig
index 39e0da3..3fc4461 100644
--- a/codegen/gbp.zig
+++ b/codegen/gbp.zig
@@ -227,11 +227,10 @@ pub fn main() !void {
227 _ = args_iter.skip(); 227 _ = args_iter.skip();
228 const output_path = args_iter.next() orelse @panic("No output file arg!"); 228 const output_path = args_iter.next() orelse @panic("No output file arg!");
229 229
230 const compressor = std.compress.deflate.compressor; 230 const compressor = std.compress.flate.deflate.compressor;
231 var out_file = try std.fs.cwd().createFile(output_path, .{}); 231 var out_file = try std.fs.cwd().createFile(output_path, .{});
232 defer out_file.close(); 232 defer out_file.close();
233 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 233 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
234 defer out_comp.deinit();
235 const writer = out_comp.writer(); 234 const writer = out_comp.writer();
236 235
237 const endian = builtin.cpu.arch.endian(); 236 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/gencat.zig b/codegen/gencat.zig
index a7713e6..fe06bd7 100644
--- a/codegen/gencat.zig
+++ b/codegen/gencat.zig
@@ -151,11 +151,10 @@ pub fn main() !void {
151 _ = args_iter.skip(); 151 _ = args_iter.skip();
152 const output_path = args_iter.next() orelse @panic("No output file arg!"); 152 const output_path = args_iter.next() orelse @panic("No output file arg!");
153 153
154 const compressor = std.compress.deflate.compressor; 154 const compressor = std.compress.flate.deflate.compressor;
155 var out_file = try std.fs.cwd().createFile(output_path, .{}); 155 var out_file = try std.fs.cwd().createFile(output_path, .{});
156 defer out_file.close(); 156 defer out_file.close();
157 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 157 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
158 defer out_comp.deinit();
159 const writer = out_comp.writer(); 158 const writer = out_comp.writer();
160 159
161 const endian = builtin.cpu.arch.endian(); 160 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/hangul.zig b/codegen/hangul.zig
index 73680c6..2c42bb7 100644
--- a/codegen/hangul.zig
+++ b/codegen/hangul.zig
@@ -116,11 +116,10 @@ pub fn main() !void {
116 _ = args_iter.skip(); 116 _ = args_iter.skip();
117 const output_path = args_iter.next() orelse @panic("No output file arg!"); 117 const output_path = args_iter.next() orelse @panic("No output file arg!");
118 118
119 const compressor = std.compress.deflate.compressor; 119 const compressor = std.compress.flate.deflate.compressor;
120 var out_file = try std.fs.cwd().createFile(output_path, .{}); 120 var out_file = try std.fs.cwd().createFile(output_path, .{});
121 defer out_file.close(); 121 defer out_file.close();
122 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 122 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
123 defer out_comp.deinit();
124 const writer = out_comp.writer(); 123 const writer = out_comp.writer();
125 124
126 const endian = builtin.cpu.arch.endian(); 125 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/lower.zig b/codegen/lower.zig
index 644ec13..a053fe3 100644
--- a/codegen/lower.zig
+++ b/codegen/lower.zig
@@ -17,11 +17,10 @@ pub fn main() !void {
17 _ = args_iter.skip(); 17 _ = args_iter.skip();
18 const output_path = args_iter.next() orelse @panic("No output file arg!"); 18 const output_path = args_iter.next() orelse @panic("No output file arg!");
19 19
20 const compressor = std.compress.deflate.compressor; 20 const compressor = std.compress.flate.deflate.compressor;
21 var out_file = try std.fs.cwd().createFile(output_path, .{}); 21 var out_file = try std.fs.cwd().createFile(output_path, .{});
22 defer out_file.close(); 22 defer out_file.close();
23 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 23 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
24 defer out_comp.deinit();
25 const writer = out_comp.writer(); 24 const writer = out_comp.writer();
26 25
27 const endian = builtin.cpu.arch.endian(); 26 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/normp.zig b/codegen/normp.zig
index 8ceda36..60dabdc 100644
--- a/codegen/normp.zig
+++ b/codegen/normp.zig
@@ -117,11 +117,10 @@ pub fn main() !void {
117 _ = args_iter.skip(); 117 _ = args_iter.skip();
118 const output_path = args_iter.next() orelse @panic("No output file arg!"); 118 const output_path = args_iter.next() orelse @panic("No output file arg!");
119 119
120 const compressor = std.compress.deflate.compressor; 120 const compressor = std.compress.flate.deflate.compressor;
121 var out_file = try std.fs.cwd().createFile(output_path, .{}); 121 var out_file = try std.fs.cwd().createFile(output_path, .{});
122 defer out_file.close(); 122 defer out_file.close();
123 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 123 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
124 defer out_comp.deinit();
125 const writer = out_comp.writer(); 124 const writer = out_comp.writer();
126 125
127 const endian = builtin.cpu.arch.endian(); 126 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/numeric.zig b/codegen/numeric.zig
index ad8490c..038ac0a 100644
--- a/codegen/numeric.zig
+++ b/codegen/numeric.zig
@@ -118,11 +118,10 @@ pub fn main() !void {
118 _ = args_iter.skip(); 118 _ = args_iter.skip();
119 const output_path = args_iter.next() orelse @panic("No output file arg!"); 119 const output_path = args_iter.next() orelse @panic("No output file arg!");
120 120
121 const compressor = std.compress.deflate.compressor; 121 const compressor = std.compress.flate.deflate.compressor;
122 var out_file = try std.fs.cwd().createFile(output_path, .{}); 122 var out_file = try std.fs.cwd().createFile(output_path, .{});
123 defer out_file.close(); 123 defer out_file.close();
124 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 124 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
125 defer out_comp.deinit();
126 const writer = out_comp.writer(); 125 const writer = out_comp.writer();
127 126
128 const endian = builtin.cpu.arch.endian(); 127 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/props.zig b/codegen/props.zig
index 57a205e..24b22e0 100644
--- a/codegen/props.zig
+++ b/codegen/props.zig
@@ -118,11 +118,10 @@ pub fn main() !void {
118 _ = args_iter.skip(); 118 _ = args_iter.skip();
119 const output_path = args_iter.next() orelse @panic("No output file arg!"); 119 const output_path = args_iter.next() orelse @panic("No output file arg!");
120 120
121 const compressor = std.compress.deflate.compressor; 121 const compressor = std.compress.flate.deflate.compressor;
122 var out_file = try std.fs.cwd().createFile(output_path, .{}); 122 var out_file = try std.fs.cwd().createFile(output_path, .{});
123 defer out_file.close(); 123 defer out_file.close();
124 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 124 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
125 defer out_comp.deinit();
126 const writer = out_comp.writer(); 125 const writer = out_comp.writer();
127 126
128 const endian = builtin.cpu.arch.endian(); 127 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/scripts.zig b/codegen/scripts.zig
index e985c1e..660699d 100644
--- a/codegen/scripts.zig
+++ b/codegen/scripts.zig
@@ -288,11 +288,10 @@ pub fn main() !void {
288 _ = args_iter.skip(); 288 _ = args_iter.skip();
289 const output_path = args_iter.next() orelse @panic("No output file arg!"); 289 const output_path = args_iter.next() orelse @panic("No output file arg!");
290 290
291 const compressor = std.compress.deflate.compressor; 291 const compressor = std.compress.flate.deflate.compressor;
292 var out_file = try std.fs.cwd().createFile(output_path, .{}); 292 var out_file = try std.fs.cwd().createFile(output_path, .{});
293 defer out_file.close(); 293 defer out_file.close();
294 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 294 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
295 defer out_comp.deinit();
296 const writer = out_comp.writer(); 295 const writer = out_comp.writer();
297 296
298 const endian = builtin.cpu.arch.endian(); 297 const endian = builtin.cpu.arch.endian();
diff --git a/codegen/upper.zig b/codegen/upper.zig
index 455fe2c..5848911 100644
--- a/codegen/upper.zig
+++ b/codegen/upper.zig
@@ -17,11 +17,10 @@ pub fn main() !void {
17 _ = args_iter.skip(); 17 _ = args_iter.skip();
18 const output_path = args_iter.next() orelse @panic("No output file arg!"); 18 const output_path = args_iter.next() orelse @panic("No output file arg!");
19 19
20 const compressor = std.compress.deflate.compressor; 20 const compressor = std.compress.flate.deflate.compressor;
21 var out_file = try std.fs.cwd().createFile(output_path, .{}); 21 var out_file = try std.fs.cwd().createFile(output_path, .{});
22 defer out_file.close(); 22 defer out_file.close();
23 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); 23 var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
24 defer out_comp.deinit();
25 const writer = out_comp.writer(); 24 const writer = out_comp.writer();
26 25
27 const endian = builtin.cpu.arch.endian(); 26 const endian = builtin.cpu.arch.endian();
diff --git a/src/CanonData.zig b/src/CanonData.zig
index 64d5555..be2b381 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -10,11 +10,10 @@ nfd: [][]u21 = undefined,
10const Self = @This(); 10const Self = @This();
11 11
12pub fn init(allocator: mem.Allocator) !Self { 12pub fn init(allocator: mem.Allocator) !Self {
13 const decompressor = compress.deflate.decompressor; 13 const decompressor = compress.flate.inflate.decompressor;
14 const in_bytes = @embedFile("canon"); 14 const in_bytes = @embedFile("canon");
15 var in_fbs = std.io.fixedBufferStream(in_bytes); 15 var in_fbs = std.io.fixedBufferStream(in_bytes);
16 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 16 var in_decomp = decompressor(.raw, in_fbs.reader());
17 defer in_decomp.deinit();
18 var reader = in_decomp.reader(); 17 var reader = in_decomp.reader();
19 18
20 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
diff --git a/src/CaseData.zig b/src/CaseData.zig
index c9ccc1e..260637a 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
@@ -15,7 +15,7 @@ prop_s2: []u8 = undefined,
15const Self = @This(); 15const Self = @This();
16 16
17pub fn init(allocator: mem.Allocator) !Self { 17pub fn init(allocator: mem.Allocator) !Self {
18 const decompressor = compress.deflate.decompressor; 18 const decompressor = compress.flate.inflate.decompressor;
19 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
20 20
21 var self = Self{ 21 var self = Self{
@@ -32,8 +32,7 @@ pub fn init(allocator: mem.Allocator) !Self {
32 // Uppercase 32 // Uppercase
33 const upper_bytes = @embedFile("upper"); 33 const upper_bytes = @embedFile("upper");
34 var upper_fbs = std.io.fixedBufferStream(upper_bytes); 34 var upper_fbs = std.io.fixedBufferStream(upper_bytes);
35 var upper_decomp = try decompressor(allocator, upper_fbs.reader(), null); 35 var upper_decomp = decompressor(.raw, upper_fbs.reader());
36 defer upper_decomp.deinit();
37 var upper_reader = upper_decomp.reader(); 36 var upper_reader = upper_decomp.reader();
38 37
39 while (true) { 38 while (true) {
@@ -46,8 +45,7 @@ pub fn init(allocator: mem.Allocator) !Self {
46 // Lowercase 45 // Lowercase
47 const lower_bytes = @embedFile("lower"); 46 const lower_bytes = @embedFile("lower");
48 var lower_fbs = std.io.fixedBufferStream(lower_bytes); 47 var lower_fbs = std.io.fixedBufferStream(lower_bytes);
49 var lower_decomp = try decompressor(allocator, lower_fbs.reader(), null); 48 var lower_decomp = decompressor(.raw, lower_fbs.reader());
50 defer lower_decomp.deinit();
51 var lower_reader = lower_decomp.reader(); 49 var lower_reader = lower_decomp.reader();
52 50
53 while (true) { 51 while (true) {
@@ -60,8 +58,7 @@ pub fn init(allocator: mem.Allocator) !Self {
60 // Case properties 58 // Case properties
61 const cp_bytes = @embedFile("case_prop"); 59 const cp_bytes = @embedFile("case_prop");
62 var cp_fbs = std.io.fixedBufferStream(cp_bytes); 60 var cp_fbs = std.io.fixedBufferStream(cp_bytes);
63 var cp_decomp = try decompressor(allocator, cp_fbs.reader(), null); 61 var cp_decomp = decompressor(.raw, cp_fbs.reader());
64 defer cp_decomp.deinit();
65 var cp_reader = cp_decomp.reader(); 62 var cp_reader = cp_decomp.reader();
66 63
67 const stage_1_len: u16 = try cp_reader.readInt(u16, endian); 64 const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index 9b10e16..3e7535e 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -10,7 +10,9 @@ fold_data: *const FoldData,
10 10
11const Self = @This(); 11const Self = @This();
12 12
13fn caseFold( 13/// Produces the case folded code points for `cps`. Caller must free returned
14/// slice with `allocator`.
15pub fn caseFold(
14 self: Self, 16 self: Self,
15 allocator: mem.Allocator, 17 allocator: mem.Allocator,
16 cps: []const u21, 18 cps: []const u21,
@@ -37,6 +39,8 @@ fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
37 } else false; 39 } else false;
38} 40}
39 41
42/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
43/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
40pub fn compatCaselessMatch( 44pub fn compatCaselessMatch(
41 self: Self, 45 self: Self,
42 allocator: mem.Allocator, 46 allocator: mem.Allocator,
@@ -108,6 +112,8 @@ test "compatCaselessMatch" {
108 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); 112 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c));
109} 113}
110 114
115/// Performs canonical caseless string matching by decomposing to NFD. This is
116/// faster than `compatCaselessMatch`, but less comprehensive.
111pub fn canonCaselessMatch( 117pub fn canonCaselessMatch(
112 self: Self, 118 self: Self,
113 allocator: mem.Allocator, 119 allocator: mem.Allocator,
diff --git a/src/CombiningData.zig b/src/CombiningData.zig
index a40cbde..16b923f 100644
--- a/src/CombiningData.zig
+++ b/src/CombiningData.zig
@@ -10,11 +10,10 @@ s2: []u8 = undefined,
10const Self = @This(); 10const Self = @This();
11 11
12pub fn init(allocator: mem.Allocator) !Self { 12pub fn init(allocator: mem.Allocator) !Self {
13 const decompressor = compress.deflate.decompressor; 13 const decompressor = compress.flate.inflate.decompressor;
14 const in_bytes = @embedFile("ccc"); 14 const in_bytes = @embedFile("ccc");
15 var in_fbs = std.io.fixedBufferStream(in_bytes); 15 var in_fbs = std.io.fixedBufferStream(in_bytes);
16 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 16 var in_decomp = decompressor(.raw, in_fbs.reader());
17 defer in_decomp.deinit();
18 var reader = in_decomp.reader(); 17 var reader = in_decomp.reader();
19 18
20 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
diff --git a/src/CompatData.zig b/src/CompatData.zig
index a931cb3..3346a06 100644
--- a/src/CompatData.zig
+++ b/src/CompatData.zig
@@ -9,11 +9,10 @@ nfkd: [][]u21 = undefined,
9const Self = @This(); 9const Self = @This();
10 10
11pub fn init(allocator: mem.Allocator) !Self { 11pub fn init(allocator: mem.Allocator) !Self {
12 const decompressor = compress.deflate.decompressor; 12 const decompressor = compress.flate.inflate.decompressor;
13 const in_bytes = @embedFile("compat"); 13 const in_bytes = @embedFile("compat");
14 var in_fbs = std.io.fixedBufferStream(in_bytes); 14 var in_fbs = std.io.fixedBufferStream(in_bytes);
15 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 15 var in_decomp = decompressor(.raw, in_fbs.reader());
16 defer in_decomp.deinit();
17 var reader = in_decomp.reader(); 16 var reader = in_decomp.reader();
18 17
19 const endian = builtin.cpu.arch.endian(); 18 const endian = builtin.cpu.arch.endian();
diff --git a/src/FoldData.zig b/src/FoldData.zig
index a06eefe..d4312b0 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -10,11 +10,10 @@ cwcf: []bool = undefined,
10const Self = @This(); 10const Self = @This();
11 11
12pub fn init(allocator: mem.Allocator) !Self { 12pub fn init(allocator: mem.Allocator) !Self {
13 const decompressor = compress.deflate.decompressor; 13 const decompressor = compress.flate.inflate.decompressor;
14 const in_bytes = @embedFile("fold"); 14 const in_bytes = @embedFile("fold");
15 var in_fbs = std.io.fixedBufferStream(in_bytes); 15 var in_fbs = std.io.fixedBufferStream(in_bytes);
16 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 16 var in_decomp = decompressor(.raw, in_fbs.reader());
17 defer in_decomp.deinit();
18 var reader = in_decomp.reader(); 17 var reader = in_decomp.reader();
19 18
20 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
diff --git a/src/GenCatData.zig b/src/GenCatData.zig
index 12501bf..454c45a 100644
--- a/src/GenCatData.zig
+++ b/src/GenCatData.zig
@@ -45,11 +45,10 @@ s3: []u5 = undefined,
45const Self = @This(); 45const Self = @This();
46 46
47pub fn init(allocator: mem.Allocator) !Self { 47pub fn init(allocator: mem.Allocator) !Self {
48 const decompressor = compress.deflate.decompressor; 48 const decompressor = compress.flate.inflate.decompressor;
49 const in_bytes = @embedFile("gencat"); 49 const in_bytes = @embedFile("gencat");
50 var in_fbs = std.io.fixedBufferStream(in_bytes); 50 var in_fbs = std.io.fixedBufferStream(in_bytes);
51 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 51 var in_decomp = decompressor(.raw, in_fbs.reader());
52 defer in_decomp.deinit();
53 var reader = in_decomp.reader(); 52 var reader = in_decomp.reader();
54 53
55 const endian = builtin.cpu.arch.endian(); 54 const endian = builtin.cpu.arch.endian();
diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig
index 500ffea..1710870 100644
--- a/src/GraphemeData.zig
+++ b/src/GraphemeData.zig
@@ -38,11 +38,10 @@ s3: []u8 = undefined,
38const Self = @This(); 38const Self = @This();
39 39
40pub fn init(allocator: mem.Allocator) !Self { 40pub fn init(allocator: mem.Allocator) !Self {
41 const decompressor = compress.deflate.decompressor; 41 const decompressor = compress.flate.inflate.decompressor;
42 const in_bytes = @embedFile("gbp"); 42 const in_bytes = @embedFile("gbp");
43 var in_fbs = std.io.fixedBufferStream(in_bytes); 43 var in_fbs = std.io.fixedBufferStream(in_bytes);
44 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 44 var in_decomp = decompressor(.raw, in_fbs.reader());
45 defer in_decomp.deinit();
46 var reader = in_decomp.reader(); 45 var reader = in_decomp.reader();
47 46
48 const endian = builtin.cpu.arch.endian(); 47 const endian = builtin.cpu.arch.endian();
diff --git a/src/HangulData.zig b/src/HangulData.zig
index 99d91c1..5eee427 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -20,11 +20,10 @@ s2: []u3 = undefined,
20const Self = @This(); 20const Self = @This();
21 21
22pub fn init(allocator: mem.Allocator) !Self { 22pub fn init(allocator: mem.Allocator) !Self {
23 const decompressor = compress.deflate.decompressor; 23 const decompressor = compress.flate.inflate.decompressor;
24 const in_bytes = @embedFile("hangul"); 24 const in_bytes = @embedFile("hangul");
25 var in_fbs = std.io.fixedBufferStream(in_bytes); 25 var in_fbs = std.io.fixedBufferStream(in_bytes);
26 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 26 var in_decomp = decompressor(.raw, in_fbs.reader());
27 defer in_decomp.deinit();
28 var reader = in_decomp.reader(); 27 var reader = in_decomp.reader();
29 28
30 const endian = builtin.cpu.arch.endian(); 29 const endian = builtin.cpu.arch.endian();
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig
index 86d497b..899bb8f 100644
--- a/src/NormPropsData.zig
+++ b/src/NormPropsData.zig
@@ -11,11 +11,10 @@ s2: []u4 = undefined,
11const Self = @This(); 11const Self = @This();
12 12
13pub fn init(allocator: mem.Allocator) !Self { 13pub fn init(allocator: mem.Allocator) !Self {
14 const decompressor = compress.deflate.decompressor; 14 const decompressor = compress.flate.inflate.decompressor;
15 const in_bytes = @embedFile("normp"); 15 const in_bytes = @embedFile("normp");
16 var in_fbs = std.io.fixedBufferStream(in_bytes); 16 var in_fbs = std.io.fixedBufferStream(in_bytes);
17 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 17 var in_decomp = decompressor(.raw, in_fbs.reader());
18 defer in_decomp.deinit();
19 var reader = in_decomp.reader(); 18 var reader = in_decomp.reader();
20 19
21 const endian = builtin.cpu.arch.endian(); 20 const endian = builtin.cpu.arch.endian();
diff --git a/src/Normalize.zig b/src/Normalize.zig
index f437f4f..85e3aa3 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -572,47 +572,6 @@ test "eql" {
572 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); 572 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
573} 573}
574 574
575// FCD
576fn getLeadCcc(self: Self, cp: u21) u8 {
577 const dc = self.mapping(cp, .nfd);
578 const dcp = if (dc.form == .same) cp else dc.cps[0];
579 return self.norm_data.ccc_data.ccc(dcp);
580}
581
582fn getTrailCcc(self: Self, cp: u21) u8 {
583 const dc = self.mapping(cp, .nfd);
584 const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1];
585 return self.norm_data.ccc_data.ccc(dcp);
586}
587
588// Fast check to detect if a string is already in NFC or NFD form.
589fn isFcd(self: Self, str: []const u8) bool {
590 var prev_ccc: u8 = 0;
591 var cp_iter = CodePointIterator{ .bytes = str };
592
593 return while (cp_iter.next()) |cp| {
594 const ccc = self.getLeadCcc(cp.code);
595 if (ccc != 0 and ccc < prev_ccc) break false;
596 prev_ccc = self.getTrailCcc(cp.code);
597 } else true;
598}
599
600test "isFcd" {
601 const allocator = testing.allocator;
602 const data = try NormData.init(allocator);
603 defer data.deinit();
604 const n = Self{ .norm_data = &data };
605
606 const is_nfc = "José \u{3D3}";
607 try testing.expect(n.isFcd(is_nfc));
608
609 const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
610 try testing.expect(n.isFcd(is_nfd));
611
612 const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
613 try testing.expect(!n.isFcd(not_fcd));
614}
615
616/// Returns true if `str` only contains Latin-1 Supplement 575/// Returns true if `str` only contains Latin-1 Supplement
617/// code points. Uses SIMD if possible. 576/// code points. Uses SIMD if possible.
618pub fn isLatin1Only(str: []const u8) bool { 577pub fn isLatin1Only(str: []const u8) bool {
diff --git a/src/PropsData.zig b/src/PropsData.zig
index 9d24e68..f6c8370 100644
--- a/src/PropsData.zig
+++ b/src/PropsData.zig
@@ -15,14 +15,13 @@ num_s2: []u8 = undefined,
15const Self = @This(); 15const Self = @This();
16 16
17pub fn init(allocator: mem.Allocator) !Self { 17pub fn init(allocator: mem.Allocator) !Self {
18 const decompressor = compress.deflate.decompressor; 18 const decompressor = compress.flate.inflate.decompressor;
19 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
20 20
21 // Process DerivedCoreProperties.txt 21 // Process DerivedCoreProperties.txt
22 const core_bytes = @embedFile("core_props"); 22 const core_bytes = @embedFile("core_props");
23 var core_fbs = std.io.fixedBufferStream(core_bytes); 23 var core_fbs = std.io.fixedBufferStream(core_bytes);
24 var core_decomp = try decompressor(allocator, core_fbs.reader(), null); 24 var core_decomp = decompressor(.raw, core_fbs.reader());
25 defer core_decomp.deinit();
26 var core_reader = core_decomp.reader(); 25 var core_reader = core_decomp.reader();
27 26
28 var self = Self{ .allocator = allocator }; 27 var self = Self{ .allocator = allocator };
@@ -40,8 +39,7 @@ pub fn init(allocator: mem.Allocator) !Self {
40 // Process PropList.txt 39 // Process PropList.txt
41 const props_bytes = @embedFile("props"); 40 const props_bytes = @embedFile("props");
42 var props_fbs = std.io.fixedBufferStream(props_bytes); 41 var props_fbs = std.io.fixedBufferStream(props_bytes);
43 var props_decomp = try decompressor(allocator, props_fbs.reader(), null); 42 var props_decomp = decompressor(.raw, props_fbs.reader());
44 defer props_decomp.deinit();
45 var props_reader = props_decomp.reader(); 43 var props_reader = props_decomp.reader();
46 44
47 const stage_1_len: u16 = try props_reader.readInt(u16, endian); 45 const stage_1_len: u16 = try props_reader.readInt(u16, endian);
@@ -57,8 +55,7 @@ pub fn init(allocator: mem.Allocator) !Self {
57 // Process DerivedNumericType.txt 55 // Process DerivedNumericType.txt
58 const num_bytes = @embedFile("numeric"); 56 const num_bytes = @embedFile("numeric");
59 var num_fbs = std.io.fixedBufferStream(num_bytes); 57 var num_fbs = std.io.fixedBufferStream(num_bytes);
60 var num_decomp = try decompressor(allocator, num_fbs.reader(), null); 58 var num_decomp = decompressor(.raw, num_fbs.reader());
61 defer num_decomp.deinit();
62 var num_reader = num_decomp.reader(); 59 var num_reader = num_decomp.reader();
63 60
64 const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); 61 const num_stage_1_len: u16 = try num_reader.readInt(u16, endian);
diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig
index 4e371bf..415ce2d 100644
--- a/src/ScriptsData.zig
+++ b/src/ScriptsData.zig
@@ -4,7 +4,7 @@ const compress = std.compress;
4const mem = std.mem; 4const mem = std.mem;
5const testing = std.testing; 5const testing = std.testing;
6 6
7/// Script 7/// Scripts
8pub const Script = enum { 8pub const Script = enum {
9 none, 9 none,
10 Adlam, 10 Adlam,
@@ -180,11 +180,10 @@ s3: []u8 = undefined,
180const Self = @This(); 180const Self = @This();
181 181
182pub fn init(allocator: mem.Allocator) !Self { 182pub fn init(allocator: mem.Allocator) !Self {
183 const decompressor = compress.deflate.decompressor; 183 const decompressor = compress.flate.inflate.decompressor;
184 const in_bytes = @embedFile("scripts"); 184 const in_bytes = @embedFile("scripts");
185 var in_fbs = std.io.fixedBufferStream(in_bytes); 185 var in_fbs = std.io.fixedBufferStream(in_bytes);
186 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 186 var in_decomp = decompressor(.raw, in_fbs.reader());
187 defer in_decomp.deinit();
188 var reader = in_decomp.reader(); 187 var reader = in_decomp.reader();
189 188
190 const endian = builtin.cpu.arch.endian(); 189 const endian = builtin.cpu.arch.endian();
diff --git a/src/WidthData.zig b/src/WidthData.zig
index b9ef84e..cf31b7f 100644
--- a/src/WidthData.zig
+++ b/src/WidthData.zig
@@ -14,11 +14,10 @@ s2: []i3 = undefined,
14const Self = @This(); 14const Self = @This();
15 15
16pub fn init(allocator: mem.Allocator) !Self { 16pub fn init(allocator: mem.Allocator) !Self {
17 const decompressor = compress.deflate.decompressor; 17 const decompressor = compress.flate.inflate.decompressor;
18 const in_bytes = @embedFile("dwp"); 18 const in_bytes = @embedFile("dwp");
19 var in_fbs = std.io.fixedBufferStream(in_bytes); 19 var in_fbs = std.io.fixedBufferStream(in_bytes);
20 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 20 var in_decomp = decompressor(.raw, in_fbs.reader());
21 defer in_decomp.deinit();
22 var reader = in_decomp.reader(); 21 var reader = in_decomp.reader();
23 22
24 const endian = builtin.cpu.arch.endian(); 23 const endian = builtin.cpu.arch.endian();