From 200c617c865a5952f0bd12378802cc06ea3eb1c2 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sun, 31 Mar 2024 09:59:51 -0400 Subject: Updated README --- README.md | 537 +++++++++++++++++++++++++++++++++++++++++++++++++ build.zig | 2 +- codegen/canon.zig | 5 +- codegen/case_prop.zig | 5 +- codegen/ccc.zig | 5 +- codegen/compat.zig | 5 +- codegen/core_props.zig | 5 +- codegen/dwp.zig | 5 +- codegen/fold.zig | 5 +- codegen/gbp.zig | 5 +- codegen/gencat.zig | 5 +- codegen/hangul.zig | 5 +- codegen/lower.zig | 5 +- codegen/normp.zig | 5 +- codegen/numeric.zig | 5 +- codegen/props.zig | 5 +- codegen/scripts.zig | 5 +- codegen/upper.zig | 5 +- src/CanonData.zig | 5 +- src/CaseData.zig | 11 +- src/CaseFold.zig | 8 +- src/CombiningData.zig | 5 +- src/CompatData.zig | 5 +- src/FoldData.zig | 5 +- src/GenCatData.zig | 5 +- src/GraphemeData.zig | 5 +- src/HangulData.zig | 5 +- src/NormPropsData.zig | 5 +- src/Normalize.zig | 41 ---- src/PropsData.zig | 11 +- src/ScriptsData.zig | 7 +- src/WidthData.zig | 5 +- 32 files changed, 606 insertions(+), 136 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..d4fc8f6 --- /dev/null +++ b/README.md @@ -0,0 +1,537 @@ +# zg +zg provides Unicode text processing for Zig projects. + +## Unicode Version +The Unicode version supported by zg is 15.1.0. + +## Zig Version +The minimum Zig version required is 0.12.0-dev.3496+a2df84d0. + +## Integrating zg into your Zig Project +You first need to add zg as a dependency in your `build.zig.zon` file: + +```zig +.zg = .{ + .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.1.0.tar.gz", +} +``` + +Then instantiate the dependency in your `build.zig`: + + +```zig +const zg = b.dependency("zg", .{}); +``` + +## A Modular Approach +zg is a modular library. This approach minimizes binary file size and memory +requirements by only including the Unicode data required for the specified module. 
+The following sections describe the various modules and their specific use case. + +## Code Points +In the `code_point` module, you'll find a data structure representing a single code +point, `CodePoint`, and an `Iterator` to iterate over the code points in a string. + +In your `build.zig`: + +```zig +exe.root_module.addImport("code_point", zg.module("code_point")); +``` + +In your code: + +```zig +const code_point = @import("code_point"); + +test "Code point iterator" { + const str = "Hi 😊"; + var iter = code_point.Iterator{ .bytes = str }; + var i: usize = 0; + + while (iter.next()) |cp| : (i += 1) { + // The `code` field is the actual code point scalar as a `u21`. + if (i == 0) try expect(cp.code == 'H'); + if (i == 1) try expect(cp.code == 'i'); + if (i == 2) try expect(cp.code == ' '); + + if (i == 3) { + try expect(cp.code == '😊'); + + // The `offset` field is the byte offset in the + // source string. + try expect(cp.offset == 3); + + // The `len` field is the length in bytes of the + // code point in the source string. + try expect(cp.len == 4); + } + } +} +``` + +## Grapheme Clusters +Many characters are composed from more than one code point. These are known as +Grapheme Clusters and the `grapheme` module has a data structure to represent +them, `Grapheme`, and an `Iterator` to iterate over them in a string. + +In your `build.zig`: + +```zig +exe.root_module.addImport("grapheme", zg.module("grapheme")); +``` + +In your code: + +```zig +const grapheme = @import("grapheme"); + +test "Grapheme cluster iterator" { + // we need some Unicode data to process Grapheme Clusters. + const gd = try grapheme.GraphemeData.init(allocator); + defer gd.deinit(); + + const str = "He\u{301}"; // Hé + var iter = grapheme.Iterator.init(str, &gd); + + var i: usize = 0; + + while (iter.next()) |gc| : (i += 1) { + // The `len` field is the length in bytes of the + // grapheme cluster in the source string. 
+ if (i == 0) try expect(gc.len == 1); + + if (i == 1) { + try expect(gc.len == 3); + + // The `offset` in bytes of the grapheme cluster + // in the source string. + try expect(gc.offset == 1); + + // The `bytes` method returns the slice of bytes + // that comprise this grapheme cluster in the + // source string `str`. + try expectEqualStrings("e\u{301}", gc.bytes(str)); + } + } +} +``` + +## Unicode General Categories +To detect the general category for a code point, use the `GenCatData` module. + +In your `build.zig`: + +```zig +exe.root_module.addImport("GenCatData", zg.module("GenCatData")); +``` + +In your code: + +```zig +const GenCatData = @import("GenCatData"); + +test "General Category" { + const gcd = try GenCatData.init(allocator); + defer gcd.deinit(); + + // The `gc` method returns the abbreviated General Category. + // These abbreviations and descriptive comments can be found + // in the source file `src/GenCatData.zig` as an enum. + try expect(gcd.gc('A') == .Lu); // Lu: uppercase letter + try expect(gcd.gc('3') == .Nd); // Nd: decimal number + + // The following are convenience methods for groups of General + // Categories. For example, all letter categories start with `L`: + // Lu, Ll, Lt, Lo. + try expect(gcd.isControl(0)); + try expect(gcd.isLetter('z')); + try expect(gcd.isMark('\u{301}')); + try expect(gcd.isNumber('3')); + try expect(gcd.isPunctuation('[')); + try expect(gcd.isSeparator(' ')); + try expect(gcd.isSymbol('©')); +} +``` + +## Unicode Properties +You can detect common properties of a code point with the `PropsData` module. + +In your `build.zig`: + +```zig +exe.root_module.addImport("PropsData", zg.module("PropsData")); +``` + +In your code: + +```zig +const PropsData = @import("PropsData"); + +test "Properties" { + const pd = try PropsData.init(allocator); + defer pd.deinit(); + + // Mathematical symbols and letters. + try expect(pd.isMath('+')); + // Alphabetic only code points. 
+ try expect(pd.isAlphabetic('Z')); + // Space, tab, and other separators. + try expect(pd.isWhitespace(' ')); + // Hexadecimal digits and variations thereof. + try expect(pd.isHexDigit('f')); + try expect(!pd.isHexDigit('z')); + + // Accents, dieresis, and other combining marks. + try expect(pd.isDiacritic('\u{301}')); + + // Unicode has a specification for valid identifiers like + // the ones used in programming and regular expressions. + try expect(pd.isIdStart('Z')); // Identifier start character + try expect(!pd.isIdStart('1')); + try expect(pd.isIdContinue('1')); + + // The `X` versions add some code points that can appear after + // normalizing a string. + try expect(pd.isXidStart('\u{b33}')); // Extended identifier start character + try expect(pd.isXidContinue('\u{e33}')); + try expect(!pd.isXidStart('1')); + + // Note surprising Unicode numeric type properties! + try expect(pd.isNumeric('\u{277f}')); + try expect(!pd.isNumeric('3')); // 3 is not numeric! + try expect(pd.isDigit('\u{2070}')); + try expect(!pd.isDigit('3')); // 3 is not a digit! + try expect(pd.isDecimal('3')); // 3 is a decimal digit +} +``` + +## Letter Case Detection and Conversion +To detect and convert to and from different letter cases, use the `CaseData` +module. + +In your `build.zig`: + +```zig +exe.root_module.addImport("CaseData", zg.module("CaseData")); +``` + +In your code: + +```zig +const CaseData = @import("CaseData"); + +test "Case" { + const cd = try CaseData.init(allocator); + defer cd.deinit(); + + // Upper and lower case. + try expect(cd.isUpper('A')); + try expect('A' == cd.toUpper('a')); + try expect(cd.isLower('a')); + try expect('a' == cd.toLower('A')); + + // Code points that have case. + try expect(cd.isCased('É')); + try expect(!cd.isCased('3')); + + // Case detection and conversion for strings. 
+ try expect(cd.isUpperStr("HELLO 123!")); + const ucased = try cd.toUpperStr(allocator, "hello 123"); + defer allocator.free(ucased); + try expectEqualStrings("HELLO 123", ucased); + + try expect(cd.isLowerStr("hello 123!")); + const lcased = try cd.toLowerStr(allocator, "HELLO 123"); + defer allocator.free(lcased); + try expectEqualStrings("hello 123", lcased); +} +``` + +## Normalization +Unicode normalization is the process of converting a string into a uniform +representation that can guarantee a known structure by following a strict set +of rules. There are four normalization forms: + +Canonical Composition (NFC) +: The most compact representation obtained by first +decomposing to Canonical Decomposition and then composing to NFC. + +Compatibility Composition (NFKC) +: The most comprehensive composition obtained +by first decomposing to Compatibility Decomposition and then composing to NFKC. + +Canonical Decomposition (NFD) +: Only code points with canonical decompositions +are decomposed. This is a more compact and faster decomposition but will not +provide the most comprehensive normalization possible. + +Compatibility Decomposition (NFKD) +: The most comprehensive decomposition method +where both canonical and compatibility decompositions are performed recursively. + +zg has methods to produce all four normalization forms in the `Normalize` module. + +In your `build.zig`: + +```zig +exe.root_module.addImport("Normalize", zg.module("Normalize")); +``` + +In your code: + +```zig +const Normalize = @import("Normalize"); + +test "Normalization" { + // We need lots of Unicode data for normalization. + var norm_data = try Normalize.NormData.init(allocator); + defer norm_data.deinit(); + + // The `Normalize` structure takes a pointer to the data. 
+ const n = Normalize{ .norm_data = &norm_data }; + + // NFC: Canonical composition + const nfc_result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); + defer nfc_result.deinit(); + try expectEqualStrings("Complex char: \u{3D3}", nfc_result.slice); + + // NFKC: Compatibility composition + const nfkc_result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); + defer nfkc_result.deinit(); + try expectEqualStrings("Complex char: \u{038E}", nfkc_result.slice); + + // NFD: Canonical decomposition + const nfd_result = try n.nfd(allocator, "Héllo World! \u{3d3}"); + defer nfd_result.deinit(); + try expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", nfd_result.slice); + + // NFKD: Compatibility decomposition + const nfkd_result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); + defer nfkd_result.deinit(); + try expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", nfkd_result.slice); + + // Test for equality of two strings after normalizing to NFC. + try expect(try n.eql(allocator, "foé", "foe\u{0301}")); + try expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); +} +``` + +## Caseless Matching via Case Folding +Unicode provides a more efficient way of comparing strings while ignoring letter +case differences: case folding. When you case fold a string, it's converted into a +normalized case form suitable for efficient matching. Use the `CaseFold` module +for this. + +In your `build.zig`: + +```zig +exe.root_module.addImport("Normalize", zg.module("Normalize")); +exe.root_module.addImport("CaseFold", zg.module("CaseFold")); +``` + +In your code: + +```zig +const Normalize = @import("Normalize"); +const CaseFold = @import("CaseFold"); + +test "Caseless matching" { + // We need to normalize during the matching process. + var norm_data = try Normalize.NormData.init(allocator); + defer norm_data.deinit(); + const n = Normalize{ .norm_data = &norm_data }; + + // We need Unicode case fold data. 
+ const cfd = try CaseFold.FoldData.init(allocator); + defer cfd.deinit(); + + // The `CaseFold` structure takes a pointer to the data. + const cf = CaseFold{ .fold_data = &cfd }; + + // `compatCaselessMatch` provides the deepest level of caseless + // matching because it decomposes fully to NFKD. + const a = "Héllo World! \u{3d3}"; + const b = "He\u{301}llo World! \u{3a5}\u{301}"; + try expect(try cf.compatCaselessMatch(allocator, &n, a, b)); + + const c = "He\u{301}llo World! \u{3d2}\u{301}"; + try expect(try cf.compatCaselessMatch(allocator, &n, a, c)); + + // `canonCaselessMatch` isn't as comprehensive as `compatCaselessMatch` + // because it only decomposes to NFD. Naturally, it's faster because of this. + try expect(!try cf.canonCaselessMatch(allocator, &n, a, b)); + try expect(try cf.canonCaselessMatch(allocator, &n, a, c)); +} +``` + +## Display Width of Characters and Strings +When displaying text with a fixed-width font on a terminal screen, it's very +important to know exactly how many columns or cells each character should take. +Most characters will use one column, but there are many, like emoji and East +Asian ideographs, that need more space. The `DisplayWidth` module provides +methods for this purpose. It also has methods that use the display width calculation +to `center`, `padLeft`, `padRight`, and `wrap` text. + +In your `build.zig`: + +```zig +exe.root_module.addImport("DisplayWidth", zg.module("DisplayWidth")); +``` + +In your code: + +```zig +const DisplayWidth = @import("DisplayWidth"); + +test "Display width" { + // We need Unicode data for display width calculation. + const dwd = try DisplayWidth.DisplayWidthData.init(allocator); + defer dwd.deinit(); + + // The `DisplayWidth` structure takes a pointer to the data. 
+ const dw = DisplayWidth{ .data = &dwd }; + + // String display width + try expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n")); + try expectEqual(@as(usize, 8), dw.strWidth("Hello 😊")); + try expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊")); + try expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); + try expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나")); + + // Centering text + const centered = try dw.center(allocator, "w😊w", 10, "-"); + defer allocator.free(centered); + try expectEqualStrings("---w😊w---", centered); + + // Pad left + const right_aligned = try dw.padLeft(allocator, "abc", 9, "*"); + defer allocator.free(right_aligned); + try expectEqualStrings("******abc", right_aligned); + + // Pad right + const left_aligned = try dw.padRight(allocator, "abc", 9, "*"); + defer allocator.free(left_aligned); + try expectEqualStrings("abc******", left_aligned); + + // Wrap text + const input = "The quick brown fox\r\njumped over the lazy dog!"; + const wrapped = try dw.wrap(allocator, input, 10, 3); + defer allocator.free(wrapped); + const want = + \\The quick + \\brown fox + \\jumped + \\over the + \\lazy dog! + ; + try expectEqualStrings(want, wrapped); +} +``` + +## Scripts +Unicode categorizes code points by the Script in which they belong. A Script +collects letters and other symbols that belong to a particular writing system. +You can detect the Script for a code point with the `ScriptsData` module. + +In your `build.zig`: + +```zig +exe.root_module.addImport("ScriptsData", zg.module("ScriptsData")); +``` + +In your code: + +```zig +const ScriptsData = @import("ScriptsData"); + +test "Scripts" { + const sd = try ScriptsData.init(allocator); + defer sd.deinit(); + + // To see the full list of Scripts, look at the + // `src/ScriptsData.zig` file. They are list in an enum. 
+ try expect(sd.script('A') == .Latin); + try expect(sd.script('Ω') == .Greek); + try expect(sd.script('צ') == .Hebrew); +} +``` + +## Relation to Ziglyph +zg is a total re-write of some of the components of Ziglyph. The idea was to +reduce binary size and improve performance. These goals were achieved by using +trie-like data structures instead of generated functions. Where Ziglyph uses a +function call, zg uses an array lookup, which is considerably faster. In addition, all +these data structures in zg are loaded at runtime from compressed versions in the +binary. This allows for smaller binary sizes at the expense of increased memory +footprint at runtime. + +Benchmarks demonstrate the above stated goals have been met: + +```plain +Binary sizes ======= + +149K ziglyph_case +87K zg_case + +275K ziglyph_caseless +168K zg_caseless + +68K ziglyph_codepoint +68K zg_codepoint + +101K ziglyph_grapheme +86K zg_grapheme + +185K ziglyph_normalizer +152K zg_normalize + +101K ziglyph_width +86K zg_width + +Benchmarks ========== + +Ziglyph toUpperStr/toLowerStr: result: 7911596, took: 80 +Ziglyph isUpperStr/isLowerStr: result: 110959, took: 17 +zg toUpperStr/toLowerStr: result: 7911596, took: 62 +zg isUpperStr/isLowerStr: result: 110959, took: 7 + +Ziglyph Normalizer.eqlCaseless: result: 625, took: 500 +zg CaseFold.canonCaselessMatch: result: 625, took: 385 +zg CaseFold.compatCaselessMatch: result: 625, took: 593 + +Ziglyph CodePointIterator: result: 3769314, took: 2 +zg CodePointIterator: result: 3769314, took: 3 + +Ziglyph GraphemeIterator: result: 3691806, took: 48 +zg GraphemeIterator: result: 3691806, took: 16 + +Ziglyph Normalizer.nfkc: result: 3934162, took: 416 +zg Normalize.nfkc: result: 3934162, took: 182 + +Ziglyph Normalizer.nfc: result: 3955798, took: 57 +zg Normalize.nfc: result: 3955798, took: 28 + +Ziglyph Normalizer.nfkd: result: 4006398, took: 172 +zg Normalize.nfkd: result: 4006398, took: 104 + +Ziglyph Normalizer.nfd: result: 4028034, took: 169 +zg 
Normalize.nfd: result: 4028034, took: 104 + +Ziglyph Normalizer.eql: result: 625, took: 337 +Zg Normalize.eql: result: 625, took: 53 + +Ziglyph display_width.strWidth: result: 3700914, took: 71 +zg DisplayWidth.strWidth: result: 3700914, took: 24 +``` + +These results were obtained on an M1 Mac with 16 GiB of RAM. + +In contrast to Ziglyph, zg does not have: + +- Word segmentation +- Sentence segmentation +- Collation + +It's possible that any missing functionality will be added in future versions, +but only if enough demand is present in the community. + diff --git a/build.zig b/build.zig index c05b4a1..9f7f518 100644 --- a/build.zig +++ b/build.zig @@ -315,7 +315,7 @@ pub fn build(b: *std.Build) void { scripts_data.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); // Properties - const props_data = b.addModule("ScriptsData", .{ + const props_data = b.addModule("PropsData", .{ .root_source_file = .{ .path = "src/PropsData.zig" }, .target = target, .optimize = optimize, diff --git a/codegen/canon.zig b/codegen/canon.zig index 9c84bfc..28b7f28 100644 --- a/codegen/canon.zig +++ b/codegen/canon.zig @@ -17,11 +17,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/case_prop.zig b/codegen/case_prop.zig index ce7ee0d..6c912a8 100644 --- a/codegen/case_prop.zig +++ b/codegen/case_prop.zig @@ -118,11 +118,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = 
args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/ccc.zig b/codegen/ccc.zig index fd278ea..a01c8d2 100644 --- a/codegen/ccc.zig +++ b/codegen/ccc.zig @@ -107,11 +107,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/compat.zig b/codegen/compat.zig index d0a108a..07616fc 100644 --- a/codegen/compat.zig +++ b/codegen/compat.zig @@ -17,11 +17,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = 
builtin.cpu.arch.endian(); diff --git a/codegen/core_props.zig b/codegen/core_props.zig index 1f46f9e..f60c7a9 100644 --- a/codegen/core_props.zig +++ b/codegen/core_props.zig @@ -121,11 +121,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/dwp.zig b/codegen/dwp.zig index 76a14d3..b36b2c9 100644 --- a/codegen/dwp.zig +++ b/codegen/dwp.zig @@ -230,11 +230,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/fold.zig b/codegen/fold.zig index b3192e7..6dc51ac 100644 --- a/codegen/fold.zig +++ b/codegen/fold.zig @@ -63,11 +63,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); 
- var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/gbp.zig b/codegen/gbp.zig index 39e0da3..3fc4461 100644 --- a/codegen/gbp.zig +++ b/codegen/gbp.zig @@ -227,11 +227,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/gencat.zig b/codegen/gencat.zig index a7713e6..fe06bd7 100644 --- a/codegen/gencat.zig +++ b/codegen/gencat.zig @@ -151,11 +151,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/hangul.zig b/codegen/hangul.zig index 73680c6..2c42bb7 100644 --- a/codegen/hangul.zig +++ b/codegen/hangul.zig @@ -116,11 +116,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = 
args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/lower.zig b/codegen/lower.zig index 644ec13..a053fe3 100644 --- a/codegen/lower.zig +++ b/codegen/lower.zig @@ -17,11 +17,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/normp.zig b/codegen/normp.zig index 8ceda36..60dabdc 100644 --- a/codegen/normp.zig +++ b/codegen/normp.zig @@ -117,11 +117,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = 
builtin.cpu.arch.endian(); diff --git a/codegen/numeric.zig b/codegen/numeric.zig index ad8490c..038ac0a 100644 --- a/codegen/numeric.zig +++ b/codegen/numeric.zig @@ -118,11 +118,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/props.zig b/codegen/props.zig index 57a205e..24b22e0 100644 --- a/codegen/props.zig +++ b/codegen/props.zig @@ -118,11 +118,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/scripts.zig b/codegen/scripts.zig index e985c1e..660699d 100644 --- a/codegen/scripts.zig +++ b/codegen/scripts.zig @@ -288,11 +288,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer 
out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/codegen/upper.zig b/codegen/upper.zig index 455fe2c..5848911 100644 --- a/codegen/upper.zig +++ b/codegen/upper.zig @@ -17,11 +17,10 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - const compressor = std.compress.deflate.compressor; + const compressor = std.compress.flate.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); + var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best }); const writer = out_comp.writer(); const endian = builtin.cpu.arch.endian(); diff --git a/src/CanonData.zig b/src/CanonData.zig index 64d5555..be2b381 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -10,11 +10,10 @@ nfd: [][]u21 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("canon"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/CaseData.zig b/src/CaseData.zig index c9ccc1e..260637a 100644 --- a/src/CaseData.zig +++ b/src/CaseData.zig @@ -15,7 +15,7 @@ prop_s2: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = 
compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const endian = builtin.cpu.arch.endian(); var self = Self{ @@ -32,8 +32,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Uppercase const upper_bytes = @embedFile("upper"); var upper_fbs = std.io.fixedBufferStream(upper_bytes); - var upper_decomp = try decompressor(allocator, upper_fbs.reader(), null); - defer upper_decomp.deinit(); + var upper_decomp = decompressor(.raw, upper_fbs.reader()); var upper_reader = upper_decomp.reader(); while (true) { @@ -46,8 +45,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Lowercase const lower_bytes = @embedFile("lower"); var lower_fbs = std.io.fixedBufferStream(lower_bytes); - var lower_decomp = try decompressor(allocator, lower_fbs.reader(), null); - defer lower_decomp.deinit(); + var lower_decomp = decompressor(.raw, lower_fbs.reader()); var lower_reader = lower_decomp.reader(); while (true) { @@ -60,8 +58,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Case properties const cp_bytes = @embedFile("case_prop"); var cp_fbs = std.io.fixedBufferStream(cp_bytes); - var cp_decomp = try decompressor(allocator, cp_fbs.reader(), null); - defer cp_decomp.deinit(); + var cp_decomp = decompressor(.raw, cp_fbs.reader()); var cp_reader = cp_decomp.reader(); const stage_1_len: u16 = try cp_reader.readInt(u16, endian); diff --git a/src/CaseFold.zig b/src/CaseFold.zig index 9b10e16..3e7535e 100644 --- a/src/CaseFold.zig +++ b/src/CaseFold.zig @@ -10,7 +10,9 @@ fold_data: *const FoldData, const Self = @This(); -fn caseFold( +/// Produces the case folded code points for `cps`. Caller must free returned +/// slice with `allocator`. +pub fn caseFold( self: Self, allocator: mem.Allocator, cps: []const u21, @@ -37,6 +39,8 @@ fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { } else false; } +/// Caseless compare `a` and `b` by decomposing to NFKD. 
This is the most +/// comprehensive comparison possible, but slower than `canonCaselessMatch`. pub fn compatCaselessMatch( self: Self, allocator: mem.Allocator, @@ -108,6 +112,8 @@ test "compatCaselessMatch" { try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); } +/// Performs canonical caseless string matching by decomposing to NFD. This is +/// faster than `compatCaselessMatch`, but less comprehensive. pub fn canonCaselessMatch( self: Self, allocator: mem.Allocator, diff --git a/src/CombiningData.zig b/src/CombiningData.zig index a40cbde..16b923f 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig @@ -10,11 +10,10 @@ s2: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("ccc"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/CompatData.zig b/src/CompatData.zig index a931cb3..3346a06 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig @@ -9,11 +9,10 @@ nfkd: [][]u21 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("compat"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/FoldData.zig b/src/FoldData.zig index a06eefe..d4312b0 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig @@ -10,11 +10,10 @@ 
cwcf: []bool = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("fold"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/GenCatData.zig b/src/GenCatData.zig index 12501bf..454c45a 100644 --- a/src/GenCatData.zig +++ b/src/GenCatData.zig @@ -45,11 +45,10 @@ s3: []u5 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("gencat"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig index 500ffea..1710870 100644 --- a/src/GraphemeData.zig +++ b/src/GraphemeData.zig @@ -38,11 +38,10 @@ s3: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("gbp"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/HangulData.zig b/src/HangulData.zig index 99d91c1..5eee427 100644 --- a/src/HangulData.zig +++ 
b/src/HangulData.zig @@ -20,11 +20,10 @@ s2: []u3 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("hangul"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index 86d497b..899bb8f 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig @@ -11,11 +11,10 @@ s2: []u4 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("normp"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/Normalize.zig b/src/Normalize.zig index f437f4f..85e3aa3 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -572,47 +572,6 @@ test "eql" { try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); } -// FCD -fn getLeadCcc(self: Self, cp: u21) u8 { - const dc = self.mapping(cp, .nfd); - const dcp = if (dc.form == .same) cp else dc.cps[0]; - return self.norm_data.ccc_data.ccc(dcp); -} - -fn getTrailCcc(self: Self, cp: u21) u8 { - const dc = self.mapping(cp, .nfd); - const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1]; - return self.norm_data.ccc_data.ccc(dcp); -} - -// Fast check to detect if a string is already in NFC or NFD form. 
-fn isFcd(self: Self, str: []const u8) bool { - var prev_ccc: u8 = 0; - var cp_iter = CodePointIterator{ .bytes = str }; - - return while (cp_iter.next()) |cp| { - const ccc = self.getLeadCcc(cp.code); - if (ccc != 0 and ccc < prev_ccc) break false; - prev_ccc = self.getTrailCcc(cp.code); - } else true; -} - -test "isFcd" { - const allocator = testing.allocator; - const data = try NormData.init(allocator); - defer data.deinit(); - const n = Self{ .norm_data = &data }; - - const is_nfc = "José \u{3D3}"; - try testing.expect(n.isFcd(is_nfc)); - - const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; - try testing.expect(n.isFcd(is_nfd)); - - const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; - try testing.expect(!n.isFcd(not_fcd)); -} - /// Returns true if `str` only contains Latin-1 Supplement /// code points. Uses SIMD if possible. pub fn isLatin1Only(str: []const u8) bool { diff --git a/src/PropsData.zig b/src/PropsData.zig index 9d24e68..f6c8370 100644 --- a/src/PropsData.zig +++ b/src/PropsData.zig @@ -15,14 +15,13 @@ num_s2: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const endian = builtin.cpu.arch.endian(); // Process DerivedCoreProperties.txt const core_bytes = @embedFile("core_props"); var core_fbs = std.io.fixedBufferStream(core_bytes); - var core_decomp = try decompressor(allocator, core_fbs.reader(), null); - defer core_decomp.deinit(); + var core_decomp = decompressor(.raw, core_fbs.reader()); var core_reader = core_decomp.reader(); var self = Self{ .allocator = allocator }; @@ -40,8 +39,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Process PropList.txt const props_bytes = @embedFile("props"); var props_fbs = std.io.fixedBufferStream(props_bytes); - var props_decomp = try decompressor(allocator, props_fbs.reader(), null); - defer props_decomp.deinit(); + var props_decomp = decompressor(.raw, 
props_fbs.reader()); var props_reader = props_decomp.reader(); const stage_1_len: u16 = try props_reader.readInt(u16, endian); @@ -57,8 +55,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Process DerivedNumericType.txt const num_bytes = @embedFile("numeric"); var num_fbs = std.io.fixedBufferStream(num_bytes); - var num_decomp = try decompressor(allocator, num_fbs.reader(), null); - defer num_decomp.deinit(); + var num_decomp = decompressor(.raw, num_fbs.reader()); var num_reader = num_decomp.reader(); const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig index 4e371bf..415ce2d 100644 --- a/src/ScriptsData.zig +++ b/src/ScriptsData.zig @@ -4,7 +4,7 @@ const compress = std.compress; const mem = std.mem; const testing = std.testing; -/// Script +/// Scripts pub const Script = enum { none, Adlam, @@ -180,11 +180,10 @@ s3: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("scripts"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/WidthData.zig b/src/WidthData.zig index b9ef84e..cf31b7f 100644 --- a/src/WidthData.zig +++ b/src/WidthData.zig @@ -14,11 +14,10 @@ s2: []i3 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("dwp"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, 
in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); -- cgit v1.2.3