From 200c617c865a5952f0bd12378802cc06ea3eb1c2 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Sun, 31 Mar 2024 09:59:51 -0400 Subject: Updated README --- src/CanonData.zig | 5 ++--- src/CaseData.zig | 11 ++++------- src/CaseFold.zig | 8 +++++++- src/CombiningData.zig | 5 ++--- src/CompatData.zig | 5 ++--- src/FoldData.zig | 5 ++--- src/GenCatData.zig | 5 ++--- src/GraphemeData.zig | 5 ++--- src/HangulData.zig | 5 ++--- src/NormPropsData.zig | 5 ++--- src/Normalize.zig | 41 ----------------------------------------- src/PropsData.zig | 11 ++++------- src/ScriptsData.zig | 7 +++---- src/WidthData.zig | 5 ++--- 14 files changed, 36 insertions(+), 87 deletions(-) (limited to 'src') diff --git a/src/CanonData.zig b/src/CanonData.zig index 64d5555..be2b381 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -10,11 +10,10 @@ nfd: [][]u21 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("canon"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/CaseData.zig b/src/CaseData.zig index c9ccc1e..260637a 100644 --- a/src/CaseData.zig +++ b/src/CaseData.zig @@ -15,7 +15,7 @@ prop_s2: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const endian = builtin.cpu.arch.endian(); var self = Self{ @@ -32,8 +32,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Uppercase const upper_bytes = @embedFile("upper"); var upper_fbs = std.io.fixedBufferStream(upper_bytes); - var upper_decomp = try decompressor(allocator, upper_fbs.reader(), null); - defer upper_decomp.deinit(); + var upper_decomp = decompressor(.raw, upper_fbs.reader()); var upper_reader = upper_decomp.reader(); while (true) { @@ -46,8 +45,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Lowercase const lower_bytes = @embedFile("lower"); var lower_fbs = std.io.fixedBufferStream(lower_bytes); - var lower_decomp = try decompressor(allocator, lower_fbs.reader(), null); - defer lower_decomp.deinit(); + var lower_decomp = decompressor(.raw, lower_fbs.reader()); var lower_reader = lower_decomp.reader(); while (true) { @@ -60,8 +58,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Case properties const cp_bytes = @embedFile("case_prop"); var cp_fbs = std.io.fixedBufferStream(cp_bytes); - var cp_decomp = try decompressor(allocator, cp_fbs.reader(), null); - defer cp_decomp.deinit(); + var cp_decomp = decompressor(.raw, cp_fbs.reader()); var cp_reader = cp_decomp.reader(); const stage_1_len: u16 = try cp_reader.readInt(u16, endian); diff --git a/src/CaseFold.zig b/src/CaseFold.zig index 9b10e16..3e7535e 100644 --- a/src/CaseFold.zig +++ b/src/CaseFold.zig @@ -10,7 +10,9 @@ fold_data: *const FoldData, const Self = @This(); -fn caseFold( +/// Produces the case folded code points for `cps`. Caller must free returned +/// slice with `allocator`. +pub fn caseFold( self: Self, allocator: mem.Allocator, cps: []const u21, @@ -37,6 +39,8 @@ fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { } else false; } +/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most +/// comprehensive comparison possible, but slower than `canonCaselessMatch`. pub fn compatCaselessMatch( self: Self, allocator: mem.Allocator, @@ -108,6 +112,8 @@ test "compatCaselessMatch" { try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); } +/// Performs canonical caseless string matching by decomposing to NFD. This is +/// faster than `compatCaselessMatch`, but less comprehensive. pub fn canonCaselessMatch( self: Self, allocator: mem.Allocator, diff --git a/src/CombiningData.zig b/src/CombiningData.zig index a40cbde..16b923f 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig @@ -10,11 +10,10 @@ s2: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("ccc"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/CompatData.zig b/src/CompatData.zig index a931cb3..3346a06 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig @@ -9,11 +9,10 @@ nfkd: [][]u21 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("compat"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/FoldData.zig b/src/FoldData.zig index a06eefe..d4312b0 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig @@ -10,11 +10,10 @@ cwcf: []bool = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("fold"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/GenCatData.zig b/src/GenCatData.zig index 12501bf..454c45a 100644 --- a/src/GenCatData.zig +++ b/src/GenCatData.zig @@ -45,11 +45,10 @@ s3: []u5 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("gencat"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig index 500ffea..1710870 100644 --- a/src/GraphemeData.zig +++ b/src/GraphemeData.zig @@ -38,11 +38,10 @@ s3: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("gbp"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/HangulData.zig b/src/HangulData.zig index 99d91c1..5eee427 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig @@ -20,11 +20,10 @@ s2: []u3 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("hangul"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index 86d497b..899bb8f 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig @@ -11,11 +11,10 @@ s2: []u4 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("normp"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/Normalize.zig b/src/Normalize.zig index f437f4f..85e3aa3 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -572,47 +572,6 @@ test "eql" { try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); } -// FCD -fn getLeadCcc(self: Self, cp: u21) u8 { - const dc = self.mapping(cp, .nfd); - const dcp = if (dc.form == .same) cp else dc.cps[0]; - return self.norm_data.ccc_data.ccc(dcp); -} - -fn getTrailCcc(self: Self, cp: u21) u8 { - const dc = self.mapping(cp, .nfd); - const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1]; - return self.norm_data.ccc_data.ccc(dcp); -} - -// Fast check to detect if a string is already in NFC or NFD form. -fn isFcd(self: Self, str: []const u8) bool { - var prev_ccc: u8 = 0; - var cp_iter = CodePointIterator{ .bytes = str }; - - return while (cp_iter.next()) |cp| { - const ccc = self.getLeadCcc(cp.code); - if (ccc != 0 and ccc < prev_ccc) break false; - prev_ccc = self.getTrailCcc(cp.code); - } else true; -} - -test "isFcd" { - const allocator = testing.allocator; - const data = try NormData.init(allocator); - defer data.deinit(); - const n = Self{ .norm_data = &data }; - - const is_nfc = "José \u{3D3}"; - try testing.expect(n.isFcd(is_nfc)); - - const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; - try testing.expect(n.isFcd(is_nfd)); - - const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; - try testing.expect(!n.isFcd(not_fcd)); -} - /// Returns true if `str` only contains Latin-1 Supplement /// code points. Uses SIMD if possible. pub fn isLatin1Only(str: []const u8) bool { diff --git a/src/PropsData.zig b/src/PropsData.zig index 9d24e68..f6c8370 100644 --- a/src/PropsData.zig +++ b/src/PropsData.zig @@ -15,14 +15,13 @@ num_s2: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const endian = builtin.cpu.arch.endian(); // Process DerivedCoreProperties.txt const core_bytes = @embedFile("core_props"); var core_fbs = std.io.fixedBufferStream(core_bytes); - var core_decomp = try decompressor(allocator, core_fbs.reader(), null); - defer core_decomp.deinit(); + var core_decomp = decompressor(.raw, core_fbs.reader()); var core_reader = core_decomp.reader(); var self = Self{ .allocator = allocator }; @@ -40,8 +39,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Process PropList.txt const props_bytes = @embedFile("props"); var props_fbs = std.io.fixedBufferStream(props_bytes); - var props_decomp = try decompressor(allocator, props_fbs.reader(), null); - defer props_decomp.deinit(); + var props_decomp = decompressor(.raw, props_fbs.reader()); var props_reader = props_decomp.reader(); const stage_1_len: u16 = try props_reader.readInt(u16, endian); @@ -57,8 +55,7 @@ pub fn init(allocator: mem.Allocator) !Self { // Process DerivedNumericType.txt const num_bytes = @embedFile("numeric"); var num_fbs = std.io.fixedBufferStream(num_bytes); - var num_decomp = try decompressor(allocator, num_fbs.reader(), null); - defer num_decomp.deinit(); + var num_decomp = decompressor(.raw, num_fbs.reader()); var num_reader = num_decomp.reader(); const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig index 4e371bf..415ce2d 100644 --- a/src/ScriptsData.zig +++ b/src/ScriptsData.zig @@ -4,7 +4,7 @@ const compress = std.compress; const mem = std.mem; const testing = std.testing; -/// Script +/// Scripts pub const Script = enum { none, Adlam, @@ -180,11 +180,10 @@ s3: []u8 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("scripts"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); diff --git a/src/WidthData.zig b/src/WidthData.zig index b9ef84e..cf31b7f 100644 --- a/src/WidthData.zig +++ b/src/WidthData.zig @@ -14,11 +14,10 @@ s2: []i3 = undefined, const Self = @This(); pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.deflate.decompressor; + const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("dwp"); var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = try decompressor(allocator, in_fbs.reader(), null); - defer in_decomp.deinit(); + var in_decomp = decompressor(.raw, in_fbs.reader()); var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); -- cgit v1.2.3