summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-31 09:59:51 -0400
committerGravatar Jose Colon Rodriguez2024-03-31 09:59:51 -0400
commit200c617c865a5952f0bd12378802cc06ea3eb1c2 (patch)
tree2af456d4c62a08330cf961e7237f083fc4566370 /src
parentSplit out Unicode tests to separate file (diff)
downloadzg-200c617c865a5952f0bd12378802cc06ea3eb1c2.tar.gz
zg-200c617c865a5952f0bd12378802cc06ea3eb1c2.tar.xz
zg-200c617c865a5952f0bd12378802cc06ea3eb1c2.zip
Updated README
Diffstat (limited to 'src')
-rw-r--r--src/CanonData.zig5
-rw-r--r--src/CaseData.zig11
-rw-r--r--src/CaseFold.zig8
-rw-r--r--src/CombiningData.zig5
-rw-r--r--src/CompatData.zig5
-rw-r--r--src/FoldData.zig5
-rw-r--r--src/GenCatData.zig5
-rw-r--r--src/GraphemeData.zig5
-rw-r--r--src/HangulData.zig5
-rw-r--r--src/NormPropsData.zig5
-rw-r--r--src/Normalize.zig41
-rw-r--r--src/PropsData.zig11
-rw-r--r--src/ScriptsData.zig7
-rw-r--r--src/WidthData.zig5
14 files changed, 36 insertions, 87 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig
index 64d5555..be2b381 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -10,11 +10,10 @@ nfd: [][]u21 = undefined,
10const Self = @This(); 10const Self = @This();
11 11
12pub fn init(allocator: mem.Allocator) !Self { 12pub fn init(allocator: mem.Allocator) !Self {
13 const decompressor = compress.deflate.decompressor; 13 const decompressor = compress.flate.inflate.decompressor;
14 const in_bytes = @embedFile("canon"); 14 const in_bytes = @embedFile("canon");
15 var in_fbs = std.io.fixedBufferStream(in_bytes); 15 var in_fbs = std.io.fixedBufferStream(in_bytes);
16 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 16 var in_decomp = decompressor(.raw, in_fbs.reader());
17 defer in_decomp.deinit();
18 var reader = in_decomp.reader(); 17 var reader = in_decomp.reader();
19 18
20 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
diff --git a/src/CaseData.zig b/src/CaseData.zig
index c9ccc1e..260637a 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
@@ -15,7 +15,7 @@ prop_s2: []u8 = undefined,
15const Self = @This(); 15const Self = @This();
16 16
17pub fn init(allocator: mem.Allocator) !Self { 17pub fn init(allocator: mem.Allocator) !Self {
18 const decompressor = compress.deflate.decompressor; 18 const decompressor = compress.flate.inflate.decompressor;
19 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
20 20
21 var self = Self{ 21 var self = Self{
@@ -32,8 +32,7 @@ pub fn init(allocator: mem.Allocator) !Self {
32 // Uppercase 32 // Uppercase
33 const upper_bytes = @embedFile("upper"); 33 const upper_bytes = @embedFile("upper");
34 var upper_fbs = std.io.fixedBufferStream(upper_bytes); 34 var upper_fbs = std.io.fixedBufferStream(upper_bytes);
35 var upper_decomp = try decompressor(allocator, upper_fbs.reader(), null); 35 var upper_decomp = decompressor(.raw, upper_fbs.reader());
36 defer upper_decomp.deinit();
37 var upper_reader = upper_decomp.reader(); 36 var upper_reader = upper_decomp.reader();
38 37
39 while (true) { 38 while (true) {
@@ -46,8 +45,7 @@ pub fn init(allocator: mem.Allocator) !Self {
46 // Lowercase 45 // Lowercase
47 const lower_bytes = @embedFile("lower"); 46 const lower_bytes = @embedFile("lower");
48 var lower_fbs = std.io.fixedBufferStream(lower_bytes); 47 var lower_fbs = std.io.fixedBufferStream(lower_bytes);
49 var lower_decomp = try decompressor(allocator, lower_fbs.reader(), null); 48 var lower_decomp = decompressor(.raw, lower_fbs.reader());
50 defer lower_decomp.deinit();
51 var lower_reader = lower_decomp.reader(); 49 var lower_reader = lower_decomp.reader();
52 50
53 while (true) { 51 while (true) {
@@ -60,8 +58,7 @@ pub fn init(allocator: mem.Allocator) !Self {
60 // Case properties 58 // Case properties
61 const cp_bytes = @embedFile("case_prop"); 59 const cp_bytes = @embedFile("case_prop");
62 var cp_fbs = std.io.fixedBufferStream(cp_bytes); 60 var cp_fbs = std.io.fixedBufferStream(cp_bytes);
63 var cp_decomp = try decompressor(allocator, cp_fbs.reader(), null); 61 var cp_decomp = decompressor(.raw, cp_fbs.reader());
64 defer cp_decomp.deinit();
65 var cp_reader = cp_decomp.reader(); 62 var cp_reader = cp_decomp.reader();
66 63
67 const stage_1_len: u16 = try cp_reader.readInt(u16, endian); 64 const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index 9b10e16..3e7535e 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -10,7 +10,9 @@ fold_data: *const FoldData,
10 10
11const Self = @This(); 11const Self = @This();
12 12
13fn caseFold( 13/// Produces the case folded code points for `cps`. Caller must free returned
14/// slice with `allocator`.
15pub fn caseFold(
14 self: Self, 16 self: Self,
15 allocator: mem.Allocator, 17 allocator: mem.Allocator,
16 cps: []const u21, 18 cps: []const u21,
@@ -37,6 +39,8 @@ fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
37 } else false; 39 } else false;
38} 40}
39 41
42/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
43/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
40pub fn compatCaselessMatch( 44pub fn compatCaselessMatch(
41 self: Self, 45 self: Self,
42 allocator: mem.Allocator, 46 allocator: mem.Allocator,
@@ -108,6 +112,8 @@ test "compatCaselessMatch" {
108 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c)); 112 try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c));
109} 113}
110 114
115/// Performs canonical caseless string matching by decomposing to NFD. This is
116/// faster than `compatCaselessMatch`, but less comprehensive.
111pub fn canonCaselessMatch( 117pub fn canonCaselessMatch(
112 self: Self, 118 self: Self,
113 allocator: mem.Allocator, 119 allocator: mem.Allocator,
diff --git a/src/CombiningData.zig b/src/CombiningData.zig
index a40cbde..16b923f 100644
--- a/src/CombiningData.zig
+++ b/src/CombiningData.zig
@@ -10,11 +10,10 @@ s2: []u8 = undefined,
10const Self = @This(); 10const Self = @This();
11 11
12pub fn init(allocator: mem.Allocator) !Self { 12pub fn init(allocator: mem.Allocator) !Self {
13 const decompressor = compress.deflate.decompressor; 13 const decompressor = compress.flate.inflate.decompressor;
14 const in_bytes = @embedFile("ccc"); 14 const in_bytes = @embedFile("ccc");
15 var in_fbs = std.io.fixedBufferStream(in_bytes); 15 var in_fbs = std.io.fixedBufferStream(in_bytes);
16 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 16 var in_decomp = decompressor(.raw, in_fbs.reader());
17 defer in_decomp.deinit();
18 var reader = in_decomp.reader(); 17 var reader = in_decomp.reader();
19 18
20 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
diff --git a/src/CompatData.zig b/src/CompatData.zig
index a931cb3..3346a06 100644
--- a/src/CompatData.zig
+++ b/src/CompatData.zig
@@ -9,11 +9,10 @@ nfkd: [][]u21 = undefined,
9const Self = @This(); 9const Self = @This();
10 10
11pub fn init(allocator: mem.Allocator) !Self { 11pub fn init(allocator: mem.Allocator) !Self {
12 const decompressor = compress.deflate.decompressor; 12 const decompressor = compress.flate.inflate.decompressor;
13 const in_bytes = @embedFile("compat"); 13 const in_bytes = @embedFile("compat");
14 var in_fbs = std.io.fixedBufferStream(in_bytes); 14 var in_fbs = std.io.fixedBufferStream(in_bytes);
15 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 15 var in_decomp = decompressor(.raw, in_fbs.reader());
16 defer in_decomp.deinit();
17 var reader = in_decomp.reader(); 16 var reader = in_decomp.reader();
18 17
19 const endian = builtin.cpu.arch.endian(); 18 const endian = builtin.cpu.arch.endian();
diff --git a/src/FoldData.zig b/src/FoldData.zig
index a06eefe..d4312b0 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -10,11 +10,10 @@ cwcf: []bool = undefined,
10const Self = @This(); 10const Self = @This();
11 11
12pub fn init(allocator: mem.Allocator) !Self { 12pub fn init(allocator: mem.Allocator) !Self {
13 const decompressor = compress.deflate.decompressor; 13 const decompressor = compress.flate.inflate.decompressor;
14 const in_bytes = @embedFile("fold"); 14 const in_bytes = @embedFile("fold");
15 var in_fbs = std.io.fixedBufferStream(in_bytes); 15 var in_fbs = std.io.fixedBufferStream(in_bytes);
16 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 16 var in_decomp = decompressor(.raw, in_fbs.reader());
17 defer in_decomp.deinit();
18 var reader = in_decomp.reader(); 17 var reader = in_decomp.reader();
19 18
20 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
diff --git a/src/GenCatData.zig b/src/GenCatData.zig
index 12501bf..454c45a 100644
--- a/src/GenCatData.zig
+++ b/src/GenCatData.zig
@@ -45,11 +45,10 @@ s3: []u5 = undefined,
45const Self = @This(); 45const Self = @This();
46 46
47pub fn init(allocator: mem.Allocator) !Self { 47pub fn init(allocator: mem.Allocator) !Self {
48 const decompressor = compress.deflate.decompressor; 48 const decompressor = compress.flate.inflate.decompressor;
49 const in_bytes = @embedFile("gencat"); 49 const in_bytes = @embedFile("gencat");
50 var in_fbs = std.io.fixedBufferStream(in_bytes); 50 var in_fbs = std.io.fixedBufferStream(in_bytes);
51 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 51 var in_decomp = decompressor(.raw, in_fbs.reader());
52 defer in_decomp.deinit();
53 var reader = in_decomp.reader(); 52 var reader = in_decomp.reader();
54 53
55 const endian = builtin.cpu.arch.endian(); 54 const endian = builtin.cpu.arch.endian();
diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig
index 500ffea..1710870 100644
--- a/src/GraphemeData.zig
+++ b/src/GraphemeData.zig
@@ -38,11 +38,10 @@ s3: []u8 = undefined,
38const Self = @This(); 38const Self = @This();
39 39
40pub fn init(allocator: mem.Allocator) !Self { 40pub fn init(allocator: mem.Allocator) !Self {
41 const decompressor = compress.deflate.decompressor; 41 const decompressor = compress.flate.inflate.decompressor;
42 const in_bytes = @embedFile("gbp"); 42 const in_bytes = @embedFile("gbp");
43 var in_fbs = std.io.fixedBufferStream(in_bytes); 43 var in_fbs = std.io.fixedBufferStream(in_bytes);
44 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 44 var in_decomp = decompressor(.raw, in_fbs.reader());
45 defer in_decomp.deinit();
46 var reader = in_decomp.reader(); 45 var reader = in_decomp.reader();
47 46
48 const endian = builtin.cpu.arch.endian(); 47 const endian = builtin.cpu.arch.endian();
diff --git a/src/HangulData.zig b/src/HangulData.zig
index 99d91c1..5eee427 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -20,11 +20,10 @@ s2: []u3 = undefined,
20const Self = @This(); 20const Self = @This();
21 21
22pub fn init(allocator: mem.Allocator) !Self { 22pub fn init(allocator: mem.Allocator) !Self {
23 const decompressor = compress.deflate.decompressor; 23 const decompressor = compress.flate.inflate.decompressor;
24 const in_bytes = @embedFile("hangul"); 24 const in_bytes = @embedFile("hangul");
25 var in_fbs = std.io.fixedBufferStream(in_bytes); 25 var in_fbs = std.io.fixedBufferStream(in_bytes);
26 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 26 var in_decomp = decompressor(.raw, in_fbs.reader());
27 defer in_decomp.deinit();
28 var reader = in_decomp.reader(); 27 var reader = in_decomp.reader();
29 28
30 const endian = builtin.cpu.arch.endian(); 29 const endian = builtin.cpu.arch.endian();
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig
index 86d497b..899bb8f 100644
--- a/src/NormPropsData.zig
+++ b/src/NormPropsData.zig
@@ -11,11 +11,10 @@ s2: []u4 = undefined,
11const Self = @This(); 11const Self = @This();
12 12
13pub fn init(allocator: mem.Allocator) !Self { 13pub fn init(allocator: mem.Allocator) !Self {
14 const decompressor = compress.deflate.decompressor; 14 const decompressor = compress.flate.inflate.decompressor;
15 const in_bytes = @embedFile("normp"); 15 const in_bytes = @embedFile("normp");
16 var in_fbs = std.io.fixedBufferStream(in_bytes); 16 var in_fbs = std.io.fixedBufferStream(in_bytes);
17 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 17 var in_decomp = decompressor(.raw, in_fbs.reader());
18 defer in_decomp.deinit();
19 var reader = in_decomp.reader(); 18 var reader = in_decomp.reader();
20 19
21 const endian = builtin.cpu.arch.endian(); 20 const endian = builtin.cpu.arch.endian();
diff --git a/src/Normalize.zig b/src/Normalize.zig
index f437f4f..85e3aa3 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -572,47 +572,6 @@ test "eql" {
572 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); 572 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
573} 573}
574 574
575// FCD
576fn getLeadCcc(self: Self, cp: u21) u8 {
577 const dc = self.mapping(cp, .nfd);
578 const dcp = if (dc.form == .same) cp else dc.cps[0];
579 return self.norm_data.ccc_data.ccc(dcp);
580}
581
582fn getTrailCcc(self: Self, cp: u21) u8 {
583 const dc = self.mapping(cp, .nfd);
584 const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1];
585 return self.norm_data.ccc_data.ccc(dcp);
586}
587
588// Fast check to detect if a string is already in NFC or NFD form.
589fn isFcd(self: Self, str: []const u8) bool {
590 var prev_ccc: u8 = 0;
591 var cp_iter = CodePointIterator{ .bytes = str };
592
593 return while (cp_iter.next()) |cp| {
594 const ccc = self.getLeadCcc(cp.code);
595 if (ccc != 0 and ccc < prev_ccc) break false;
596 prev_ccc = self.getTrailCcc(cp.code);
597 } else true;
598}
599
600test "isFcd" {
601 const allocator = testing.allocator;
602 const data = try NormData.init(allocator);
603 defer data.deinit();
604 const n = Self{ .norm_data = &data };
605
606 const is_nfc = "José \u{3D3}";
607 try testing.expect(n.isFcd(is_nfc));
608
609 const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
610 try testing.expect(n.isFcd(is_nfd));
611
612 const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
613 try testing.expect(!n.isFcd(not_fcd));
614}
615
616/// Returns true if `str` only contains Latin-1 Supplement 575/// Returns true if `str` only contains Latin-1 Supplement
617/// code points. Uses SIMD if possible. 576/// code points. Uses SIMD if possible.
618pub fn isLatin1Only(str: []const u8) bool { 577pub fn isLatin1Only(str: []const u8) bool {
diff --git a/src/PropsData.zig b/src/PropsData.zig
index 9d24e68..f6c8370 100644
--- a/src/PropsData.zig
+++ b/src/PropsData.zig
@@ -15,14 +15,13 @@ num_s2: []u8 = undefined,
15const Self = @This(); 15const Self = @This();
16 16
17pub fn init(allocator: mem.Allocator) !Self { 17pub fn init(allocator: mem.Allocator) !Self {
18 const decompressor = compress.deflate.decompressor; 18 const decompressor = compress.flate.inflate.decompressor;
19 const endian = builtin.cpu.arch.endian(); 19 const endian = builtin.cpu.arch.endian();
20 20
21 // Process DerivedCoreProperties.txt 21 // Process DerivedCoreProperties.txt
22 const core_bytes = @embedFile("core_props"); 22 const core_bytes = @embedFile("core_props");
23 var core_fbs = std.io.fixedBufferStream(core_bytes); 23 var core_fbs = std.io.fixedBufferStream(core_bytes);
24 var core_decomp = try decompressor(allocator, core_fbs.reader(), null); 24 var core_decomp = decompressor(.raw, core_fbs.reader());
25 defer core_decomp.deinit();
26 var core_reader = core_decomp.reader(); 25 var core_reader = core_decomp.reader();
27 26
28 var self = Self{ .allocator = allocator }; 27 var self = Self{ .allocator = allocator };
@@ -40,8 +39,7 @@ pub fn init(allocator: mem.Allocator) !Self {
40 // Process PropList.txt 39 // Process PropList.txt
41 const props_bytes = @embedFile("props"); 40 const props_bytes = @embedFile("props");
42 var props_fbs = std.io.fixedBufferStream(props_bytes); 41 var props_fbs = std.io.fixedBufferStream(props_bytes);
43 var props_decomp = try decompressor(allocator, props_fbs.reader(), null); 42 var props_decomp = decompressor(.raw, props_fbs.reader());
44 defer props_decomp.deinit();
45 var props_reader = props_decomp.reader(); 43 var props_reader = props_decomp.reader();
46 44
47 const stage_1_len: u16 = try props_reader.readInt(u16, endian); 45 const stage_1_len: u16 = try props_reader.readInt(u16, endian);
@@ -57,8 +55,7 @@ pub fn init(allocator: mem.Allocator) !Self {
57 // Process DerivedNumericType.txt 55 // Process DerivedNumericType.txt
58 const num_bytes = @embedFile("numeric"); 56 const num_bytes = @embedFile("numeric");
59 var num_fbs = std.io.fixedBufferStream(num_bytes); 57 var num_fbs = std.io.fixedBufferStream(num_bytes);
60 var num_decomp = try decompressor(allocator, num_fbs.reader(), null); 58 var num_decomp = decompressor(.raw, num_fbs.reader());
61 defer num_decomp.deinit();
62 var num_reader = num_decomp.reader(); 59 var num_reader = num_decomp.reader();
63 60
64 const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); 61 const num_stage_1_len: u16 = try num_reader.readInt(u16, endian);
diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig
index 4e371bf..415ce2d 100644
--- a/src/ScriptsData.zig
+++ b/src/ScriptsData.zig
@@ -4,7 +4,7 @@ const compress = std.compress;
4const mem = std.mem; 4const mem = std.mem;
5const testing = std.testing; 5const testing = std.testing;
6 6
7/// Script 7/// Scripts
8pub const Script = enum { 8pub const Script = enum {
9 none, 9 none,
10 Adlam, 10 Adlam,
@@ -180,11 +180,10 @@ s3: []u8 = undefined,
180const Self = @This(); 180const Self = @This();
181 181
182pub fn init(allocator: mem.Allocator) !Self { 182pub fn init(allocator: mem.Allocator) !Self {
183 const decompressor = compress.deflate.decompressor; 183 const decompressor = compress.flate.inflate.decompressor;
184 const in_bytes = @embedFile("scripts"); 184 const in_bytes = @embedFile("scripts");
185 var in_fbs = std.io.fixedBufferStream(in_bytes); 185 var in_fbs = std.io.fixedBufferStream(in_bytes);
186 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 186 var in_decomp = decompressor(.raw, in_fbs.reader());
187 defer in_decomp.deinit();
188 var reader = in_decomp.reader(); 187 var reader = in_decomp.reader();
189 188
190 const endian = builtin.cpu.arch.endian(); 189 const endian = builtin.cpu.arch.endian();
diff --git a/src/WidthData.zig b/src/WidthData.zig
index b9ef84e..cf31b7f 100644
--- a/src/WidthData.zig
+++ b/src/WidthData.zig
@@ -14,11 +14,10 @@ s2: []i3 = undefined,
14const Self = @This(); 14const Self = @This();
15 15
16pub fn init(allocator: mem.Allocator) !Self { 16pub fn init(allocator: mem.Allocator) !Self {
17 const decompressor = compress.deflate.decompressor; 17 const decompressor = compress.flate.inflate.decompressor;
18 const in_bytes = @embedFile("dwp"); 18 const in_bytes = @embedFile("dwp");
19 var in_fbs = std.io.fixedBufferStream(in_bytes); 19 var in_fbs = std.io.fixedBufferStream(in_bytes);
20 var in_decomp = try decompressor(allocator, in_fbs.reader(), null); 20 var in_decomp = decompressor(.raw, in_fbs.reader());
21 defer in_decomp.deinit();
22 var reader = in_decomp.reader(); 21 var reader = in_decomp.reader();
23 22
24 const endian = builtin.cpu.arch.endian(); 23 const endian = builtin.cpu.arch.endian();