Diffstat (limited to 'src')
-rw-r--r--  src/CanonData.zig       57
-rw-r--r--  src/CaseFolding.zig     40
-rw-r--r--  src/Normalize.zig      143
-rw-r--r--  src/unicode_tests.zig   11
4 files changed, 85 insertions, 166 deletions
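
The net effect of the change below: `Normalize` and `CanonData` no longer carry per-instance state, so there is no `init`/`deinit` pair and no stored `canon_data`; the tables become comptime data and every entry point becomes a plain namespace-level function. A minimal usage sketch of the resulting API, adapted from the "nfc" test in this diff (the `@import("Normalize.zig")` path is an assumption about how the module is wired into a consumer):

const std = @import("std");
const Normalize = @import("Normalize.zig");

test "nfc without an instance" {
    const allocator = std.testing.allocator;

    // No Normalize.init/deinit: the canonicalization data is comptime state,
    // so normalization is a single namespace-level call.
    const result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    // Result.deinit only frees when a buffer was actually allocated
    // (ASCII-only input is returned as-is).
    defer result.deinit(allocator);

    try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
}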
diff --git a/src/CanonData.zig b/src/CanonData.zig
index c972534..5c1ffa6 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,13 +1,23 @@
 //! Canonicalization Data
 
-s1: []const u16 = undefined,
-s2: []const @import("canon").Canonicalization = undefined,
-nfc: std.AutoHashMapUnmanaged([2]u21, u21),
+const Data = struct {
+    s1: []const u16 = undefined,
+    s2: []const @import("canon").Canonicalization = undefined,
+};
+
+const canon_data = canon_data: {
+    const canon_ = @import("canon");
+    break :canon_data Data{
+        .s1 = &canon_.s1,
+        .s2 = &canon_.s2,
+    };
+};
 
 const CanonData = @This();
 
 // There's a bug here, which is down to how static u21 vs. runtime are handled,
-// the "unique representation" claim is not working out. So we do this:
+// the "unique representation" claim is not working out. AutoHash casts to bytes,
+// and that won't fly. So we do this:
 
 const Context = struct {
     pub fn hash(_: Context, cps: [2]u21) u64 {
@@ -22,47 +32,14 @@ const Context = struct {
 
 const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);
 
-pub fn init(allocator: mem.Allocator) !CanonData {
-    var cdata = CanonData{
-        .nfc = .empty,
-    };
-    errdefer cdata.deinit(allocator);
-
-    const data = @import("canon");
-    cdata.s1 = &data.s1;
-    cdata.s2 = &data.s2;
-    var count: usize = 0;
-    for (data.composite) |cp| {
-        count += 1;
-        const cps = cdata.toNfd(cp);
-        std.debug.assert(cps.len == 2);
-        try cdata.nfc.put(allocator, cps[0..2].*, cp);
-    }
-
-    // var keys = cdata.nfc.keyIterator();
-    // while (keys.next()) |key| {
-    //     const c32: [2]u32 = .{ key[0], key[1] };
-    //     if (c_map.get(c32)) |_| {
-    //         std.debug.print("got", .{});
-    //     }
-    // }
-
-    return cdata;
-}
-
-pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
-    cdata.nfc.deinit(allocator);
-}
-
 /// Returns canonical decomposition for `cp`.
-pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
-    const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
+pub fn toNfd(cp: u21) []const u21 {
+    const canon = &canon_data.s2[canon_data.s1[cp >> 8] + (cp & 0xff)];
     return canon.cps[0..canon.len];
 }
 
 // Returns the primary composite for the codepoints in `cp`.
-pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
-    _ = cdata;
+pub fn toNfc(cps: [2]u21) ?u21 {
     if (c_map.get(cps)) |cpp| {
         return cpp.*;
     } else {
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig
index 88f047c..d69cddc 100644
--- a/src/CaseFolding.zig
+++ b/src/CaseFolding.zig
@@ -100,14 +100,13 @@ fn isCwcfException(cp: u21) bool {
 /// comprehensive comparison possible, but slower than `canonCaselessMatch`.
 pub fn compatCaselessMatch(
     allocator: Allocator,
-    normalize: Normalize,
     a: []const u8,
     b: []const u8,
 ) Allocator.Error!bool {
     if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
 
     // Process a
-    const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
     defer allocator.free(nfd_a);
 
     var need_free_cf_nfd_a = false;
@@ -118,15 +117,15 @@ pub fn compatCaselessMatch(
     }
     defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);
 
-    const nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfd_a);
+    const nfkd_cf_nfd_a = try Normalize.nfkdCodePoints(allocator, cf_nfd_a);
     defer allocator.free(nfkd_cf_nfd_a);
     const cf_nfkd_cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_a);
     defer allocator.free(cf_nfkd_cf_nfd_a);
-    const nfkd_cf_nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
+    const nfkd_cf_nfkd_cf_nfd_a = try Normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
     defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
 
     // Process b
-    const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
     defer allocator.free(nfd_b);
 
     var need_free_cf_nfd_b = false;
@@ -137,11 +136,11 @@ pub fn compatCaselessMatch(
     }
     defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);
 
-    const nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfd_b);
+    const nfkd_cf_nfd_b = try Normalize.nfkdCodePoints(allocator, cf_nfd_b);
     defer allocator.free(nfkd_cf_nfd_b);
     const cf_nfkd_cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_b);
     defer allocator.free(cf_nfkd_cf_nfd_b);
-    const nfkd_cf_nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
+    const nfkd_cf_nfkd_cf_nfd_b = try Normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
     defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);
 
     return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
@@ -176,31 +175,27 @@ test "caseFold" {
 test "compatCaselessMatch" {
     const allocator = testing.allocator;
 
-    var normalize = try Normalize.init(allocator);
-    defer normalize.deinit(allocator);
-
-    try testing.expect(try compatCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
+    try testing.expect(try compatCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
 
     const a = "Héllo World! \u{3d3}";
     const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(try compatCaselessMatch(allocator, normalize, a, b));
+    try testing.expect(try compatCaselessMatch(allocator, a, b));
 
     const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try compatCaselessMatch(allocator, normalize, a, c));
+    try testing.expect(try compatCaselessMatch(allocator, a, c));
 }
 
 /// Performs canonical caseless string matching by decomposing to NFD. This is
 /// faster than `compatCaselessMatch`, but less comprehensive.
 pub fn canonCaselessMatch(
     allocator: Allocator,
-    normalize: Normalize,
     a: []const u8,
     b: []const u8,
 ) Allocator.Error!bool {
     if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
 
     // Process a
-    const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd);
+    const nfd_a = try Normalize.nfxdCodePoints(allocator, a, .nfd);
     defer allocator.free(nfd_a);
 
     var need_free_cf_nfd_a = false;
@@ -214,13 +209,13 @@ pub fn canonCaselessMatch(
     var need_free_nfd_cf_nfd_a = false;
     var nfd_cf_nfd_a = cf_nfd_a;
     if (!need_free_cf_nfd_a) {
-        nfd_cf_nfd_a = try normalize.nfdCodePoints(allocator, cf_nfd_a);
+        nfd_cf_nfd_a = try Normalize.nfdCodePoints(allocator, cf_nfd_a);
         need_free_nfd_cf_nfd_a = true;
     }
     defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);
 
     // Process b
-    const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd);
+    const nfd_b = try Normalize.nfxdCodePoints(allocator, b, .nfd);
     defer allocator.free(nfd_b);
 
     var need_free_cf_nfd_b = false;
@@ -234,7 +229,7 @@ pub fn canonCaselessMatch(
     var need_free_nfd_cf_nfd_b = false;
     var nfd_cf_nfd_b = cf_nfd_b;
     if (!need_free_cf_nfd_b) {
-        nfd_cf_nfd_b = try normalize.nfdCodePoints(allocator, cf_nfd_b);
+        nfd_cf_nfd_b = try Normalize.nfdCodePoints(allocator, cf_nfd_b);
         need_free_nfd_cf_nfd_b = true;
     }
     defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);
@@ -245,17 +240,14 @@ pub fn canonCaselessMatch(
 test "canonCaselessMatch" {
     const allocator = testing.allocator;
 
-    var normalize = try Normalize.init(allocator);
-    defer normalize.deinit(allocator);
-
-    try testing.expect(try canonCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!"));
+    try testing.expect(try canonCaselessMatch(allocator, "ascii only!", "ASCII Only!"));
 
     const a = "Héllo World! \u{3d3}";
     const b = "He\u{301}llo World! \u{3a5}\u{301}";
-    try testing.expect(!try canonCaselessMatch(allocator, normalize, a, b));
+    try testing.expect(!try canonCaselessMatch(allocator, a, b));
 
     const c = "He\u{301}llo World! \u{3d2}\u{301}";
-    try testing.expect(try canonCaselessMatch(allocator, normalize, a, c));
+    try testing.expect(try canonCaselessMatch(allocator, a, c));
 }
 
 const std = @import("std");
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 3191a8c..865318f 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -2,25 +2,8 @@
 //! Unicode Normalization. You can normalize strings into NFC,
 //! NFKC, NFD, and NFKD normalization forms.
 
-canon_data: CanonData = undefined,
-
 const Normalize = @This();
 
-pub fn init(allocator: Allocator) !Normalize {
-    var norm: Normalize = undefined;
-    try norm.setup(allocator);
-    return norm;
-}
-
-pub fn setup(self: *Normalize, allocator: Allocator) !void {
-    self.canon_data = try CanonData.init(allocator);
-}
-
-pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
-    const mut_norm = @constCast(norm);
-    mut_norm.canon_data.deinit(allocator);
-}
-
 const SBase: u21 = 0xAC00;
 const LBase: u21 = 0x1100;
 const VBase: u21 = 0x1161;
@@ -91,12 +74,12 @@ const Decomp = struct {
 };
 
 // `mapping` retrieves the decomposition mapping for a code point as per the UCD.
-fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
+fn mapping(cp: u21, form: Form) Decomp {
     var dc = Decomp{};
 
     switch (form) {
         .nfd => {
-            dc.cps = self.canon_data.toNfd(cp);
+            dc.cps = CanonData.toNfd(cp);
             if (dc.cps.len != 0) dc.form = .nfd;
         },
 
@@ -105,7 +88,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
             if (dc.cps.len != 0) {
                 dc.form = .nfkd;
             } else {
-                dc.cps = self.canon_data.toNfd(cp);
+                dc.cps = CanonData.toNfd(cp);
                 if (dc.cps.len != 0) dc.form = .nfkd;
             }
         },
@@ -117,12 +100,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
 }
 
 // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
-fn decompose(
-    self: Normalize,
-    cp: u21,
-    form: Form,
-    buf: []u21,
-) Decomp {
+fn decompose(cp: u21, form: Form, buf: []u21) Decomp {
     // ASCII
     if (cp < 128) return .{};
 
@@ -149,7 +127,7 @@ fn decompose(
         // Look at previous code point in work queue.
         work_index -= 1;
         const next = work[work_index];
-        const m = self.mapping(next, form);
+        const m = Normalize.mapping(next, form);
 
         // No more of decompositions for this code point.
         if (m.form == .same) {
@@ -175,44 +153,41 @@ fn decompose(
 }
 
 test "decompose" {
-    const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
     var buf: [18]u21 = undefined;
 
-    var dc = n.decompose('é', .nfd, &buf);
+    var dc = Normalize.decompose('é', .nfd, &buf);
     try testing.expect(dc.form == .nfd);
     try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
 
-    dc = n.decompose('\u{1e0a}', .nfd, &buf);
+    dc = Normalize.decompose('\u{1e0a}', .nfd, &buf);
     try testing.expect(dc.form == .nfd);
     try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
 
-    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{1e0a}', .nfkd, &buf);
     try testing.expect(dc.form == .nfkd);
     try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
 
-    dc = n.decompose('\u{3189}', .nfd, &buf);
+    dc = Normalize.decompose('\u{3189}', .nfd, &buf);
     try testing.expect(dc.form == .same);
     try testing.expect(dc.cps.len == 0);
 
-    dc = n.decompose('\u{3189}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{3189}', .nfkd, &buf);
     try testing.expect(dc.form == .nfkd);
     try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
 
-    dc = n.decompose('\u{ace1}', .nfd, &buf);
+    dc = Normalize.decompose('\u{ace1}', .nfd, &buf);
     try testing.expect(dc.form == .nfd);
     try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
 
-    dc = n.decompose('\u{ace1}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{ace1}', .nfkd, &buf);
     try testing.expect(dc.form == .nfd);
     try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
 
-    dc = n.decompose('\u{3d3}', .nfd, &buf);
+    dc = Normalize.decompose('\u{3d3}', .nfd, &buf);
     try testing.expect(dc.form == .nfd);
     try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
 
-    dc = n.decompose('\u{3d3}', .nfkd, &buf);
+    dc = Normalize.decompose('\u{3d3}', .nfkd, &buf);
     try testing.expect(dc.form == .nfkd);
     try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
 }
@@ -231,8 +206,8 @@ pub const Result = struct {
         return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) };
     }
 
-    pub fn deinit(self: *const Result, allocator: Allocator) void {
-        if (self.allocated) allocator.free(self.slice);
+    pub fn deinit(result: *const Result, allocator: Allocator) void {
+        if (result.allocated) allocator.free(result.slice);
     }
 };
 
@@ -252,16 +227,16 @@ fn canonicalSort(cps: []u21) void {
 }
 
 /// Normalize `str` to NFD.
-pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxd(allocator, str, .nfd);
+pub fn nfd(allocator: Allocator, str: []const u8) Allocator.Error!Result {
+    return Normalize.nfxd(allocator, str, .nfd);
 }
 
 /// Normalize `str` to NFKD.
-pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxd(allocator, str, .nfkd);
+pub fn nfkd(allocator: Allocator, str: []const u8) Allocator.Error!Result {
+    return Normalize.nfxd(allocator, str, .nfkd);
 }
 
-pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 {
+pub fn nfxdCodePoints(allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 {
     var dcp_list = std.array_list.Managed(u21).init(allocator);
     defer dcp_list.deinit();
 
@@ -269,7 +244,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
     var dc_buf: [18]u21 = undefined;
 
     while (cp_iter.next()) |cp| {
-        const dc = self.decompose(cp.code, form, &dc_buf);
+        const dc = Normalize.decompose(cp.code, form, &dc_buf);
         if (dc.form == .same) {
             try dcp_list.append(cp.code);
         } else {
@@ -282,11 +257,11 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
     return try dcp_list.toOwnedSlice();
 }
 
-fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
+fn nfxd(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
     // Quick checks.
     if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
 
-    const dcps = try self.nfxdCodePoints(allocator, str, form);
+    const dcps = try Normalize.nfxdCodePoints(allocator, str, form);
     defer allocator.free(dcps);
 
     var dstr_list = std.array_list.Managed(u8).init(allocator);
@@ -303,10 +278,8 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
 test "nfd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    const result = try n.nfd(allocator, "Hello World!");
+    const result = try Normalize.nfd(allocator, "Hello World!");
     defer result.deinit(allocator);
 
     try testing.expectEqualStrings("Hello World!", result.slice);
@@ -314,10 +287,8 @@ test "nfd ASCII / no-alloc" {
 
 test "nfd !ASCII / alloc" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
+    const result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}");
     defer result.deinit(allocator);
 
     try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
@@ -325,10 +296,8 @@ test "nfd !ASCII / alloc" {
 
 test "nfkd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    const result = try n.nfkd(allocator, "Hello World!");
+    const result = try Normalize.nfkd(allocator, "Hello World!");
     defer result.deinit(allocator);
 
     try testing.expectEqualStrings("Hello World!", result.slice);
@@ -336,27 +305,21 @@ test "nfkd ASCII / no-alloc" {
 
 test "nfkd !ASCII / alloc" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
+    const result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}");
     defer result.deinit(allocator);
 
     try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
 }
 
-pub fn nfdCodePoints(
-    self: Normalize,
-    allocator: Allocator,
-    cps: []const u21,
-) Allocator.Error![]u21 {
+pub fn nfdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 {
     var dcp_list = std.array_list.Managed(u21).init(allocator);
     defer dcp_list.deinit();
 
     var dc_buf: [18]u21 = undefined;
 
     for (cps) |cp| {
-        const dc = self.decompose(cp, .nfd, &dc_buf);
+        const dc = Normalize.decompose(cp, .nfd, &dc_buf);
 
         if (dc.form == .same) {
             try dcp_list.append(cp);
@@ -370,18 +333,14 @@ pub fn nfdCodePoints(
     return try dcp_list.toOwnedSlice();
 }
 
-pub fn nfkdCodePoints(
-    self: Normalize,
-    allocator: Allocator,
-    cps: []const u21,
-) Allocator.Error![]u21 {
+pub fn nfkdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 {
     var dcp_list = std.array_list.Managed(u21).init(allocator);
     defer dcp_list.deinit();
 
     var dc_buf: [18]u21 = undefined;
 
     for (cps) |cp| {
-        const dc = self.decompose(cp, .nfkd, &dc_buf);
+        const dc = Normalize.decompose(cp, .nfkd, &dc_buf);
 
         if (dc.form == .same) {
             try dcp_list.append(cp);
@@ -402,29 +361,29 @@ fn isHangul(cp: u21) bool {
 }
 
 /// Normalizes `str` to NFC.
-pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxc(allocator, str, .nfc);
+pub fn nfc(allocator: Allocator, str: []const u8) Allocator.Error!Result {
+    return Normalize.nfxc(allocator, str, .nfc);
 }
 
 /// Normalizes `str` to NFKC.
-pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
-    return self.nfxc(allocator, str, .nfkc);
+pub fn nfkc(allocator: Allocator, str: []const u8) Allocator.Error!Result {
+    return Normalize.nfxc(allocator, str, .nfkc);
 }
 
-fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
+fn nfxc(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
     // Quick checks.
     if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
     if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
 
     // Decompose first.
     var dcps = if (form == .nfc)
-        try self.nfxdCodePoints(allocator, str, .nfd)
+        try Normalize.nfxdCodePoints(allocator, str, .nfd)
     else
-        try self.nfxdCodePoints(allocator, str, .nfkd);
+        try Normalize.nfxdCodePoints(allocator, str, .nfkd);
     defer allocator.free(dcps);
 
     // Compose
-    const tombstone = 0xe000; // Start of BMP Private Use Area
+    const tombstone = 0x1FFFF; // Convenient Cn noncharacter point
 
     // Loop over all decomposed code points.
     while (true) {
@@ -498,7 +457,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
             if (!processed_hangul) {
                 // L, C are not Hangul, so check for primary composite
                 // in the Unicode Character Database.
-                if (self.canon_data.toNfc(.{ L, C })) |P| {
+                if (CanonData.toNfc(.{ L, C })) |P| {
                     // We have a primary composite P for L, C.
                     // We must check if P is not in the Full
                     // Composition Exclusions (FCX) list,
@@ -534,10 +493,8 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
 
 test "nfc" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
+    const result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}");
     defer result.deinit(allocator);
 
     try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
@@ -545,20 +502,18 @@ test "nfc" {
 
 test "nfkc" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
+    const result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
     defer result.deinit(allocator);
 
     try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
 }
 
 /// Tests for equality of `a` and `b` after normalizing to NFC.
-pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool {
-    const norm_result_a = try self.nfc(allocator, a);
+pub fn eql(allocator: Allocator, a: []const u8, b: []const u8) !bool {
+    const norm_result_a = try Normalize.nfc(allocator, a);
     defer norm_result_a.deinit(allocator);
-    const norm_result_b = try self.nfc(allocator, b);
+    const norm_result_b = try Normalize.nfc(allocator, b);
     defer norm_result_b.deinit(allocator);
 
     return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
@@ -566,11 +521,9 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
 
 test "eql" {
     const allocator = testing.allocator;
-    var n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
 
-    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
-    try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
+    try testing.expect(try Normalize.eql(allocator, "foé", "foe\u{0301}"));
+    try testing.expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
 }
 
 /// Returns true if `str` only contains Latin-1 Supplement
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 50b8824..81ea90d 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -5,9 +5,6 @@ test "Unicode normalization tests" {
     defer arena.deinit();
     const allocator = arena.allocator();
 
-    const n = try Normalize.init(allocator);
-    defer n.deinit(allocator);
-
     var reader = std.io.Reader.fixed(@embedFile("NormalizationTest.txt"));
     var cp_buf: [4]u8 = undefined;
 
@@ -47,7 +44,7 @@ test "Unicode normalization tests" {
         }
 
         const want = w_buf.items;
-        var got = try n.nfc(allocator, input);
+        var got = try Normalize.nfc(allocator, input);
         defer got.deinit(allocator);
 
         try testing.expectEqualStrings(want, got.slice);
@@ -64,7 +61,7 @@ test "Unicode normalization tests" {
         }
 
         const want = w_buf.items;
-        var got = try n.nfd(allocator, input);
+        var got = try Normalize.nfd(allocator, input);
         defer got.deinit(allocator);
 
         try testing.expectEqualStrings(want, got.slice);
@@ -81,7 +78,7 @@ test "Unicode normalization tests" {
         }
 
         const want = w_buf.items;
-        var got = try n.nfkc(allocator, input);
+        var got = try Normalize.nfkc(allocator, input);
         defer got.deinit(allocator);
 
         try testing.expectEqualStrings(want, got.slice);
@@ -98,7 +95,7 @@ test "Unicode normalization tests" {
         }
 
         const want = w_buf.items;
-        const got = try n.nfkd(allocator, input);
+        const got = try Normalize.nfkd(allocator, input);
         defer got.deinit(allocator);
 
         try testing.expectEqualStrings(want, got.slice);