summaryrefslogtreecommitdiff
path: root/src/Normalize.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/Normalize.zig')
-rw-r--r--src/Normalize.zig143
1 files changed, 48 insertions, 95 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 3191a8c..865318f 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -2,25 +2,8 @@
2//! Unicode Normalization. You can normalize strings into NFC, 2//! Unicode Normalization. You can normalize strings into NFC,
3//! NFKC, NFD, and NFKD normalization forms. 3//! NFKC, NFD, and NFKD normalization forms.
4 4
5canon_data: CanonData = undefined,
6
7const Normalize = @This(); 5const Normalize = @This();
8 6
9pub fn init(allocator: Allocator) !Normalize {
10 var norm: Normalize = undefined;
11 try norm.setup(allocator);
12 return norm;
13}
14
15pub fn setup(self: *Normalize, allocator: Allocator) !void {
16 self.canon_data = try CanonData.init(allocator);
17}
18
19pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
20 const mut_norm = @constCast(norm);
21 mut_norm.canon_data.deinit(allocator);
22}
23
24const SBase: u21 = 0xAC00; 7const SBase: u21 = 0xAC00;
25const LBase: u21 = 0x1100; 8const LBase: u21 = 0x1100;
26const VBase: u21 = 0x1161; 9const VBase: u21 = 0x1161;
@@ -91,12 +74,12 @@ const Decomp = struct {
91}; 74};
92 75
93// `mapping` retrieves the decomposition mapping for a code point as per the UCD. 76// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
94fn mapping(self: Normalize, cp: u21, form: Form) Decomp { 77fn mapping(cp: u21, form: Form) Decomp {
95 var dc = Decomp{}; 78 var dc = Decomp{};
96 79
97 switch (form) { 80 switch (form) {
98 .nfd => { 81 .nfd => {
99 dc.cps = self.canon_data.toNfd(cp); 82 dc.cps = CanonData.toNfd(cp);
100 if (dc.cps.len != 0) dc.form = .nfd; 83 if (dc.cps.len != 0) dc.form = .nfd;
101 }, 84 },
102 85
@@ -105,7 +88,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
105 if (dc.cps.len != 0) { 88 if (dc.cps.len != 0) {
106 dc.form = .nfkd; 89 dc.form = .nfkd;
107 } else { 90 } else {
108 dc.cps = self.canon_data.toNfd(cp); 91 dc.cps = CanonData.toNfd(cp);
109 if (dc.cps.len != 0) dc.form = .nfkd; 92 if (dc.cps.len != 0) dc.form = .nfkd;
110 } 93 }
111 }, 94 },
@@ -117,12 +100,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
117} 100}
118 101
119// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. 102// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
120fn decompose( 103fn decompose(cp: u21, form: Form, buf: []u21) Decomp {
121 self: Normalize,
122 cp: u21,
123 form: Form,
124 buf: []u21,
125) Decomp {
126 // ASCII 104 // ASCII
127 if (cp < 128) return .{}; 105 if (cp < 128) return .{};
128 106
@@ -149,7 +127,7 @@ fn decompose(
149 // Look at previous code point in work queue. 127 // Look at previous code point in work queue.
150 work_index -= 1; 128 work_index -= 1;
151 const next = work[work_index]; 129 const next = work[work_index];
152 const m = self.mapping(next, form); 130 const m = Normalize.mapping(next, form);
153 131
154 // No more of decompositions for this code point. 132 // No more of decompositions for this code point.
155 if (m.form == .same) { 133 if (m.form == .same) {
@@ -175,44 +153,41 @@ fn decompose(
175} 153}
176 154
177test "decompose" { 155test "decompose" {
178 const allocator = testing.allocator;
179 var n = try Normalize.init(allocator);
180 defer n.deinit(allocator);
181 var buf: [18]u21 = undefined; 156 var buf: [18]u21 = undefined;
182 157
183 var dc = n.decompose('é', .nfd, &buf); 158 var dc = Normalize.decompose('é', .nfd, &buf);
184 try testing.expect(dc.form == .nfd); 159 try testing.expect(dc.form == .nfd);
185 try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); 160 try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
186 161
187 dc = n.decompose('\u{1e0a}', .nfd, &buf); 162 dc = Normalize.decompose('\u{1e0a}', .nfd, &buf);
188 try testing.expect(dc.form == .nfd); 163 try testing.expect(dc.form == .nfd);
189 try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); 164 try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
190 165
191 dc = n.decompose('\u{1e0a}', .nfkd, &buf); 166 dc = Normalize.decompose('\u{1e0a}', .nfkd, &buf);
192 try testing.expect(dc.form == .nfkd); 167 try testing.expect(dc.form == .nfkd);
193 try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); 168 try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
194 169
195 dc = n.decompose('\u{3189}', .nfd, &buf); 170 dc = Normalize.decompose('\u{3189}', .nfd, &buf);
196 try testing.expect(dc.form == .same); 171 try testing.expect(dc.form == .same);
197 try testing.expect(dc.cps.len == 0); 172 try testing.expect(dc.cps.len == 0);
198 173
199 dc = n.decompose('\u{3189}', .nfkd, &buf); 174 dc = Normalize.decompose('\u{3189}', .nfkd, &buf);
200 try testing.expect(dc.form == .nfkd); 175 try testing.expect(dc.form == .nfkd);
201 try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); 176 try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
202 177
203 dc = n.decompose('\u{ace1}', .nfd, &buf); 178 dc = Normalize.decompose('\u{ace1}', .nfd, &buf);
204 try testing.expect(dc.form == .nfd); 179 try testing.expect(dc.form == .nfd);
205 try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); 180 try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
206 181
207 dc = n.decompose('\u{ace1}', .nfkd, &buf); 182 dc = Normalize.decompose('\u{ace1}', .nfkd, &buf);
208 try testing.expect(dc.form == .nfd); 183 try testing.expect(dc.form == .nfd);
209 try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); 184 try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
210 185
211 dc = n.decompose('\u{3d3}', .nfd, &buf); 186 dc = Normalize.decompose('\u{3d3}', .nfd, &buf);
212 try testing.expect(dc.form == .nfd); 187 try testing.expect(dc.form == .nfd);
213 try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); 188 try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
214 189
215 dc = n.decompose('\u{3d3}', .nfkd, &buf); 190 dc = Normalize.decompose('\u{3d3}', .nfkd, &buf);
216 try testing.expect(dc.form == .nfkd); 191 try testing.expect(dc.form == .nfkd);
217 try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); 192 try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
218} 193}
@@ -231,8 +206,8 @@ pub const Result = struct {
231 return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; 206 return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) };
232 } 207 }
233 208
234 pub fn deinit(self: *const Result, allocator: Allocator) void { 209 pub fn deinit(result: *const Result, allocator: Allocator) void {
235 if (self.allocated) allocator.free(self.slice); 210 if (result.allocated) allocator.free(result.slice);
236 } 211 }
237}; 212};
238 213
@@ -252,16 +227,16 @@ fn canonicalSort(cps: []u21) void {
252} 227}
253 228
254/// Normalize `str` to NFD. 229/// Normalize `str` to NFD.
255pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { 230pub fn nfd(allocator: Allocator, str: []const u8) Allocator.Error!Result {
256 return self.nfxd(allocator, str, .nfd); 231 return Normalize.nfxd(allocator, str, .nfd);
257} 232}
258 233
259/// Normalize `str` to NFKD. 234/// Normalize `str` to NFKD.
260pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { 235pub fn nfkd(allocator: Allocator, str: []const u8) Allocator.Error!Result {
261 return self.nfxd(allocator, str, .nfkd); 236 return Normalize.nfxd(allocator, str, .nfkd);
262} 237}
263 238
264pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { 239pub fn nfxdCodePoints(allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 {
265 var dcp_list = std.array_list.Managed(u21).init(allocator); 240 var dcp_list = std.array_list.Managed(u21).init(allocator);
266 defer dcp_list.deinit(); 241 defer dcp_list.deinit();
267 242
@@ -269,7 +244,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
269 var dc_buf: [18]u21 = undefined; 244 var dc_buf: [18]u21 = undefined;
270 245
271 while (cp_iter.next()) |cp| { 246 while (cp_iter.next()) |cp| {
272 const dc = self.decompose(cp.code, form, &dc_buf); 247 const dc = Normalize.decompose(cp.code, form, &dc_buf);
273 if (dc.form == .same) { 248 if (dc.form == .same) {
274 try dcp_list.append(cp.code); 249 try dcp_list.append(cp.code);
275 } else { 250 } else {
@@ -282,11 +257,11 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo
282 return try dcp_list.toOwnedSlice(); 257 return try dcp_list.toOwnedSlice();
283} 258}
284 259
285fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { 260fn nfxd(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
286 // Quick checks. 261 // Quick checks.
287 if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; 262 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
288 263
289 const dcps = try self.nfxdCodePoints(allocator, str, form); 264 const dcps = try Normalize.nfxdCodePoints(allocator, str, form);
290 defer allocator.free(dcps); 265 defer allocator.free(dcps);
291 266
292 var dstr_list = std.array_list.Managed(u8).init(allocator); 267 var dstr_list = std.array_list.Managed(u8).init(allocator);
@@ -303,10 +278,8 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
303 278
304test "nfd ASCII / no-alloc" { 279test "nfd ASCII / no-alloc" {
305 const allocator = testing.allocator; 280 const allocator = testing.allocator;
306 var n = try Normalize.init(allocator);
307 defer n.deinit(allocator);
308 281
309 const result = try n.nfd(allocator, "Hello World!"); 282 const result = try Normalize.nfd(allocator, "Hello World!");
310 defer result.deinit(allocator); 283 defer result.deinit(allocator);
311 284
312 try testing.expectEqualStrings("Hello World!", result.slice); 285 try testing.expectEqualStrings("Hello World!", result.slice);
@@ -314,10 +287,8 @@ test "nfd ASCII / no-alloc" {
314 287
315test "nfd !ASCII / alloc" { 288test "nfd !ASCII / alloc" {
316 const allocator = testing.allocator; 289 const allocator = testing.allocator;
317 var n = try Normalize.init(allocator);
318 defer n.deinit(allocator);
319 290
320 const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 291 const result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}");
321 defer result.deinit(allocator); 292 defer result.deinit(allocator);
322 293
323 try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); 294 try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
@@ -325,10 +296,8 @@ test "nfd !ASCII / alloc" {
325 296
326test "nfkd ASCII / no-alloc" { 297test "nfkd ASCII / no-alloc" {
327 const allocator = testing.allocator; 298 const allocator = testing.allocator;
328 var n = try Normalize.init(allocator);
329 defer n.deinit(allocator);
330 299
331 const result = try n.nfkd(allocator, "Hello World!"); 300 const result = try Normalize.nfkd(allocator, "Hello World!");
332 defer result.deinit(allocator); 301 defer result.deinit(allocator);
333 302
334 try testing.expectEqualStrings("Hello World!", result.slice); 303 try testing.expectEqualStrings("Hello World!", result.slice);
@@ -336,27 +305,21 @@ test "nfkd ASCII / no-alloc" {
336 305
337test "nfkd !ASCII / alloc" { 306test "nfkd !ASCII / alloc" {
338 const allocator = testing.allocator; 307 const allocator = testing.allocator;
339 var n = try Normalize.init(allocator);
340 defer n.deinit(allocator);
341 308
342 const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 309 const result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}");
343 defer result.deinit(allocator); 310 defer result.deinit(allocator);
344 311
345 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 312 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
346} 313}
347 314
348pub fn nfdCodePoints( 315pub fn nfdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 {
349 self: Normalize,
350 allocator: Allocator,
351 cps: []const u21,
352) Allocator.Error![]u21 {
353 var dcp_list = std.array_list.Managed(u21).init(allocator); 316 var dcp_list = std.array_list.Managed(u21).init(allocator);
354 defer dcp_list.deinit(); 317 defer dcp_list.deinit();
355 318
356 var dc_buf: [18]u21 = undefined; 319 var dc_buf: [18]u21 = undefined;
357 320
358 for (cps) |cp| { 321 for (cps) |cp| {
359 const dc = self.decompose(cp, .nfd, &dc_buf); 322 const dc = Normalize.decompose(cp, .nfd, &dc_buf);
360 323
361 if (dc.form == .same) { 324 if (dc.form == .same) {
362 try dcp_list.append(cp); 325 try dcp_list.append(cp);
@@ -370,18 +333,14 @@ pub fn nfdCodePoints(
370 return try dcp_list.toOwnedSlice(); 333 return try dcp_list.toOwnedSlice();
371} 334}
372 335
373pub fn nfkdCodePoints( 336pub fn nfkdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 {
374 self: Normalize,
375 allocator: Allocator,
376 cps: []const u21,
377) Allocator.Error![]u21 {
378 var dcp_list = std.array_list.Managed(u21).init(allocator); 337 var dcp_list = std.array_list.Managed(u21).init(allocator);
379 defer dcp_list.deinit(); 338 defer dcp_list.deinit();
380 339
381 var dc_buf: [18]u21 = undefined; 340 var dc_buf: [18]u21 = undefined;
382 341
383 for (cps) |cp| { 342 for (cps) |cp| {
384 const dc = self.decompose(cp, .nfkd, &dc_buf); 343 const dc = Normalize.decompose(cp, .nfkd, &dc_buf);
385 344
386 if (dc.form == .same) { 345 if (dc.form == .same) {
387 try dcp_list.append(cp); 346 try dcp_list.append(cp);
@@ -402,29 +361,29 @@ fn isHangul(cp: u21) bool {
402} 361}
403 362
404/// Normalizes `str` to NFC. 363/// Normalizes `str` to NFC.
405pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { 364pub fn nfc(allocator: Allocator, str: []const u8) Allocator.Error!Result {
406 return self.nfxc(allocator, str, .nfc); 365 return Normalize.nfxc(allocator, str, .nfc);
407} 366}
408 367
409/// Normalizes `str` to NFKC. 368/// Normalizes `str` to NFKC.
410pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { 369pub fn nfkc(allocator: Allocator, str: []const u8) Allocator.Error!Result {
411 return self.nfxc(allocator, str, .nfkc); 370 return Normalize.nfxc(allocator, str, .nfkc);
412} 371}
413 372
414fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { 373fn nfxc(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
415 // Quick checks. 374 // Quick checks.
416 if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; 375 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
417 if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; 376 if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
418 377
419 // Decompose first. 378 // Decompose first.
420 var dcps = if (form == .nfc) 379 var dcps = if (form == .nfc)
421 try self.nfxdCodePoints(allocator, str, .nfd) 380 try Normalize.nfxdCodePoints(allocator, str, .nfd)
422 else 381 else
423 try self.nfxdCodePoints(allocator, str, .nfkd); 382 try Normalize.nfxdCodePoints(allocator, str, .nfkd);
424 defer allocator.free(dcps); 383 defer allocator.free(dcps);
425 384
426 // Compose 385 // Compose
427 const tombstone = 0xe000; // Start of BMP Private Use Area 386 const tombstone = 0x1FFFF; // Convenient Cn noncharacter point
428 387
429 // Loop over all decomposed code points. 388 // Loop over all decomposed code points.
430 while (true) { 389 while (true) {
@@ -498,7 +457,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
498 if (!processed_hangul) { 457 if (!processed_hangul) {
499 // L, C are not Hangul, so check for primary composite 458 // L, C are not Hangul, so check for primary composite
500 // in the Unicode Character Database. 459 // in the Unicode Character Database.
501 if (self.canon_data.toNfc(.{ L, C })) |P| { 460 if (CanonData.toNfc(.{ L, C })) |P| {
502 // We have a primary composite P for L, C. 461 // We have a primary composite P for L, C.
503 // We must check if P is not in the Full 462 // We must check if P is not in the Full
504 // Composition Exclusions (FCX) list, 463 // Composition Exclusions (FCX) list,
@@ -534,10 +493,8 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo
534 493
535test "nfc" { 494test "nfc" {
536 const allocator = testing.allocator; 495 const allocator = testing.allocator;
537 var n = try Normalize.init(allocator);
538 defer n.deinit(allocator);
539 496
540 const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 497 const result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}");
541 defer result.deinit(allocator); 498 defer result.deinit(allocator);
542 499
543 try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); 500 try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
@@ -545,20 +502,18 @@ test "nfc" {
545 502
546test "nfkc" { 503test "nfkc" {
547 const allocator = testing.allocator; 504 const allocator = testing.allocator;
548 var n = try Normalize.init(allocator);
549 defer n.deinit(allocator);
550 505
551 const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 506 const result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
552 defer result.deinit(allocator); 507 defer result.deinit(allocator);
553 508
554 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); 509 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
555} 510}
556 511
557/// Tests for equality of `a` and `b` after normalizing to NFC. 512/// Tests for equality of `a` and `b` after normalizing to NFC.
558pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { 513pub fn eql(allocator: Allocator, a: []const u8, b: []const u8) !bool {
559 const norm_result_a = try self.nfc(allocator, a); 514 const norm_result_a = try Normalize.nfc(allocator, a);
560 defer norm_result_a.deinit(allocator); 515 defer norm_result_a.deinit(allocator);
561 const norm_result_b = try self.nfc(allocator, b); 516 const norm_result_b = try Normalize.nfc(allocator, b);
562 defer norm_result_b.deinit(allocator); 517 defer norm_result_b.deinit(allocator);
563 518
564 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); 519 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
@@ -566,11 +521,9 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8)
566 521
567test "eql" { 522test "eql" {
568 const allocator = testing.allocator; 523 const allocator = testing.allocator;
569 var n = try Normalize.init(allocator);
570 defer n.deinit(allocator);
571 524
572 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 525 try testing.expect(try Normalize.eql(allocator, "foé", "foe\u{0301}"));
573 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); 526 try testing.expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
574} 527}
575 528
576/// Returns true if `str` only contains Latin-1 Supplement 529/// Returns true if `str` only contains Latin-1 Supplement