summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-29 10:36:18 -0400
committerGravatar Jose Colon Rodriguez2024-02-29 10:36:18 -0400
commita498ae01d8198efba90b63dd84f1ee8f9166036a (patch)
tree0405cd8a44baecd31d0506e205727673af31c9a2 /src
parentAdded nfc latin1 check back (diff)
downloadzg-a498ae01d8198efba90b63dd84f1ee8f9166036a.tar.gz
zg-a498ae01d8198efba90b63dd84f1ee8f9166036a.tar.xz
zg-a498ae01d8198efba90b63dd84f1ee8f9166036a.zip
Major Normalizer optimizations
Diffstat (limited to 'src')
-rw-r--r--src/Normalizer.zig135
1 files changed, 75 insertions, 60 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index d32ad52..a575bc4 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -255,10 +255,7 @@ pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
255 return self.nfxd(allocator, str, .nfkd); 255 return self.nfxd(allocator, str, .nfkd);
256} 256}
257 257
258fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { 258fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 {
259 // Quick checks.
260 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
261
262 var dcp_list = std.ArrayList(u21).init(allocator); 259 var dcp_list = std.ArrayList(u21).init(allocator);
263 defer dcp_list.deinit(); 260 defer dcp_list.deinit();
264 261
@@ -276,13 +273,23 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu
276 273
277 self.canonicalSort(dcp_list.items); 274 self.canonicalSort(dcp_list.items);
278 275
279 var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); 276 return try dcp_list.toOwnedSlice();
280 defer dstr_list.deinit(); 277}
278
279fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
280 // Quick checks.
281 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
282
283 const dcps = try self.nfxdCodePoints(allocator, str, form);
284 defer allocator.free(dcps);
281 285
286 var dstr_list = std.ArrayList(u8).init(allocator);
287 defer dstr_list.deinit();
282 var buf: [4]u8 = undefined; 288 var buf: [4]u8 = undefined;
283 for (dcp_list.items) |dcp| { 289
290 for (dcps) |dcp| {
284 const len = try unicode.utf8Encode(dcp, &buf); 291 const len = try unicode.utf8Encode(dcp, &buf);
285 dstr_list.appendSliceAssumeCapacity(buf[0..len]); 292 try dstr_list.appendSlice(buf[0..len]);
286 } 293 }
287 294
288 return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; 295 return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() };
@@ -342,10 +349,6 @@ fn isHangul(self: Self, cp: u21) bool {
342 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; 349 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
343} 350}
344 351
345fn isNonHangulStarter(self: Self, cp: u21) bool {
346 return !self.isHangul(cp) and self.norm_data.ccc_data.isStarter(cp);
347}
348
349/// Normalizes `str` to NFC. 352/// Normalizes `str` to NFC.
350pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result { 353pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
351 return self.nfxc(allocator, str, .nfc); 354 return self.nfxc(allocator, str, .nfc);
@@ -362,47 +365,42 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu
362 if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; 365 if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
363 366
364 // Decompose first. 367 // Decompose first.
365 var d_result = if (form == .nfc) 368 var dcps = if (form == .nfc)
366 try self.nfd(allocator, str) 369 try self.nfxdCodePoints(allocator, str, .nfd)
367 else 370 else
368 try self.nfkd(allocator, str); 371 try self.nfxdCodePoints(allocator, str, .nfkd);
369 defer d_result.deinit(); 372 defer allocator.free(dcps);
370
371 // Get code points.
372 var cp_iter = CodePointIterator{ .bytes = d_result.slice };
373
374 var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len);
375 defer d_list.deinit();
376
377 while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code);
378 373
379 // Compose 374 // Compose
380 const tombstone = 0xe000; // Start of BMP Private Use Area 375 const tombstone = 0xe000; // Start of BMP Private Use Area
381 376
377 // Loop over all decomposed code points.
382 while (true) { 378 while (true) {
383 var i: usize = 1; // start at second code point. 379 var i: usize = 1; // start at second code point.
384 var deleted: usize = 0; 380 var deleted: usize = 0;
385 381
386 block_check: while (i < d_list.items.len) : (i += 1) { 382 // For each code point, C, find the preceding
387 const C = d_list.items[i]; 383 // starter code point L, if any.
384 block_check: while (i < dcps.len) : (i += 1) {
385 const C = dcps[i];
386 if (C == tombstone) continue :block_check;
388 const cc_C = self.norm_data.ccc_data.ccc(C); 387 const cc_C = self.norm_data.ccc_data.ccc(C);
389 var starter_index: ?usize = null; 388 var starter_index: ?usize = null;
390 var j: usize = i; 389 var j: usize = i;
391 390
391 // Seek back to find starter L, if any.
392 while (true) { 392 while (true) {
393 j -= 1; 393 j -= 1;
394 if (dcps[j] == tombstone) continue;
394 395
395 // Check for starter. 396 // Check for starter.
396 if (self.norm_data.ccc_data.isStarter(d_list.items[j])) { 397 if (self.norm_data.ccc_data.isStarter(dcps[j])) {
397 if (i - j > 1) { // If there's distance between the starting point and the current position. 398 // Check for blocking conditions.
398 for (d_list.items[(j + 1)..i]) |B| { 399 for (dcps[(j + 1)..i]) |B| {
399 const cc_B = self.norm_data.ccc_data.ccc(B); 400 if (B == tombstone) continue;
400 // Check for blocking conditions. 401 const cc_B = self.norm_data.ccc_data.ccc(B);
401 if (self.isHangul(C)) { 402 if (cc_B != 0 and self.isHangul(C)) continue :block_check;
402 if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; 403 if (cc_B >= cc_C) continue :block_check;
403 }
404 if (cc_B >= cc_C) continue :block_check;
405 }
406 } 404 }
407 405
408 // Found starter at j. 406 // Found starter at j.
@@ -413,37 +411,50 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu
413 if (j == 0) break; 411 if (j == 0) break;
414 } 412 }
415 413
414 // If we have a starter L, see if there's a primary
415 // composite, P, for the sequence L, C. If so, we must
416 // repace L with P and delete C.
416 if (starter_index) |sidx| { 417 if (starter_index) |sidx| {
417 const L = d_list.items[sidx]; 418 const L = dcps[sidx];
418 var processed_hangul = false; 419 var processed_hangul = false;
419 420
421 // If L and C are Hangul syllables, we can compose
422 // them algorithmically if possible.
420 if (self.isHangul(L) and self.isHangul(C)) { 423 if (self.isHangul(L) and self.isHangul(C)) {
424 // Get Hangul syllable types.
421 const l_stype = self.norm_data.hangul_data.syllable(L); 425 const l_stype = self.norm_data.hangul_data.syllable(L);
422 const c_stype = self.norm_data.hangul_data.syllable(C); 426 const c_stype = self.norm_data.hangul_data.syllable(C);
423 427
424 if (l_stype == .LV and c_stype == .T) { 428 if (l_stype == .LV and c_stype == .T) {
425 // LV, T 429 // LV, T canonical composition.
426 d_list.items[sidx] = composeHangulCanon(L, C); 430 dcps[sidx] = composeHangulCanon(L, C);
427 d_list.items[i] = tombstone; // Mark for deletion. 431 dcps[i] = tombstone; // Mark for deletion.
428 processed_hangul = true; 432 processed_hangul = true;
429 } 433 }
430 434
431 if (l_stype == .L and c_stype == .V) { 435 if (l_stype == .L and c_stype == .V) {
432 // Handle L, V. L, V, T is handled via main loop. 436 // L, V full composition. L, V, T is handled via main loop.
433 d_list.items[sidx] = composeHangulFull(L, C, 0); 437 dcps[sidx] = composeHangulFull(L, C, 0);
434 d_list.items[i] = tombstone; // Mark for deletion. 438 dcps[i] = tombstone; // Mark for deletion.
435 processed_hangul = true; 439 processed_hangul = true;
436 } 440 }
437 441
438 if (processed_hangul) deleted += 1; 442 if (processed_hangul) deleted += 1;
439 } 443 }
440 444
445 // If no composition has occurred yet.
441 if (!processed_hangul) { 446 if (!processed_hangul) {
442 // L -> C not Hangul. 447 // L, C are not Hangul, so check for primary composite
448 // in the Unicode Character Database.
443 if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { 449 if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
450 // We have a primary composite P for L, C.
451 // We must check if P is not in the Full
452 // Composition Exclusions (FCX) list,
453 // preventing it from appearing in any
454 // composed form (NFC, NFKC).
444 if (!self.norm_data.normp_data.isFcx(P)) { 455 if (!self.norm_data.normp_data.isFcx(P)) {
445 d_list.items[sidx] = P; 456 dcps[sidx] = P;
446 d_list.items[i] = tombstone; // Mark for deletion. 457 dcps[i] = tombstone; // Mark for deletion.
447 deleted += 1; 458 deleted += 1;
448 } 459 }
449 } 460 }
@@ -451,31 +462,35 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu
451 } 462 }
452 } 463 }
453 464
454 // Check if finished. 465 // If we have no deletions. the code point sequence
466 // has been fully composed.
455 if (deleted == 0) { 467 if (deleted == 0) {
456 var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4); 468 var cstr_list = std.ArrayList(u8).init(allocator);
457 defer cstr_list.deinit(); 469 defer cstr_list.deinit();
458 var buf: [4]u8 = undefined; 470 var buf: [4]u8 = undefined;
459 471
460 for (d_list.items) |cp| { 472 for (dcps) |cp| {
473 if (cp == tombstone) continue;
461 if (cp == tombstone) continue; // "Delete" 474 if (cp == tombstone) continue; // "Delete"
462 const len = try unicode.utf8Encode(cp, &buf); 475 const len = try unicode.utf8Encode(cp, &buf);
463 cstr_list.appendSliceAssumeCapacity(buf[0..len]); 476 try cstr_list.appendSlice(buf[0..len]);
464 } 477 }
465 478
466 return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; 479 return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
467 } 480 }
468 481
469 // Otherwise update code points list. 482 // We're still not finished, so create a new
470 var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted); 483 // code point sequence with the marked code points
471 defer tmp_d_list.deinit(); 484 // removed.
472 485 // var tmp_list = try std.ArrayList(u21).initCapacity(allocator, dcps.len - deleted);
473 for (d_list.items) |cp| { 486 // defer tmp_list.deinit();
474 if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp); 487 //
475 } 488 // for (dcps) |cp| {
476 489 // if (cp != tombstone) tmp_list.appendAssumeCapacity(cp);
477 d_list.clearRetainingCapacity(); 490 // }
478 d_list.appendSliceAssumeCapacity(tmp_d_list.items); 491 //
492 // allocator.free(dcps);
493 // dcps = try tmp_list.toOwnedSlice();
479 } 494 }
480} 495}
481 496