diff options
| author | 2024-02-29 10:36:18 -0400 | |
|---|---|---|
| committer | 2024-02-29 10:36:18 -0400 | |
| commit | a498ae01d8198efba90b63dd84f1ee8f9166036a (patch) | |
| tree | 0405cd8a44baecd31d0506e205727673af31c9a2 /src/Normalizer.zig | |
| parent | Added nfc latin1 check back (diff) | |
| download | zg-a498ae01d8198efba90b63dd84f1ee8f9166036a.tar.gz zg-a498ae01d8198efba90b63dd84f1ee8f9166036a.tar.xz zg-a498ae01d8198efba90b63dd84f1ee8f9166036a.zip | |
Major Normalizer optimizations
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 135 |
1 files changed, 75 insertions, 60 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index d32ad52..a575bc4 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -255,10 +255,7 @@ pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result { | |||
| 255 | return self.nfxd(allocator, str, .nfkd); | 255 | return self.nfxd(allocator, str, .nfkd); |
| 256 | } | 256 | } |
| 257 | 257 | ||
| 258 | fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { | 258 | fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 { |
| 259 | // Quick checks. | ||
| 260 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | ||
| 261 | |||
| 262 | var dcp_list = std.ArrayList(u21).init(allocator); | 259 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 263 | defer dcp_list.deinit(); | 260 | defer dcp_list.deinit(); |
| 264 | 261 | ||
| @@ -276,13 +273,23 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu | |||
| 276 | 273 | ||
| 277 | self.canonicalSort(dcp_list.items); | 274 | self.canonicalSort(dcp_list.items); |
| 278 | 275 | ||
| 279 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); | 276 | return try dcp_list.toOwnedSlice(); |
| 280 | defer dstr_list.deinit(); | 277 | } |
| 278 | |||
| 279 | fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { | ||
| 280 | // Quick checks. | ||
| 281 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | ||
| 282 | |||
| 283 | const dcps = try self.nfxdCodePoints(allocator, str, form); | ||
| 284 | defer allocator.free(dcps); | ||
| 281 | 285 | ||
| 286 | var dstr_list = std.ArrayList(u8).init(allocator); | ||
| 287 | defer dstr_list.deinit(); | ||
| 282 | var buf: [4]u8 = undefined; | 288 | var buf: [4]u8 = undefined; |
| 283 | for (dcp_list.items) |dcp| { | 289 | |
| 290 | for (dcps) |dcp| { | ||
| 284 | const len = try unicode.utf8Encode(dcp, &buf); | 291 | const len = try unicode.utf8Encode(dcp, &buf); |
| 285 | dstr_list.appendSliceAssumeCapacity(buf[0..len]); | 292 | try dstr_list.appendSlice(buf[0..len]); |
| 286 | } | 293 | } |
| 287 | 294 | ||
| 288 | return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; | 295 | return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; |
| @@ -342,10 +349,6 @@ fn isHangul(self: Self, cp: u21) bool { | |||
| 342 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; | 349 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; |
| 343 | } | 350 | } |
| 344 | 351 | ||
| 345 | fn isNonHangulStarter(self: Self, cp: u21) bool { | ||
| 346 | return !self.isHangul(cp) and self.norm_data.ccc_data.isStarter(cp); | ||
| 347 | } | ||
| 348 | |||
| 349 | /// Normalizes `str` to NFC. | 352 | /// Normalizes `str` to NFC. |
| 350 | pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result { | 353 | pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result { |
| 351 | return self.nfxc(allocator, str, .nfc); | 354 | return self.nfxc(allocator, str, .nfc); |
| @@ -362,47 +365,42 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu | |||
| 362 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; | 365 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; |
| 363 | 366 | ||
| 364 | // Decompose first. | 367 | // Decompose first. |
| 365 | var d_result = if (form == .nfc) | 368 | var dcps = if (form == .nfc) |
| 366 | try self.nfd(allocator, str) | 369 | try self.nfxdCodePoints(allocator, str, .nfd) |
| 367 | else | 370 | else |
| 368 | try self.nfkd(allocator, str); | 371 | try self.nfxdCodePoints(allocator, str, .nfkd); |
| 369 | defer d_result.deinit(); | 372 | defer allocator.free(dcps); |
| 370 | |||
| 371 | // Get code points. | ||
| 372 | var cp_iter = CodePointIterator{ .bytes = d_result.slice }; | ||
| 373 | |||
| 374 | var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len); | ||
| 375 | defer d_list.deinit(); | ||
| 376 | |||
| 377 | while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code); | ||
| 378 | 373 | ||
| 379 | // Compose | 374 | // Compose |
| 380 | const tombstone = 0xe000; // Start of BMP Private Use Area | 375 | const tombstone = 0xe000; // Start of BMP Private Use Area |
| 381 | 376 | ||
| 377 | // Loop over all decomposed code points. | ||
| 382 | while (true) { | 378 | while (true) { |
| 383 | var i: usize = 1; // start at second code point. | 379 | var i: usize = 1; // start at second code point. |
| 384 | var deleted: usize = 0; | 380 | var deleted: usize = 0; |
| 385 | 381 | ||
| 386 | block_check: while (i < d_list.items.len) : (i += 1) { | 382 | // For each code point, C, find the preceding |
| 387 | const C = d_list.items[i]; | 383 | // starter code point L, if any. |
| 384 | block_check: while (i < dcps.len) : (i += 1) { | ||
| 385 | const C = dcps[i]; | ||
| 386 | if (C == tombstone) continue :block_check; | ||
| 388 | const cc_C = self.norm_data.ccc_data.ccc(C); | 387 | const cc_C = self.norm_data.ccc_data.ccc(C); |
| 389 | var starter_index: ?usize = null; | 388 | var starter_index: ?usize = null; |
| 390 | var j: usize = i; | 389 | var j: usize = i; |
| 391 | 390 | ||
| 391 | // Seek back to find starter L, if any. | ||
| 392 | while (true) { | 392 | while (true) { |
| 393 | j -= 1; | 393 | j -= 1; |
| 394 | if (dcps[j] == tombstone) continue; | ||
| 394 | 395 | ||
| 395 | // Check for starter. | 396 | // Check for starter. |
| 396 | if (self.norm_data.ccc_data.isStarter(d_list.items[j])) { | 397 | if (self.norm_data.ccc_data.isStarter(dcps[j])) { |
| 397 | if (i - j > 1) { // If there's distance between the starting point and the current position. | 398 | // Check for blocking conditions. |
| 398 | for (d_list.items[(j + 1)..i]) |B| { | 399 | for (dcps[(j + 1)..i]) |B| { |
| 399 | const cc_B = self.norm_data.ccc_data.ccc(B); | 400 | if (B == tombstone) continue; |
| 400 | // Check for blocking conditions. | 401 | const cc_B = self.norm_data.ccc_data.ccc(B); |
| 401 | if (self.isHangul(C)) { | 402 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; |
| 402 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; | 403 | if (cc_B >= cc_C) continue :block_check; |
| 403 | } | ||
| 404 | if (cc_B >= cc_C) continue :block_check; | ||
| 405 | } | ||
| 406 | } | 404 | } |
| 407 | 405 | ||
| 408 | // Found starter at j. | 406 | // Found starter at j. |
| @@ -413,37 +411,50 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu | |||
| 413 | if (j == 0) break; | 411 | if (j == 0) break; |
| 414 | } | 412 | } |
| 415 | 413 | ||
| 414 | // If we have a starter L, see if there's a primary | ||
| 415 | // composite, P, for the sequence L, C. If so, we must | ||
| 416 | // replace L with P and delete C. | ||
| 416 | if (starter_index) |sidx| { | 417 | if (starter_index) |sidx| { |
| 417 | const L = d_list.items[sidx]; | 418 | const L = dcps[sidx]; |
| 418 | var processed_hangul = false; | 419 | var processed_hangul = false; |
| 419 | 420 | ||
| 421 | // If L and C are Hangul syllables, we can compose | ||
| 422 | // them algorithmically if possible. | ||
| 420 | if (self.isHangul(L) and self.isHangul(C)) { | 423 | if (self.isHangul(L) and self.isHangul(C)) { |
| 424 | // Get Hangul syllable types. | ||
| 421 | const l_stype = self.norm_data.hangul_data.syllable(L); | 425 | const l_stype = self.norm_data.hangul_data.syllable(L); |
| 422 | const c_stype = self.norm_data.hangul_data.syllable(C); | 426 | const c_stype = self.norm_data.hangul_data.syllable(C); |
| 423 | 427 | ||
| 424 | if (l_stype == .LV and c_stype == .T) { | 428 | if (l_stype == .LV and c_stype == .T) { |
| 425 | // LV, T | 429 | // LV, T canonical composition. |
| 426 | d_list.items[sidx] = composeHangulCanon(L, C); | 430 | dcps[sidx] = composeHangulCanon(L, C); |
| 427 | d_list.items[i] = tombstone; // Mark for deletion. | 431 | dcps[i] = tombstone; // Mark for deletion. |
| 428 | processed_hangul = true; | 432 | processed_hangul = true; |
| 429 | } | 433 | } |
| 430 | 434 | ||
| 431 | if (l_stype == .L and c_stype == .V) { | 435 | if (l_stype == .L and c_stype == .V) { |
| 432 | // Handle L, V. L, V, T is handled via main loop. | 436 | // L, V full composition. L, V, T is handled via main loop. |
| 433 | d_list.items[sidx] = composeHangulFull(L, C, 0); | 437 | dcps[sidx] = composeHangulFull(L, C, 0); |
| 434 | d_list.items[i] = tombstone; // Mark for deletion. | 438 | dcps[i] = tombstone; // Mark for deletion. |
| 435 | processed_hangul = true; | 439 | processed_hangul = true; |
| 436 | } | 440 | } |
| 437 | 441 | ||
| 438 | if (processed_hangul) deleted += 1; | 442 | if (processed_hangul) deleted += 1; |
| 439 | } | 443 | } |
| 440 | 444 | ||
| 445 | // If no composition has occurred yet. | ||
| 441 | if (!processed_hangul) { | 446 | if (!processed_hangul) { |
| 442 | // L -> C not Hangul. | 447 | // L, C are not Hangul, so check for primary composite |
| 448 | // in the Unicode Character Database. | ||
| 443 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { | 449 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { |
| 450 | // We have a primary composite P for L, C. | ||
| 451 | // We must check if P is not in the Full | ||
| 452 | // Composition Exclusions (FCX) list, | ||
| 453 | // preventing it from appearing in any | ||
| 454 | // composed form (NFC, NFKC). | ||
| 444 | if (!self.norm_data.normp_data.isFcx(P)) { | 455 | if (!self.norm_data.normp_data.isFcx(P)) { |
| 445 | d_list.items[sidx] = P; | 456 | dcps[sidx] = P; |
| 446 | d_list.items[i] = tombstone; // Mark for deletion. | 457 | dcps[i] = tombstone; // Mark for deletion. |
| 447 | deleted += 1; | 458 | deleted += 1; |
| 448 | } | 459 | } |
| 449 | } | 460 | } |
| @@ -451,31 +462,35 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu | |||
| 451 | } | 462 | } |
| 452 | } | 463 | } |
| 453 | 464 | ||
| 454 | // Check if finished. | 465 | // If we have no deletions, the code point sequence |
| 466 | // has been fully composed. | ||
| 455 | if (deleted == 0) { | 467 | if (deleted == 0) { |
| 456 | var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4); | 468 | var cstr_list = std.ArrayList(u8).init(allocator); |
| 457 | defer cstr_list.deinit(); | 469 | defer cstr_list.deinit(); |
| 458 | var buf: [4]u8 = undefined; | 470 | var buf: [4]u8 = undefined; |
| 459 | 471 | ||
| 460 | for (d_list.items) |cp| { | 472 | for (dcps) |cp| { |
| 473 | if (cp == tombstone) continue; | ||
| 461 | if (cp == tombstone) continue; // "Delete" | 474 | if (cp == tombstone) continue; // "Delete" |
| 462 | const len = try unicode.utf8Encode(cp, &buf); | 475 | const len = try unicode.utf8Encode(cp, &buf); |
| 463 | cstr_list.appendSliceAssumeCapacity(buf[0..len]); | 476 | try cstr_list.appendSlice(buf[0..len]); |
| 464 | } | 477 | } |
| 465 | 478 | ||
| 466 | return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; | 479 | return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; |
| 467 | } | 480 | } |
| 468 | 481 | ||
| 469 | // Otherwise update code points list. | 482 | // We're still not finished, so create a new |
| 470 | var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted); | 483 | // code point sequence with the marked code points |
| 471 | defer tmp_d_list.deinit(); | 484 | // removed. |
| 472 | 485 | // var tmp_list = try std.ArrayList(u21).initCapacity(allocator, dcps.len - deleted); | |
| 473 | for (d_list.items) |cp| { | 486 | // defer tmp_list.deinit(); |
| 474 | if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp); | 487 | // |
| 475 | } | 488 | // for (dcps) |cp| { |
| 476 | 489 | // if (cp != tombstone) tmp_list.appendAssumeCapacity(cp); | |
| 477 | d_list.clearRetainingCapacity(); | 490 | // } |
| 478 | d_list.appendSliceAssumeCapacity(tmp_d_list.items); | 491 | // |
| 492 | // allocator.free(dcps); | ||
| 493 | // dcps = try tmp_list.toOwnedSlice(); | ||
| 479 | } | 494 | } |
| 480 | } | 495 | } |
| 481 | 496 | ||