diff options
Diffstat (limited to 'src/Normalize.zig')
| -rw-r--r-- | src/Normalize.zig | 193 |
1 files changed, 108 insertions, 85 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig index b738b27..4f014cf 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -2,23 +2,41 @@ | |||
| 2 | //! Unicode Normalization. You can normalize strings into NFC, | 2 | //! Unicode Normalization. You can normalize strings into NFC, |
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | canon_data: CanonData = undefined, |
| 6 | const debug = std.debug; | 6 | ccc_data: CccData = undefined, |
| 7 | const assert = debug.assert; | 7 | compat_data: CompatData = undefined, |
| 8 | const fmt = std.fmt; | 8 | hangul_data: HangulData = undefined, |
| 9 | const heap = std.heap; | 9 | normp_data: NormPropsData = undefined, |
| 10 | const mem = std.mem; | 10 | |
| 11 | const simd = std.simd; | 11 | const Normalize = @This(); |
| 12 | const testing = std.testing; | 12 | |
| 13 | const unicode = std.unicode; | 13 | pub fn init(allocator: Allocator) !Normalize { |
| 14 | 14 | var norm: Normalize = undefined; | |
| 15 | const ascii = @import("ascii"); | 15 | try norm.setup(allocator); |
| 16 | const CodePointIterator = @import("code_point").Iterator; | 16 | return norm; |
| 17 | pub const NormData = @import("NormData"); | 17 | } |
| 18 | 18 | ||
| 19 | norm_data: *const NormData, | 19 | pub fn setup(self: *Normalize, allocator: Allocator) !void { |
| 20 | self.canon_data = try CanonData.init(allocator); | ||
| 21 | errdefer self.canon_data.deinit(allocator); | ||
| 22 | self.ccc_data = try CccData.init(allocator); | ||
| 23 | errdefer self.ccc_data.deinit(allocator); | ||
| 24 | self.compat_data = try CompatData.init(allocator); | ||
| 25 | errdefer self.compat_data.deinit(allocator); | ||
| 26 | self.hangul_data = try HangulData.init(allocator); | ||
| 27 | errdefer self.hangul_data.deinit(allocator); | ||
| 28 | self.normp_data = try NormPropsData.init(allocator); | ||
| 29 | } | ||
| 20 | 30 | ||
| 21 | const Self = @This(); | 31 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { |
| 32 | // Reasonably safe (?) | ||
| 33 | var mut_norm = @constCast(norm); | ||
| 34 | mut_norm.canon_data.deinit(allocator); | ||
| 35 | mut_norm.ccc_data.deinit(allocator); | ||
| 36 | mut_norm.compat_data.deinit(allocator); | ||
| 37 | mut_norm.hangul_data.deinit(allocator); | ||
| 38 | mut_norm.normp_data.deinit(allocator); | ||
| 39 | } | ||
| 22 | 40 | ||
| 23 | const SBase: u21 = 0xAC00; | 41 | const SBase: u21 = 0xAC00; |
| 24 | const LBase: u21 = 0x1100; | 42 | const LBase: u21 = 0x1100; |
| @@ -30,8 +48,8 @@ const TCount: u21 = 28; | |||
| 30 | const NCount: u21 = 588; // VCount * TCount | 48 | const NCount: u21 = 588; // VCount * TCount |
| 31 | const SCount: u21 = 11172; // LCount * NCount | 49 | const SCount: u21 = 11172; // LCount * NCount |
| 32 | 50 | ||
| 33 | fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { | 51 | fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { |
| 34 | const kind = self.norm_data.hangul_data.syllable(cp); | 52 | const kind = self.hangul_data.syllable(cp); |
| 35 | if (kind != .LV and kind != .LVT) return null; | 53 | if (kind != .LV and kind != .LVT) return null; |
| 36 | 54 | ||
| 37 | const SIndex: u21 = cp - SBase; | 55 | const SIndex: u21 = cp - SBase; |
| @@ -90,21 +108,21 @@ const Decomp = struct { | |||
| 90 | }; | 108 | }; |
| 91 | 109 | ||
| 92 | // `mapping` retrieves the decomposition mapping for a code point as per the UCD. | 110 | // `mapping` retrieves the decomposition mapping for a code point as per the UCD. |
| 93 | fn mapping(self: Self, cp: u21, form: Form) Decomp { | 111 | fn mapping(self: Normalize, cp: u21, form: Form) Decomp { |
| 94 | var dc = Decomp{}; | 112 | var dc = Decomp{}; |
| 95 | 113 | ||
| 96 | switch (form) { | 114 | switch (form) { |
| 97 | .nfd => { | 115 | .nfd => { |
| 98 | dc.cps = self.norm_data.canon_data.toNfd(cp); | 116 | dc.cps = self.canon_data.toNfd(cp); |
| 99 | if (dc.cps.len != 0) dc.form = .nfd; | 117 | if (dc.cps.len != 0) dc.form = .nfd; |
| 100 | }, | 118 | }, |
| 101 | 119 | ||
| 102 | .nfkd => { | 120 | .nfkd => { |
| 103 | dc.cps = self.norm_data.compat_data.toNfkd(cp); | 121 | dc.cps = self.compat_data.toNfkd(cp); |
| 104 | if (dc.cps.len != 0) { | 122 | if (dc.cps.len != 0) { |
| 105 | dc.form = .nfkd; | 123 | dc.form = .nfkd; |
| 106 | } else { | 124 | } else { |
| 107 | dc.cps = self.norm_data.canon_data.toNfd(cp); | 125 | dc.cps = self.canon_data.toNfd(cp); |
| 108 | if (dc.cps.len != 0) dc.form = .nfkd; | 126 | if (dc.cps.len != 0) dc.form = .nfkd; |
| 109 | } | 127 | } |
| 110 | }, | 128 | }, |
| @@ -117,7 +135,7 @@ fn mapping(self: Self, cp: u21, form: Form) Decomp { | |||
| 117 | 135 | ||
| 118 | // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. | 136 | // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. |
| 119 | fn decompose( | 137 | fn decompose( |
| 120 | self: Self, | 138 | self: Normalize, |
| 121 | cp: u21, | 139 | cp: u21, |
| 122 | form: Form, | 140 | form: Form, |
| 123 | buf: []u21, | 141 | buf: []u21, |
| @@ -127,8 +145,8 @@ fn decompose( | |||
| 127 | 145 | ||
| 128 | // NFD / NFKD quick checks. | 146 | // NFD / NFKD quick checks. |
| 129 | switch (form) { | 147 | switch (form) { |
| 130 | .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{}, | 148 | .nfd => if (self.normp_data.isNfd(cp)) return .{}, |
| 131 | .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{}, | 149 | .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, |
| 132 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), | 150 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), |
| 133 | } | 151 | } |
| 134 | 152 | ||
| @@ -175,10 +193,8 @@ fn decompose( | |||
| 175 | 193 | ||
| 176 | test "decompose" { | 194 | test "decompose" { |
| 177 | const allocator = testing.allocator; | 195 | const allocator = testing.allocator; |
| 178 | var data: NormData = undefined; | 196 | const n = try Normalize.init(allocator); |
| 179 | try NormData.init(&data, allocator); | 197 | defer n.deinit(allocator); |
| 180 | defer data.deinit(allocator); | ||
| 181 | var n = Self{ .norm_data = &data }; | ||
| 182 | 198 | ||
| 183 | var buf: [18]u21 = undefined; | 199 | var buf: [18]u21 = undefined; |
| 184 | 200 | ||
| @@ -228,42 +244,42 @@ pub const Result = struct { | |||
| 228 | slice: []const u8, | 244 | slice: []const u8, |
| 229 | 245 | ||
| 230 | /// Ensures that the slice result is a copy of the input, by making a copy if it was not. | 246 | /// Ensures that the slice result is a copy of the input, by making a copy if it was not. |
| 231 | pub fn toOwned(result: Result, allocator: mem.Allocator) error{OutOfMemory}!Result { | 247 | pub fn toOwned(result: Result, allocator: Allocator) error{OutOfMemory}!Result { |
| 232 | if (result.allocated) return result; | 248 | if (result.allocated) return result; |
| 233 | return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; | 249 | return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; |
| 234 | } | 250 | } |
| 235 | 251 | ||
| 236 | pub fn deinit(self: *const Result, allocator: mem.Allocator) void { | 252 | pub fn deinit(self: *const Result, allocator: Allocator) void { |
| 237 | if (self.allocated) allocator.free(self.slice); | 253 | if (self.allocated) allocator.free(self.slice); |
| 238 | } | 254 | } |
| 239 | }; | 255 | }; |
| 240 | 256 | ||
| 241 | // Compares code points by Canonical Combining Class order. | 257 | // Compares code points by Canonical Combining Class order. |
| 242 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { | 258 | fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { |
| 243 | return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); | 259 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); |
| 244 | } | 260 | } |
| 245 | 261 | ||
| 246 | // Applies the Canonical Sorting Algorithm. | 262 | // Applies the Canonical Sorting Algorithm. |
| 247 | fn canonicalSort(self: Self, cps: []u21) void { | 263 | fn canonicalSort(self: Normalize, cps: []u21) void { |
| 248 | var i: usize = 0; | 264 | var i: usize = 0; |
| 249 | while (i < cps.len) : (i += 1) { | 265 | while (i < cps.len) : (i += 1) { |
| 250 | const start: usize = i; | 266 | const start: usize = i; |
| 251 | while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 267 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 252 | mem.sort(u21, cps[start..i], self, cccLess); | 268 | mem.sort(u21, cps[start..i], self, cccLess); |
| 253 | } | 269 | } |
| 254 | } | 270 | } |
| 255 | 271 | ||
| 256 | /// Normalize `str` to NFD. | 272 | /// Normalize `str` to NFD. |
| 257 | pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 273 | pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 258 | return self.nfxd(allocator, str, .nfd); | 274 | return self.nfxd(allocator, str, .nfd); |
| 259 | } | 275 | } |
| 260 | 276 | ||
| 261 | /// Normalize `str` to NFKD. | 277 | /// Normalize `str` to NFKD. |
| 262 | pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 278 | pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 263 | return self.nfxd(allocator, str, .nfkd); | 279 | return self.nfxd(allocator, str, .nfkd); |
| 264 | } | 280 | } |
| 265 | 281 | ||
| 266 | pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 { | 282 | pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { |
| 267 | var dcp_list = std.ArrayList(u21).init(allocator); | 283 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 268 | defer dcp_list.deinit(); | 284 | defer dcp_list.deinit(); |
| 269 | 285 | ||
| @@ -284,7 +300,7 @@ pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, for | |||
| 284 | return try dcp_list.toOwnedSlice(); | 300 | return try dcp_list.toOwnedSlice(); |
| 285 | } | 301 | } |
| 286 | 302 | ||
| 287 | fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { | 303 | fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { |
| 288 | // Quick checks. | 304 | // Quick checks. |
| 289 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 305 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 290 | 306 | ||
| @@ -305,10 +321,8 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 305 | 321 | ||
| 306 | test "nfd ASCII / no-alloc" { | 322 | test "nfd ASCII / no-alloc" { |
| 307 | const allocator = testing.allocator; | 323 | const allocator = testing.allocator; |
| 308 | var data: NormData = undefined; | 324 | const n = try Normalize.init(allocator); |
| 309 | try NormData.init(&data, allocator); | 325 | defer n.deinit(allocator); |
| 310 | defer data.deinit(allocator); | ||
| 311 | const n = Self{ .norm_data = &data }; | ||
| 312 | 326 | ||
| 313 | const result = try n.nfd(allocator, "Hello World!"); | 327 | const result = try n.nfd(allocator, "Hello World!"); |
| 314 | defer result.deinit(allocator); | 328 | defer result.deinit(allocator); |
| @@ -318,10 +332,8 @@ test "nfd ASCII / no-alloc" { | |||
| 318 | 332 | ||
| 319 | test "nfd !ASCII / alloc" { | 333 | test "nfd !ASCII / alloc" { |
| 320 | const allocator = testing.allocator; | 334 | const allocator = testing.allocator; |
| 321 | var data: NormData = undefined; | 335 | const n = try Normalize.init(allocator); |
| 322 | try NormData.init(&data, allocator); | 336 | defer n.deinit(allocator); |
| 323 | defer data.deinit(allocator); | ||
| 324 | const n = Self{ .norm_data = &data }; | ||
| 325 | 337 | ||
| 326 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 338 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| 327 | defer result.deinit(allocator); | 339 | defer result.deinit(allocator); |
| @@ -331,10 +343,8 @@ test "nfd !ASCII / alloc" { | |||
| 331 | 343 | ||
| 332 | test "nfkd ASCII / no-alloc" { | 344 | test "nfkd ASCII / no-alloc" { |
| 333 | const allocator = testing.allocator; | 345 | const allocator = testing.allocator; |
| 334 | var data: NormData = undefined; | 346 | const n = try Normalize.init(allocator); |
| 335 | try NormData.init(&data, allocator); | 347 | defer n.deinit(allocator); |
| 336 | defer data.deinit(allocator); | ||
| 337 | const n = Self{ .norm_data = &data }; | ||
| 338 | 348 | ||
| 339 | const result = try n.nfkd(allocator, "Hello World!"); | 349 | const result = try n.nfkd(allocator, "Hello World!"); |
| 340 | defer result.deinit(allocator); | 350 | defer result.deinit(allocator); |
| @@ -344,10 +354,8 @@ test "nfkd ASCII / no-alloc" { | |||
| 344 | 354 | ||
| 345 | test "nfkd !ASCII / alloc" { | 355 | test "nfkd !ASCII / alloc" { |
| 346 | const allocator = testing.allocator; | 356 | const allocator = testing.allocator; |
| 347 | var data: NormData = undefined; | 357 | const n = try Normalize.init(allocator); |
| 348 | try NormData.init(&data, allocator); | 358 | defer n.deinit(allocator); |
| 349 | defer data.deinit(allocator); | ||
| 350 | const n = Self{ .norm_data = &data }; | ||
| 351 | 359 | ||
| 352 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 360 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| 353 | defer result.deinit(allocator); | 361 | defer result.deinit(allocator); |
| @@ -356,10 +364,10 @@ test "nfkd !ASCII / alloc" { | |||
| 356 | } | 364 | } |
| 357 | 365 | ||
| 358 | pub fn nfdCodePoints( | 366 | pub fn nfdCodePoints( |
| 359 | self: Self, | 367 | self: Normalize, |
| 360 | allocator: mem.Allocator, | 368 | allocator: Allocator, |
| 361 | cps: []const u21, | 369 | cps: []const u21, |
| 362 | ) mem.Allocator.Error![]u21 { | 370 | ) Allocator.Error![]u21 { |
| 363 | var dcp_list = std.ArrayList(u21).init(allocator); | 371 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 364 | defer dcp_list.deinit(); | 372 | defer dcp_list.deinit(); |
| 365 | 373 | ||
| @@ -381,10 +389,10 @@ pub fn nfdCodePoints( | |||
| 381 | } | 389 | } |
| 382 | 390 | ||
| 383 | pub fn nfkdCodePoints( | 391 | pub fn nfkdCodePoints( |
| 384 | self: Self, | 392 | self: Normalize, |
| 385 | allocator: mem.Allocator, | 393 | allocator: Allocator, |
| 386 | cps: []const u21, | 394 | cps: []const u21, |
| 387 | ) mem.Allocator.Error![]u21 { | 395 | ) Allocator.Error![]u21 { |
| 388 | var dcp_list = std.ArrayList(u21).init(allocator); | 396 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 389 | defer dcp_list.deinit(); | 397 | defer dcp_list.deinit(); |
| 390 | 398 | ||
| @@ -407,21 +415,21 @@ pub fn nfkdCodePoints( | |||
| 407 | 415 | ||
| 408 | // Composition (NFC, NFKC) | 416 | // Composition (NFC, NFKC) |
| 409 | 417 | ||
| 410 | fn isHangul(self: Self, cp: u21) bool { | 418 | fn isHangul(self: Normalize, cp: u21) bool { |
| 411 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; | 419 | return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; |
| 412 | } | 420 | } |
| 413 | 421 | ||
| 414 | /// Normalizes `str` to NFC. | 422 | /// Normalizes `str` to NFC. |
| 415 | pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 423 | pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 416 | return self.nfxc(allocator, str, .nfc); | 424 | return self.nfxc(allocator, str, .nfc); |
| 417 | } | 425 | } |
| 418 | 426 | ||
| 419 | /// Normalizes `str` to NFKC. | 427 | /// Normalizes `str` to NFKC. |
| 420 | pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 428 | pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 421 | return self.nfxc(allocator, str, .nfkc); | 429 | return self.nfxc(allocator, str, .nfkc); |
| 422 | } | 430 | } |
| 423 | 431 | ||
| 424 | fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { | 432 | fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { |
| 425 | // Quick checks. | 433 | // Quick checks. |
| 426 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 434 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 427 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; | 435 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; |
| @@ -446,7 +454,7 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 446 | block_check: while (i < dcps.len) : (i += 1) { | 454 | block_check: while (i < dcps.len) : (i += 1) { |
| 447 | const C = dcps[i]; | 455 | const C = dcps[i]; |
| 448 | if (C == tombstone) continue :block_check; | 456 | if (C == tombstone) continue :block_check; |
| 449 | const cc_C = self.norm_data.ccc_data.ccc(C); | 457 | const cc_C = self.ccc_data.ccc(C); |
| 450 | var starter_index: ?usize = null; | 458 | var starter_index: ?usize = null; |
| 451 | var j: usize = i; | 459 | var j: usize = i; |
| 452 | 460 | ||
| @@ -456,11 +464,11 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 456 | if (dcps[j] == tombstone) continue; | 464 | if (dcps[j] == tombstone) continue; |
| 457 | 465 | ||
| 458 | // Check for starter. | 466 | // Check for starter. |
| 459 | if (self.norm_data.ccc_data.isStarter(dcps[j])) { | 467 | if (self.ccc_data.isStarter(dcps[j])) { |
| 460 | // Check for blocking conditions. | 468 | // Check for blocking conditions. |
| 461 | for (dcps[(j + 1)..i]) |B| { | 469 | for (dcps[(j + 1)..i]) |B| { |
| 462 | if (B == tombstone) continue; | 470 | if (B == tombstone) continue; |
| 463 | const cc_B = self.norm_data.ccc_data.ccc(B); | 471 | const cc_B = self.ccc_data.ccc(B); |
| 464 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; | 472 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; |
| 465 | if (cc_B >= cc_C) continue :block_check; | 473 | if (cc_B >= cc_C) continue :block_check; |
| 466 | } | 474 | } |
| @@ -484,8 +492,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 484 | // them algorithmically if possible. | 492 | // them algorithmically if possible. |
| 485 | if (self.isHangul(L) and self.isHangul(C)) { | 493 | if (self.isHangul(L) and self.isHangul(C)) { |
| 486 | // Get Hangul syllable types. | 494 | // Get Hangul syllable types. |
| 487 | const l_stype = self.norm_data.hangul_data.syllable(L); | 495 | const l_stype = self.hangul_data.syllable(L); |
| 488 | const c_stype = self.norm_data.hangul_data.syllable(C); | 496 | const c_stype = self.hangul_data.syllable(C); |
| 489 | 497 | ||
| 490 | if (l_stype == .LV and c_stype == .T) { | 498 | if (l_stype == .LV and c_stype == .T) { |
| 491 | // LV, T canonical composition. | 499 | // LV, T canonical composition. |
| @@ -508,13 +516,13 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 508 | if (!processed_hangul) { | 516 | if (!processed_hangul) { |
| 509 | // L, C are not Hangul, so check for primary composite | 517 | // L, C are not Hangul, so check for primary composite |
| 510 | // in the Unicode Character Database. | 518 | // in the Unicode Character Database. |
| 511 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { | 519 | if (self.canon_data.toNfc(.{ L, C })) |P| { |
| 512 | // We have a primary composite P for L, C. | 520 | // We have a primary composite P for L, C. |
| 513 | // We must check if P is not in the Full | 521 | // We must check if P is not in the Full |
| 514 | // Composition Exclusions (FCX) list, | 522 | // Composition Exclusions (FCX) list, |
| 515 | // preventing it from appearing in any | 523 | // preventing it from appearing in any |
| 516 | // composed form (NFC, NFKC). | 524 | // composed form (NFC, NFKC). |
| 517 | if (!self.norm_data.normp_data.isFcx(P)) { | 525 | if (!self.normp_data.isFcx(P)) { |
| 518 | dcps[sidx] = P; | 526 | dcps[sidx] = P; |
| 519 | dcps[i] = tombstone; // Mark for deletion. | 527 | dcps[i] = tombstone; // Mark for deletion. |
| 520 | deleted += 1; | 528 | deleted += 1; |
| @@ -544,10 +552,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 544 | 552 | ||
| 545 | test "nfc" { | 553 | test "nfc" { |
| 546 | const allocator = testing.allocator; | 554 | const allocator = testing.allocator; |
| 547 | var data: NormData = undefined; | 555 | const n = try Normalize.init(allocator); |
| 548 | try NormData.init(&data, allocator); | 556 | defer n.deinit(allocator); |
| 549 | defer data.deinit(allocator); | ||
| 550 | const n = Self{ .norm_data = &data }; | ||
| 551 | 557 | ||
| 552 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 558 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| 553 | defer result.deinit(allocator); | 559 | defer result.deinit(allocator); |
| @@ -557,10 +563,8 @@ test "nfc" { | |||
| 557 | 563 | ||
| 558 | test "nfkc" { | 564 | test "nfkc" { |
| 559 | const allocator = testing.allocator; | 565 | const allocator = testing.allocator; |
| 560 | var data: NormData = undefined; | 566 | const n = try Normalize.init(allocator); |
| 561 | try NormData.init(&data, allocator); | 567 | defer n.deinit(allocator); |
| 562 | defer data.deinit(allocator); | ||
| 563 | const n = Self{ .norm_data = &data }; | ||
| 564 | 568 | ||
| 565 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 569 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| 566 | defer result.deinit(allocator); | 570 | defer result.deinit(allocator); |
| @@ -569,7 +573,7 @@ test "nfkc" { | |||
| 569 | } | 573 | } |
| 570 | 574 | ||
| 571 | /// Tests for equality of `a` and `b` after normalizing to NFC. | 575 | /// Tests for equality of `a` and `b` after normalizing to NFC. |
| 572 | pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { | 576 | pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { |
| 573 | const norm_result_a = try self.nfc(allocator, a); | 577 | const norm_result_a = try self.nfc(allocator, a); |
| 574 | defer norm_result_a.deinit(allocator); | 578 | defer norm_result_a.deinit(allocator); |
| 575 | const norm_result_b = try self.nfc(allocator, b); | 579 | const norm_result_b = try self.nfc(allocator, b); |
| @@ -580,10 +584,8 @@ pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) ! | |||
| 580 | 584 | ||
| 581 | test "eql" { | 585 | test "eql" { |
| 582 | const allocator = testing.allocator; | 586 | const allocator = testing.allocator; |
| 583 | var data: NormData = undefined; | 587 | const n = try Normalize.init(allocator); |
| 584 | try NormData.init(&data, allocator); | 588 | defer n.deinit(allocator); |
| 585 | defer data.deinit(allocator); | ||
| 586 | const n = Self{ .norm_data = &data }; | ||
| 587 | 589 | ||
| 588 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 590 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| 589 | try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 591 | try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -629,3 +631,24 @@ test "isLatin1Only" { | |||
| 629 | const not_latin1_only = "Héllo, World! \u{3d3}"; | 631 | const not_latin1_only = "Héllo, World! \u{3d3}"; |
| 630 | try testing.expect(!isLatin1Only(not_latin1_only)); | 632 | try testing.expect(!isLatin1Only(not_latin1_only)); |
| 631 | } | 633 | } |
| 634 | |||
| 635 | const std = @import("std"); | ||
| 636 | const debug = std.debug; | ||
| 637 | const assert = debug.assert; | ||
| 638 | const fmt = std.fmt; | ||
| 639 | const heap = std.heap; | ||
| 640 | const mem = std.mem; | ||
| 641 | const simd = std.simd; | ||
| 642 | const testing = std.testing; | ||
| 643 | const unicode = std.unicode; | ||
| 644 | const Allocator = std.mem.Allocator; | ||
| 645 | |||
| 646 | const ascii = @import("ascii"); | ||
| 647 | const CodePointIterator = @import("code_point").Iterator; | ||
| 648 | |||
| 649 | const CanonData = @import("CanonData"); | ||
| 650 | const CccData = @import("CombiningData"); | ||
| 651 | const CompatData = @import("CompatData"); | ||
| 652 | const FoldData = @import("FoldData"); | ||
| 653 | const HangulData = @import("HangulData"); | ||
| 654 | const NormPropsData = @import("NormPropsData"); | ||