diff options
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 191 |
1 files changed, 121 insertions, 70 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 89cc50c..d32ad52 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -3,7 +3,16 @@ | |||
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const assert = std.debug.assert; | ||
| 7 | const debug = std.debug; | ||
| 8 | const fmt = std.fmt; | ||
| 9 | const fs = std.fs; | ||
| 10 | const heap = std.heap; | ||
| 11 | const io = std.io; | ||
| 12 | const mem = std.mem; | ||
| 13 | const simd = std.simd; | ||
| 6 | const testing = std.testing; | 14 | const testing = std.testing; |
| 15 | const unicode = std.unicode; | ||
| 7 | 16 | ||
| 8 | const ascii = @import("ascii"); | 17 | const ascii = @import("ascii"); |
| 9 | const CodePointIterator = @import("code_point").Iterator; | 18 | const CodePointIterator = @import("code_point").Iterator; |
| @@ -50,20 +59,20 @@ fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { | |||
| 50 | } | 59 | } |
| 51 | 60 | ||
/// Composes a Hangul syllable from an already-composed `lv` syllable and a
/// trailing consonant jamo `t`.
///
/// `t` must lie in 0x11A8...0x11C2; this is asserted.
fn composeHangulCanon(lv: u21, t: u21) u21 {
    assert(t >= 0x11A8 and t <= 0x11C2);
    // Offset of the trailing jamo relative to `TBase`.
    const t_index = t - TBase;
    return lv + t_index;
}
| 56 | 65 | ||
| 57 | fn composeHangulFull(l: u21, v: u21, t: u21) u21 { | 66 | fn composeHangulFull(l: u21, v: u21, t: u21) u21 { |
| 58 | std.debug.assert(0x1100 <= l and l <= 0x1112); | 67 | assert(0x1100 <= l and l <= 0x1112); |
| 59 | std.debug.assert(0x1161 <= v and v <= 0x1175); | 68 | assert(0x1161 <= v and v <= 0x1175); |
| 60 | const LIndex = l - LBase; | 69 | const LIndex = l - LBase; |
| 61 | const VIndex = v - VBase; | 70 | const VIndex = v - VBase; |
| 62 | const LVIndex = LIndex * NCount + VIndex * TCount; | 71 | const LVIndex = LIndex * NCount + VIndex * TCount; |
| 63 | 72 | ||
| 64 | if (t == 0) return SBase + LVIndex; | 73 | if (t == 0) return SBase + LVIndex; |
| 65 | 74 | ||
| 66 | std.debug.assert(0x11A8 <= t and t <= 0x11C2); | 75 | assert(0x11A8 <= t and t <= 0x11C2); |
| 67 | const TIndex = t - TBase; | 76 | const TIndex = t - TBase; |
| 68 | 77 | ||
| 69 | return SBase + LVIndex + TIndex; | 78 | return SBase + LVIndex + TIndex; |
| @@ -175,45 +184,45 @@ test "decompose" { | |||
| 175 | var buf: [18]u21 = undefined; | 184 | var buf: [18]u21 = undefined; |
| 176 | 185 | ||
| 177 | var dc = n.decompose('é', .nfd, &buf); | 186 | var dc = n.decompose('é', .nfd, &buf); |
| 178 | try std.testing.expect(dc.form == .nfd); | 187 | try testing.expect(dc.form == .nfd); |
| 179 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); | 188 | try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); |
| 180 | 189 | ||
| 181 | dc = n.decompose('\u{1e0a}', .nfd, &buf); | 190 | dc = n.decompose('\u{1e0a}', .nfd, &buf); |
| 182 | try std.testing.expect(dc.form == .nfd); | 191 | try testing.expect(dc.form == .nfd); |
| 183 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | 192 | try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); |
| 184 | 193 | ||
| 185 | dc = n.decompose('\u{1e0a}', .nfkd, &buf); | 194 | dc = n.decompose('\u{1e0a}', .nfkd, &buf); |
| 186 | try std.testing.expect(dc.form == .nfkd); | 195 | try testing.expect(dc.form == .nfkd); |
| 187 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | 196 | try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); |
| 188 | 197 | ||
| 189 | dc = n.decompose('\u{3189}', .nfd, &buf); | 198 | dc = n.decompose('\u{3189}', .nfd, &buf); |
| 190 | try std.testing.expect(dc.form == .same); | 199 | try testing.expect(dc.form == .same); |
| 191 | try std.testing.expect(dc.cps.len == 0); | 200 | try testing.expect(dc.cps.len == 0); |
| 192 | 201 | ||
| 193 | dc = n.decompose('\u{3189}', .nfkd, &buf); | 202 | dc = n.decompose('\u{3189}', .nfkd, &buf); |
| 194 | try std.testing.expect(dc.form == .nfkd); | 203 | try testing.expect(dc.form == .nfkd); |
| 195 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); | 204 | try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); |
| 196 | 205 | ||
| 197 | dc = n.decompose('\u{ace1}', .nfd, &buf); | 206 | dc = n.decompose('\u{ace1}', .nfd, &buf); |
| 198 | try std.testing.expect(dc.form == .nfd); | 207 | try testing.expect(dc.form == .nfd); |
| 199 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | 208 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); |
| 200 | 209 | ||
| 201 | dc = n.decompose('\u{ace1}', .nfkd, &buf); | 210 | dc = n.decompose('\u{ace1}', .nfkd, &buf); |
| 202 | try std.testing.expect(dc.form == .nfd); | 211 | try testing.expect(dc.form == .nfd); |
| 203 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | 212 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); |
| 204 | 213 | ||
| 205 | dc = n.decompose('\u{3d3}', .nfd, &buf); | 214 | dc = n.decompose('\u{3d3}', .nfd, &buf); |
| 206 | try std.testing.expect(dc.form == .nfd); | 215 | try testing.expect(dc.form == .nfd); |
| 207 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); | 216 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); |
| 208 | 217 | ||
| 209 | dc = n.decompose('\u{3d3}', .nfkd, &buf); | 218 | dc = n.decompose('\u{3d3}', .nfkd, &buf); |
| 210 | try std.testing.expect(dc.form == .nfkd); | 219 | try testing.expect(dc.form == .nfkd); |
| 211 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); | 220 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); |
| 212 | } | 221 | } |
| 213 | 222 | ||
| 214 | /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. | 223 | /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. |
| 215 | pub const Result = struct { | 224 | pub const Result = struct { |
| 216 | allocator: ?std.mem.Allocator = null, | 225 | allocator: ?mem.Allocator = null, |
| 217 | slice: []const u8, | 226 | slice: []const u8, |
| 218 | 227 | ||
| 219 | pub fn deinit(self: *Result) void { | 228 | pub fn deinit(self: *Result) void { |
| @@ -232,25 +241,25 @@ fn canonicalSort(self: Self, cps: []u21) void { | |||
| 232 | while (i < cps.len) : (i += 1) { | 241 | while (i < cps.len) : (i += 1) { |
| 233 | const start: usize = i; | 242 | const start: usize = i; |
| 234 | while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 243 | while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 235 | std.mem.sort(u21, cps[start..i], self, cccLess); | 244 | mem.sort(u21, cps[start..i], self, cccLess); |
| 236 | } | 245 | } |
| 237 | } | 246 | } |
| 238 | 247 | ||
/// Normalizes `str` to NFD (canonical decomposition).
///
/// Call `deinit` on the returned `Result` to release any memory it allocated.
/// ASCII-only input is handed back as-is without allocating.
pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    const form: Form = .nfd;
    return nfxd(self, allocator, str, form);
}
| 243 | 252 | ||
/// Normalizes `str` to NFKD (compatibility decomposition).
///
/// Call `deinit` on the returned `Result` to release any memory it allocated.
/// ASCII-only input is handed back as-is without allocating.
pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    const form: Form = .nfkd;
    return nfxd(self, allocator, str, form);
}
| 248 | 257 | ||
| 249 | fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | 258 | fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { |
| 250 | // Quick checks. | 259 | // Quick checks. |
| 251 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 260 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 252 | 261 | ||
| 253 | var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len * 3); | 262 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 254 | defer dcp_list.deinit(); | 263 | defer dcp_list.deinit(); |
| 255 | 264 | ||
| 256 | var cp_iter = CodePointIterator{ .bytes = str }; | 265 | var cp_iter = CodePointIterator{ .bytes = str }; |
| @@ -272,7 +281,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 272 | 281 | ||
| 273 | var buf: [4]u8 = undefined; | 282 | var buf: [4]u8 = undefined; |
| 274 | for (dcp_list.items) |dcp| { | 283 | for (dcp_list.items) |dcp| { |
| 275 | const len = try std.unicode.utf8Encode(dcp, &buf); | 284 | const len = try unicode.utf8Encode(dcp, &buf); |
| 276 | dstr_list.appendSliceAssumeCapacity(buf[0..len]); | 285 | dstr_list.appendSliceAssumeCapacity(buf[0..len]); |
| 277 | } | 286 | } |
| 278 | 287 | ||
| @@ -288,7 +297,7 @@ test "nfd ASCII / no-alloc" { | |||
| 288 | var result = try n.nfd(allocator, "Hello World!"); | 297 | var result = try n.nfd(allocator, "Hello World!"); |
| 289 | defer result.deinit(); | 298 | defer result.deinit(); |
| 290 | 299 | ||
| 291 | try std.testing.expectEqualStrings("Hello World!", result.slice); | 300 | try testing.expectEqualStrings("Hello World!", result.slice); |
| 292 | } | 301 | } |
| 293 | 302 | ||
| 294 | test "nfd !ASCII / alloc" { | 303 | test "nfd !ASCII / alloc" { |
| @@ -300,7 +309,7 @@ test "nfd !ASCII / alloc" { | |||
| 300 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 309 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| 301 | defer result.deinit(); | 310 | defer result.deinit(); |
| 302 | 311 | ||
| 303 | try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); | 312 | try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); |
| 304 | } | 313 | } |
| 305 | 314 | ||
| 306 | test "nfkd ASCII / no-alloc" { | 315 | test "nfkd ASCII / no-alloc" { |
| @@ -312,7 +321,7 @@ test "nfkd ASCII / no-alloc" { | |||
| 312 | var result = try n.nfkd(allocator, "Hello World!"); | 321 | var result = try n.nfkd(allocator, "Hello World!"); |
| 313 | defer result.deinit(); | 322 | defer result.deinit(); |
| 314 | 323 | ||
| 315 | try std.testing.expectEqualStrings("Hello World!", result.slice); | 324 | try testing.expectEqualStrings("Hello World!", result.slice); |
| 316 | } | 325 | } |
| 317 | 326 | ||
| 318 | test "nfkd !ASCII / alloc" { | 327 | test "nfkd !ASCII / alloc" { |
| @@ -324,7 +333,7 @@ test "nfkd !ASCII / alloc" { | |||
| 324 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 333 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| 325 | defer result.deinit(); | 334 | defer result.deinit(); |
| 326 | 335 | ||
| 327 | try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | 336 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); |
| 328 | } | 337 | } |
| 329 | 338 | ||
| 330 | // Composition utilities. | 339 | // Composition utilities. |
| @@ -338,18 +347,19 @@ fn isNonHangulStarter(self: Self, cp: u21) bool { | |||
| 338 | } | 347 | } |
| 339 | 348 | ||
/// Normalizes `str` to NFC (canonical decomposition followed by composition).
///
/// Call `deinit` on the returned `Result` to release any memory it allocated.
pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    const form: Form = .nfc;
    return nfxc(self, allocator, str, form);
}
| 344 | 353 | ||
/// Normalizes `str` to NFKC (compatibility decomposition followed by composition).
///
/// Call `deinit` on the returned `Result` to release any memory it allocated.
pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    const form: Form = .nfkc;
    return nfxc(self, allocator, str, form);
}
| 349 | 358 | ||
| 350 | fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | 359 | fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result { |
| 351 | // Quick checks. | 360 | // Quick checks. |
| 352 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 361 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 362 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; | ||
| 353 | 363 | ||
| 354 | // Decompose first. | 364 | // Decompose first. |
| 355 | var d_result = if (form == .nfc) | 365 | var d_result = if (form == .nfc) |
| @@ -449,7 +459,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 449 | 459 | ||
| 450 | for (d_list.items) |cp| { | 460 | for (d_list.items) |cp| { |
| 451 | if (cp == tombstone) continue; // "Delete" | 461 | if (cp == tombstone) continue; // "Delete" |
| 452 | const len = try std.unicode.utf8Encode(cp, &buf); | 462 | const len = try unicode.utf8Encode(cp, &buf); |
| 453 | cstr_list.appendSliceAssumeCapacity(buf[0..len]); | 463 | cstr_list.appendSliceAssumeCapacity(buf[0..len]); |
| 454 | } | 464 | } |
| 455 | 465 | ||
| @@ -478,7 +488,7 @@ test "nfc" { | |||
| 478 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 488 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| 479 | defer result.deinit(); | 489 | defer result.deinit(); |
| 480 | 490 | ||
| 481 | try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); | 491 | try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); |
| 482 | } | 492 | } |
| 483 | 493 | ||
| 484 | test "nfkc" { | 494 | test "nfkc" { |
| @@ -490,17 +500,17 @@ test "nfkc" { | |||
| 490 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 500 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| 491 | defer result.deinit(); | 501 | defer result.deinit(); |
| 492 | 502 | ||
| 493 | try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | 503 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); |
| 494 | } | 504 | } |
| 495 | 505 | ||
/// Reports whether `a` and `b` are equal once both are normalized to NFD.
///
/// Both intermediate results are released before returning. Returns any error
/// produced by `nfd`.
pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
    var lhs = try nfd(self, allocator, a);
    defer lhs.deinit();

    var rhs = try nfd(self, allocator, b);
    defer rhs.deinit();

    // Byte-wise comparison of the two NFD forms.
    const same = mem.eql(u8, lhs.slice, rhs.slice);
    return same;
}
| 505 | 515 | ||
| 506 | test "eql" { | 516 | test "eql" { |
| @@ -509,8 +519,8 @@ test "eql" { | |||
| 509 | defer data.deinit(); | 519 | defer data.deinit(); |
| 510 | var n = Self{ .norm_data = &data }; | 520 | var n = Self{ .norm_data = &data }; |
| 511 | 521 | ||
| 512 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 522 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| 513 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 523 | try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| 514 | } | 524 | } |
| 515 | 525 | ||
| 516 | // FCD | 526 | // FCD |
| @@ -545,17 +555,17 @@ test "isFcd" { | |||
| 545 | var n = Self{ .norm_data = &data }; | 555 | var n = Self{ .norm_data = &data }; |
| 546 | 556 | ||
| 547 | const is_nfc = "José \u{3D3}"; | 557 | const is_nfc = "José \u{3D3}"; |
| 548 | try std.testing.expect(n.isFcd(is_nfc)); | 558 | try testing.expect(n.isFcd(is_nfc)); |
| 549 | 559 | ||
| 550 | const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; | 560 | const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; |
| 551 | try std.testing.expect(n.isFcd(is_nfd)); | 561 | try testing.expect(n.isFcd(is_nfd)); |
| 552 | 562 | ||
| 553 | const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; | 563 | const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; |
| 554 | try std.testing.expect(!n.isFcd(not_fcd)); | 564 | try testing.expect(!n.isFcd(not_fcd)); |
| 555 | } | 565 | } |
| 556 | 566 | ||
| 557 | test "Unicode normalization tests" { | 567 | test "Unicode normalization tests" { |
| 558 | var arena = std.heap.ArenaAllocator.init(std.testing.allocator); | 568 | var arena = heap.ArenaAllocator.init(testing.allocator); |
| 559 | defer arena.deinit(); | 569 | defer arena.deinit(); |
| 560 | var allocator = arena.allocator(); | 570 | var allocator = arena.allocator(); |
| 561 | 571 | ||
| @@ -563,9 +573,9 @@ test "Unicode normalization tests" { | |||
| 563 | defer data.deinit(); | 573 | defer data.deinit(); |
| 564 | var n = Self{ .norm_data = &data }; | 574 | var n = Self{ .norm_data = &data }; |
| 565 | 575 | ||
| 566 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 576 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
| 567 | defer file.close(); | 577 | defer file.close(); |
| 568 | var buf_reader = std.io.bufferedReader(file.reader()); | 578 | var buf_reader = io.bufferedReader(file.reader()); |
| 569 | const input_stream = buf_reader.reader(); | 579 | const input_stream = buf_reader.reader(); |
| 570 | 580 | ||
| 571 | var line_no: usize = 0; | 581 | var line_no: usize = 0; |
| @@ -577,7 +587,7 @@ test "Unicode normalization tests" { | |||
| 577 | // Skip comments or empty lines. | 587 | // Skip comments or empty lines. |
| 578 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | 588 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; |
| 579 | // Iterate over fields. | 589 | // Iterate over fields. |
| 580 | var fields = std.mem.split(u8, line, ";"); | 590 | var fields = mem.split(u8, line, ";"); |
| 581 | var field_index: usize = 0; | 591 | var field_index: usize = 0; |
| 582 | var input: []u8 = undefined; | 592 | var input: []u8 = undefined; |
| 583 | defer allocator.free(input); | 593 | defer allocator.free(input); |
| @@ -587,24 +597,24 @@ test "Unicode normalization tests" { | |||
| 587 | var i_buf = std.ArrayList(u8).init(allocator); | 597 | var i_buf = std.ArrayList(u8).init(allocator); |
| 588 | defer i_buf.deinit(); | 598 | defer i_buf.deinit(); |
| 589 | 599 | ||
| 590 | var i_fields = std.mem.split(u8, field, " "); | 600 | var i_fields = mem.split(u8, field, " "); |
| 591 | while (i_fields.next()) |s| { | 601 | while (i_fields.next()) |s| { |
| 592 | const icp = try std.fmt.parseInt(u21, s, 16); | 602 | const icp = try fmt.parseInt(u21, s, 16); |
| 593 | const len = try std.unicode.utf8Encode(icp, &cp_buf); | 603 | const len = try unicode.utf8Encode(icp, &cp_buf); |
| 594 | try i_buf.appendSlice(cp_buf[0..len]); | 604 | try i_buf.appendSlice(cp_buf[0..len]); |
| 595 | } | 605 | } |
| 596 | 606 | ||
| 597 | input = try i_buf.toOwnedSlice(); | 607 | input = try i_buf.toOwnedSlice(); |
| 598 | } else if (field_index == 1) { | 608 | } else if (field_index == 1) { |
| 599 | //std.debug.print("\n*** {s} ***\n", .{line}); | 609 | //debug.print("\n*** {s} ***\n", .{line}); |
| 600 | // NFC, time to test. | 610 | // NFC, time to test. |
| 601 | var w_buf = std.ArrayList(u8).init(allocator); | 611 | var w_buf = std.ArrayList(u8).init(allocator); |
| 602 | defer w_buf.deinit(); | 612 | defer w_buf.deinit(); |
| 603 | 613 | ||
| 604 | var w_fields = std.mem.split(u8, field, " "); | 614 | var w_fields = mem.split(u8, field, " "); |
| 605 | while (w_fields.next()) |s| { | 615 | while (w_fields.next()) |s| { |
| 606 | const wcp = try std.fmt.parseInt(u21, s, 16); | 616 | const wcp = try fmt.parseInt(u21, s, 16); |
| 607 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | 617 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| 608 | try w_buf.appendSlice(cp_buf[0..len]); | 618 | try w_buf.appendSlice(cp_buf[0..len]); |
| 609 | } | 619 | } |
| 610 | 620 | ||
| @@ -612,16 +622,16 @@ test "Unicode normalization tests" { | |||
| 612 | var got = try n.nfc(allocator, input); | 622 | var got = try n.nfc(allocator, input); |
| 613 | defer got.deinit(); | 623 | defer got.deinit(); |
| 614 | 624 | ||
| 615 | try std.testing.expectEqualStrings(want, got.slice); | 625 | try testing.expectEqualStrings(want, got.slice); |
| 616 | } else if (field_index == 2) { | 626 | } else if (field_index == 2) { |
| 617 | // NFD, time to test. | 627 | // NFD, time to test. |
| 618 | var w_buf = std.ArrayList(u8).init(allocator); | 628 | var w_buf = std.ArrayList(u8).init(allocator); |
| 619 | defer w_buf.deinit(); | 629 | defer w_buf.deinit(); |
| 620 | 630 | ||
| 621 | var w_fields = std.mem.split(u8, field, " "); | 631 | var w_fields = mem.split(u8, field, " "); |
| 622 | while (w_fields.next()) |s| { | 632 | while (w_fields.next()) |s| { |
| 623 | const wcp = try std.fmt.parseInt(u21, s, 16); | 633 | const wcp = try fmt.parseInt(u21, s, 16); |
| 624 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | 634 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| 625 | try w_buf.appendSlice(cp_buf[0..len]); | 635 | try w_buf.appendSlice(cp_buf[0..len]); |
| 626 | } | 636 | } |
| 627 | 637 | ||
| @@ -629,16 +639,16 @@ test "Unicode normalization tests" { | |||
| 629 | var got = try n.nfd(allocator, input); | 639 | var got = try n.nfd(allocator, input); |
| 630 | defer got.deinit(); | 640 | defer got.deinit(); |
| 631 | 641 | ||
| 632 | try std.testing.expectEqualStrings(want, got.slice); | 642 | try testing.expectEqualStrings(want, got.slice); |
| 633 | } else if (field_index == 3) { | 643 | } else if (field_index == 3) { |
| 634 | // NFKC, time to test. | 644 | // NFKC, time to test. |
| 635 | var w_buf = std.ArrayList(u8).init(allocator); | 645 | var w_buf = std.ArrayList(u8).init(allocator); |
| 636 | defer w_buf.deinit(); | 646 | defer w_buf.deinit(); |
| 637 | 647 | ||
| 638 | var w_fields = std.mem.split(u8, field, " "); | 648 | var w_fields = mem.split(u8, field, " "); |
| 639 | while (w_fields.next()) |s| { | 649 | while (w_fields.next()) |s| { |
| 640 | const wcp = try std.fmt.parseInt(u21, s, 16); | 650 | const wcp = try fmt.parseInt(u21, s, 16); |
| 641 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | 651 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| 642 | try w_buf.appendSlice(cp_buf[0..len]); | 652 | try w_buf.appendSlice(cp_buf[0..len]); |
| 643 | } | 653 | } |
| 644 | 654 | ||
| @@ -646,16 +656,16 @@ test "Unicode normalization tests" { | |||
| 646 | var got = try n.nfkc(allocator, input); | 656 | var got = try n.nfkc(allocator, input); |
| 647 | defer got.deinit(); | 657 | defer got.deinit(); |
| 648 | 658 | ||
| 649 | try std.testing.expectEqualStrings(want, got.slice); | 659 | try testing.expectEqualStrings(want, got.slice); |
| 650 | } else if (field_index == 4) { | 660 | } else if (field_index == 4) { |
| 651 | // NFKD, time to test. | 661 | // NFKD, time to test. |
| 652 | var w_buf = std.ArrayList(u8).init(allocator); | 662 | var w_buf = std.ArrayList(u8).init(allocator); |
| 653 | defer w_buf.deinit(); | 663 | defer w_buf.deinit(); |
| 654 | 664 | ||
| 655 | var w_fields = std.mem.split(u8, field, " "); | 665 | var w_fields = mem.split(u8, field, " "); |
| 656 | while (w_fields.next()) |s| { | 666 | while (w_fields.next()) |s| { |
| 657 | const wcp = try std.fmt.parseInt(u21, s, 16); | 667 | const wcp = try fmt.parseInt(u21, s, 16); |
| 658 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | 668 | const len = try unicode.utf8Encode(wcp, &cp_buf); |
| 659 | try w_buf.appendSlice(cp_buf[0..len]); | 669 | try w_buf.appendSlice(cp_buf[0..len]); |
| 660 | } | 670 | } |
| 661 | 671 | ||
| @@ -663,10 +673,51 @@ test "Unicode normalization tests" { | |||
| 663 | var got = try n.nfkd(allocator, input); | 673 | var got = try n.nfkd(allocator, input); |
| 664 | defer got.deinit(); | 674 | defer got.deinit(); |
| 665 | 675 | ||
| 666 | try std.testing.expectEqualStrings(want, got.slice); | 676 | try testing.expectEqualStrings(want, got.slice); |
| 667 | } else { | 677 | } else { |
| 668 | continue; | 678 | continue; |
| 669 | } | 679 | } |
| 670 | } | 680 | } |
| 671 | } | 681 | } |
| 672 | } | 682 | } |
| 683 | |||
/// Returns true if every code point in `str` lies in the Latin-1 range
/// U+0000...U+00FF (ASCII plus the Latin-1 Supplement block). An empty
/// string yields true.
///
/// When the target suggests a vector length for `u21`, code points are
/// checked a full vector at a time; any remainder is checked one by one.
pub fn isLatin1Only(str: []const u8) bool {
    // Last code point of Latin-1. U+0100 is already Latin Extended-A, so the
    // comparison must be against 0xFF, not 256.
    const latin1_max: u21 = 0xFF;

    var cp_iter = CodePointIterator{ .bytes = str };

    if (simd.suggestVectorLength(u21)) |vec_len| {
        const Vec = @Vector(vec_len, u21);
        const limit: Vec = @splat(latin1_max);

        outer: while (true) {
            var chunk: Vec = undefined;
            const saved_cp_i = cp_iter.i;

            for (0..vec_len) |i| {
                if (cp_iter.next()) |cp| {
                    chunk[i] = cp.code;
                } else {
                    // Not enough code points left to fill a vector: rewind
                    // to the start of this partial chunk and let the scalar
                    // loop below finish the job.
                    cp_iter.i = saved_cp_i;
                    break :outer;
                }
            }

            if (@reduce(.Or, chunk > limit)) return false;
        }
    }

    // Scalar path: the whole string when no vector length is suggested,
    // otherwise only the tail that did not fill a vector.
    return while (cp_iter.next()) |cp| {
        if (cp.code > latin1_max) break false;
    } else true;
}

test "isLatin1Only" {
    const latin1_only = "Hello, World! \u{fe} \u{ff}";
    try testing.expect(isLatin1Only(latin1_only));
    const not_latin1_only = "Héllo, World! \u{3d3}";
    try testing.expect(!isLatin1Only(not_latin1_only));

    // Boundaries: U+00FF is the last Latin-1 code point, U+0100 the first
    // one outside of it.
    try testing.expect(isLatin1Only(""));
    try testing.expect(isLatin1Only("\u{ff}"));
    try testing.expect(!isLatin1Only("\u{100}"));

    // Long inputs exercise the vectorized path; the offending code point is
    // placed both in a full chunk and in the scalar tail.
    const filler = "a" ** 64;
    try testing.expect(isLatin1Only(filler));
    try testing.expect(!isLatin1Only("\u{100}" ++ filler));
    try testing.expect(!isLatin1Only(filler ++ "\u{100}"));
}