diff options
| author | 2024-03-23 20:32:13 -0400 | |
|---|---|---|
| committer | 2024-03-23 20:32:13 -0400 | |
| commit | bcd79d29b316f636af9d21c8ace61e9ba93180d9 (patch) | |
| tree | cbe36a301c87b83506ba0c2183d86820cdd80e59 /src/Normalize.zig | |
| parent | Renamed Caser to Folder (diff) | |
| download | zg-bcd79d29b316f636af9d21c8ace61e9ba93180d9.tar.gz zg-bcd79d29b316f636af9d21c8ace61e9ba93180d9.tar.xz zg-bcd79d29b316f636af9d21c8ace61e9ba93180d9.zip | |
Rename CaseFold and Normalize
Diffstat (limited to 'src/Normalize.zig')
| -rw-r--r-- | src/Normalize.zig | 774 |
1 files changed, 774 insertions, 0 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig new file mode 100644 index 0000000..b5a54d1 --- /dev/null +++ b/src/Normalize.zig | |||
| @@ -0,0 +1,774 @@ | |||
| 1 | //! Normalizer contains functions and methods that implement | ||
| 2 | //! Unicode Normalization. You can normalize strings into NFC, | ||
| 3 | //! NFKC, NFD, and NFKD normalization forms. | ||
| 4 | |||
| 5 | const std = @import("std"); | ||
| 6 | const assert = std.debug.assert; | ||
| 7 | const debug = std.debug; | ||
| 8 | const fmt = std.fmt; | ||
| 9 | const fs = std.fs; | ||
| 10 | const heap = std.heap; | ||
| 11 | const io = std.io; | ||
| 12 | const mem = std.mem; | ||
| 13 | const simd = std.simd; | ||
| 14 | const testing = std.testing; | ||
| 15 | const unicode = std.unicode; | ||
| 16 | |||
| 17 | const ascii = @import("ascii"); | ||
| 18 | const CodePointIterator = @import("code_point").Iterator; | ||
| 19 | pub const NormData = @import("NormData"); | ||
| 20 | |||
/// Pointer to the Unicode data tables (decomposition mappings,
/// combining classes, Hangul syllable types, normalization
/// properties) that this normalizer reads from.
norm_data: *const NormData,

const Self = @This();

// Hangul syllable constants from the Unicode Standard, §3.12
// "Conjoining Jamo Behavior" (algorithmic (de)composition).
const SBase: u21 = 0xAC00; // First precomposed Hangul syllable.
const LBase: u21 = 0x1100; // First leading consonant (choseong) Jamo.
const VBase: u21 = 0x1161; // First vowel (jungseong) Jamo.
const TBase: u21 = 0x11A7; // One before the first trailing consonant (jongseong) Jamo.
const LCount: u21 = 19; // Number of leading consonants.
const VCount: u21 = 21; // Number of vowels.
const TCount: u21 = 28; // Number of trailing consonants (including "none").
const NCount: u21 = 588; // VCount * TCount
const SCount: u21 = 11172; // LCount * NCount
| 34 | |||
// Decomposes a precomposed Hangul syllable `cp` into its constituent
// Jamo (L, V and optionally T) per Unicode §3.12. Returns null when
// `cp` is not a precomposed (LV or LVT) syllable. The Jamo are written
// into `buf`, and the returned `Decomp.cps` slices into `buf`.
fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
    switch (self.norm_data.hangul_data.syllable(cp)) {
        .LV, .LVT => {},
        else => return null,
    }

    const s_index: u21 = cp - SBase;
    const t_index: u21 = s_index % TCount;

    buf[0] = LBase + s_index / NCount; // Leading consonant.
    buf[1] = VBase + (s_index % NCount) / TCount; // Vowel.

    // LV syllable: no trailing consonant.
    if (t_index == 0) return .{ .form = .nfd, .cps = buf[0..2] };

    // LVT syllable: append the trailing consonant.
    buf[2] = TBase + t_index;
    return .{ .form = .nfd, .cps = buf[0..3] };
}
| 60 | |||
// Canonically composes an LV Hangul syllable `lv` with a trailing
// consonant `t` into an LVT syllable. `t` must be a valid trailing
// Jamo (U+11A8..U+11C2).
fn composeHangulCanon(lv: u21, t: u21) u21 {
    assert(t >= 0x11A8 and t <= 0x11C2);
    return lv + (t - TBase);
}
| 65 | |||
// Fully composes leading consonant `l`, vowel `v`, and optional
// trailing consonant `t` (pass 0 for none) into a precomposed Hangul
// syllable, per Unicode §3.12.
fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
    assert(l >= 0x1100 and l <= 0x1112); // Valid leading Jamo.
    assert(v >= 0x1161 and v <= 0x1175); // Valid vowel Jamo.

    const lv_index = (l - LBase) * NCount + (v - VBase) * TCount;

    if (t == 0) return SBase + lv_index;

    assert(t >= 0x11A8 and t <= 0x11C2); // Valid trailing Jamo.
    return SBase + lv_index + (t - TBase);
}
| 80 | |||
/// Unicode normalization forms handled by this namespace.
const Form = enum {
    nfc, // Canonical decomposition followed by canonical composition.
    nfd, // Canonical decomposition.
    nfkc, // Compatibility decomposition followed by canonical composition.
    nfkd, // Compatibility decomposition.
    same, // No change required; already in the requested form.
};

/// A decomposition result: the form it satisfies and the resulting
/// code points. Defaults represent "no decomposition needed".
const Decomp = struct {
    form: Form = .same,
    cps: []const u21 = &.{},
};
| 93 | |||
/// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
/// Only `.nfd` and `.nfkd` are accepted for `form`; any other value
/// panics. Returns a default (`.same`, empty) Decomp when `cp` has no
/// mapping for the requested form.
pub fn mapping(self: Self, cp: u21, form: Form) Decomp {
    switch (form) {
        .nfd => {
            const cps = self.norm_data.canon_data.toNfd(cp);
            if (cps.len == 0) return .{};
            return .{ .form = .nfd, .cps = cps };
        },

        .nfkd => {
            // Prefer the compatibility mapping; fall back to the
            // canonical mapping when there is none.
            const kcps = self.norm_data.compat_data.toNfkd(cp);
            if (kcps.len != 0) return .{ .form = .nfkd, .cps = kcps };

            const cps = self.norm_data.canon_data.toNfd(cp);
            if (cps.len == 0) return .{};
            return .{ .form = .nfkd, .cps = cps };
        },

        else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."),
    }
}
| 119 | |||
/// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
/// The decomposed code points are written into `buf`, and the returned
/// `Decomp.cps` slices into `buf`; callers in this file pass an
/// 18-element buffer, matching the internal work stack below.
/// A `.same` result means `cp` needs no change in the requested form.
pub fn decompose(
    self: Self,
    cp: u21,
    form: Form,
    buf: []u21,
) Decomp {
    // ASCII never decomposes.
    if (cp < 128) return .{};

    // NFD / NFKD quick checks: the UCD property tables tell us when
    // `cp` is already in the requested form.
    switch (form) {
        .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
        .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
        else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
    }

    // Hangul precomposed syllable full decomposition (algorithmic).
    if (self.decomposeHangul(cp, buf)) |dc| return dc;

    // Full recursive decomposition, implemented iteratively with a
    // small work stack.
    var dc = Decomp{ .form = form };

    var result_index: usize = 0;
    var work_index: usize = 1;

    // Start work with argument code point.
    var work = [_]u21{cp} ++ [_]u21{0} ** 17;

    while (work_index > 0) {
        // Pop the next code point off the work stack.
        work_index -= 1;
        const next = work[work_index];
        const m = self.mapping(next, form);

        // No further decomposition for this code point; emit it.
        if (m.form == .same) {
            buf[result_index] = next;
            result_index += 1;
            continue;
        }

        // Push the mapping onto the stack in reverse so the code
        // points pop back off in their original order.
        // `i` starts at 1 because m.cps.len is 1 past the last index.
        var i: usize = 1;
        while (i <= m.cps.len) : ({
            i += 1;
            work_index += 1;
        }) {
            work[work_index] = m.cps[m.cps.len - i];
        }
    }

    dc.cps = buf[0..result_index];

    return dc;
}
| 177 | |||
test "decompose" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    var buf: [18]u21 = undefined;

    // Canonical decomposition of precomposed Latin letters.
    var dc = n.decompose('é', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    // U+3189 has only a compatibility decomposition: unchanged for NFD...
    dc = n.decompose('\u{3189}', .nfd, &buf);
    try testing.expect(dc.form == .same);
    try testing.expect(dc.cps.len == 0);

    // ...but decomposes for NFKD.
    dc = n.decompose('\u{3189}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);

    // Hangul syllables decompose algorithmically; decomposeHangul
    // reports .nfd for both NFD and NFKD requests.
    dc = n.decompose('\u{ace1}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{ace1}', .nfkd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    // U+03D3 has distinct canonical and compatibility decompositions.
    dc = n.decompose('\u{3d3}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3d3}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
}
| 222 | |||
/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
pub const Result = struct {
    // Null when `slice` aliases the caller's input and nothing was
    // allocated (the ASCII / Latin-1 fast paths).
    allocator: ?mem.Allocator = null,
    slice: []const u8,

    // Frees `slice` if it was allocated; no-op otherwise.
    pub fn deinit(self: *Result) void {
        if (self.allocator) |allocator| allocator.free(self.slice);
    }
};
| 232 | |||
// Compares code points by Canonical Combining Class order.
// Predicate for `mem.sort` in `canonicalSort`.
fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
    return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs);
}
| 237 | |||
// Applies the Unicode Canonical Ordering Algorithm in place: each
// maximal run of combining marks (ccc != 0) is sorted by combining
// class; starters (ccc == 0) stay where they are.
fn canonicalSort(self: Self, cps: []u21) void {
    var idx: usize = 0;
    while (idx < cps.len) : (idx += 1) {
        const run_start = idx;
        // Advance past the current run of non-starters.
        while (idx < cps.len and self.norm_data.ccc_data.ccc(cps[idx]) != 0) idx += 1;
        mem.sort(u21, cps[run_start..idx], self, cccLess);
    }
}
| 247 | |||
/// Normalize `str` to NFD.
/// Call `deinit` on the returned Result to free any allocated memory.
pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxd(allocator, str, .nfd);
}

/// Normalize `str` to NFKD.
/// Call `deinit` on the returned Result to free any allocated memory.
pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxd(allocator, str, .nfkd);
}
| 257 | |||
/// Decomposes the UTF-8 string `str` to `form` (.nfd or .nfkd) and
/// returns the canonically ordered code points. Caller owns the
/// returned slice.
pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) ![]u21 {
    var result = std.ArrayList(u21).init(allocator);
    defer result.deinit();

    var iter = CodePointIterator{ .bytes = str };
    // Scratch buffer sized for the longest full decomposition.
    var scratch: [18]u21 = undefined;

    while (iter.next()) |cp| {
        const dc = self.decompose(cp.code, form, &scratch);
        switch (dc.form) {
            // Already in the requested form; keep the code point.
            .same => try result.append(cp.code),
            else => try result.appendSlice(dc.cps),
        }
    }

    self.canonicalSort(result.items);

    return try result.toOwnedSlice();
}
| 278 | |||
// Decomposes `str` to `form` and encodes the result back to UTF-8.
// Pure-ASCII input is returned as-is without allocating (the Result
// then has a null allocator, so deinit is a no-op).
fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
    // Quick check: ASCII is invariant under decomposition.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };

    const dcps = try self.nfxdCodePoints(allocator, str, form);
    defer allocator.free(dcps);

    var utf8_list = std.ArrayList(u8).init(allocator);
    defer utf8_list.deinit();
    var cp_bytes: [4]u8 = undefined;

    for (dcps) |dcp| {
        const n = try unicode.utf8Encode(dcp, &cp_bytes);
        try utf8_list.appendSlice(cp_bytes[0..n]);
    }

    return Result{ .allocator = allocator, .slice = try utf8_list.toOwnedSlice() };
}
| 297 | |||
test "nfd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // ASCII-only input takes the no-allocation fast path.
    var result = try n.nfd(allocator, "Hello World!");
    defer result.deinit();

    try testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // Non-ASCII input allocates a decomposed copy.
    var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
}

test "nfkd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // ASCII-only input takes the no-allocation fast path.
    var result = try n.nfkd(allocator, "Hello World!");
    defer result.deinit();

    try testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfkd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // U+03D3 compatibility-decomposes to U+03A5 U+0301 (vs U+03D2 for NFD).
    var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
}
| 345 | |||
// Shared implementation for `nfdCodePoints` and `nfkdCodePoints`:
// decomposes each code point in `cps` to `form` (.nfd or .nfkd) and
// applies the Canonical Sorting Algorithm. Caller owns the returned
// slice. (The two public wrappers below were previously duplicated
// line-for-line; only the form differed.)
fn nfxdCodePointsFromSlice(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
    form: Form,
) ![]u21 {
    var dcp_list = std.ArrayList(u21).init(allocator);
    defer dcp_list.deinit();

    // Scratch buffer sized for the longest full decomposition.
    var dc_buf: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, form, &dc_buf);

        if (dc.form == .same) {
            // Already in the requested form; keep as-is.
            try dcp_list.append(cp);
        } else {
            try dcp_list.appendSlice(dc.cps);
        }
    }

    self.canonicalSort(dcp_list.items);

    return try dcp_list.toOwnedSlice();
}

/// Decomposes the code points in `cps` to NFD.
/// Caller owns the returned slice.
pub fn nfdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    return self.nfxdCodePointsFromSlice(allocator, cps, .nfd);
}

/// Decomposes the code points in `cps` to NFKD.
/// Caller owns the returned slice.
pub fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    return self.nfxdCodePointsFromSlice(allocator, cps, .nfkd);
}
| 395 | |||
// Composition (NFC, NFKC)

// Returns true if `cp` has a Hangul syllable type (i.e. the table
// lookup is not .none). The `cp >= 0x1100` check is a cheap
// pre-filter: 0x1100 is the first Hangul Jamo code point.
fn isHangul(self: Self, cp: u21) bool {
    return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
}
| 401 | |||
/// Normalizes `str` to NFC.
/// Call `deinit` on the returned Result to free any allocated memory.
pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxc(allocator, str, .nfc);
}

/// Normalizes `str` to NFKC.
/// Call `deinit` on the returned Result to free any allocated memory.
pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) !Result {
    return self.nfxc(allocator, str, .nfkc);
}
| 411 | |||
// Composes `str` to `form` (.nfc or .nfkc): decompose first, then
// repeatedly apply the Canonical Composition Algorithm until a full
// pass makes no changes. Composed-away code points are overwritten
// with a tombstone and skipped, then dropped at encode time.
fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Result {
    // Quick checks: these subsets are invariant under composition.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };

    // Decompose first: canonical (NFD) for NFC, compatibility (NFKD) for NFKC.
    var dcps = if (form == .nfc)
        try self.nfxdCodePoints(allocator, str, .nfd)
    else
        try self.nfxdCodePoints(allocator, str, .nfkd);
    defer allocator.free(dcps);

    // Compose
    const tombstone = 0xe000; // Start of BMP Private Use Area

    // Loop over all decomposed code points.
    while (true) {
        var i: usize = 1; // start at second code point.
        var deleted: usize = 0;

        // For each code point, C, find the preceding
        // starter code point L, if any.
        block_check: while (i < dcps.len) : (i += 1) {
            const C = dcps[i];
            if (C == tombstone) continue :block_check;
            const cc_C = self.norm_data.ccc_data.ccc(C);
            var starter_index: ?usize = null;
            var j: usize = i;

            // Seek back to find starter L, if any.
            while (true) {
                j -= 1;
                if (dcps[j] == tombstone) continue;

                // Check for starter.
                if (self.norm_data.ccc_data.isStarter(dcps[j])) {
                    // Check for blocking conditions between L and C.
                    for (dcps[(j + 1)..i]) |B| {
                        if (B == tombstone) continue;
                        const cc_B = self.norm_data.ccc_data.ccc(B);
                        // Hangul C may not compose across any intervening
                        // combining mark.
                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
                        // B blocks C when its class is >= C's class.
                        if (cc_B >= cc_C) continue :block_check;
                    }

                    // Found starter at j.
                    starter_index = j;
                    break;
                }

                if (j == 0) break;
            }

            // If we have a starter L, see if there's a primary
            // composite, P, for the sequence L, C. If so, we must
            // replace L with P and delete C.
            if (starter_index) |sidx| {
                const L = dcps[sidx];
                var processed_hangul = false;

                // If L and C are Hangul syllables, we can compose
                // them algorithmically if possible.
                if (self.isHangul(L) and self.isHangul(C)) {
                    // Get Hangul syllable types.
                    const l_stype = self.norm_data.hangul_data.syllable(L);
                    const c_stype = self.norm_data.hangul_data.syllable(C);

                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T canonical composition.
                        dcps[sidx] = composeHangulCanon(L, C);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (l_stype == .L and c_stype == .V) {
                        // L, V full composition. L, V, T is handled via main loop.
                        dcps[sidx] = composeHangulFull(L, C, 0);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (processed_hangul) deleted += 1;
                }

                // If no composition has occurred yet.
                if (!processed_hangul) {
                    // L, C are not Hangul, so check for primary composite
                    // in the Unicode Character Database.
                    if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
                        // We have a primary composite P for L, C.
                        // We must check if P is not in the Full
                        // Composition Exclusions (FCX) list,
                        // preventing it from appearing in any
                        // composed form (NFC, NFKC).
                        if (!self.norm_data.normp_data.isFcx(P)) {
                            dcps[sidx] = P;
                            dcps[i] = tombstone; // Mark for deletion.
                            deleted += 1;
                        }
                    }
                }
            }
        }

        // If we have no deletions, the code point sequence has been
        // fully composed; encode it back to UTF-8, dropping tombstones.
        if (deleted == 0) {
            var cstr_list = std.ArrayList(u8).init(allocator);
            defer cstr_list.deinit();
            var buf: [4]u8 = undefined;

            for (dcps) |cp| {
                if (cp == tombstone) continue; // "Delete"
                const len = try unicode.utf8Encode(cp, &buf);
                try cstr_list.appendSlice(buf[0..len]);
            }

            return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
        }
    }
}
| 532 | |||
test "nfc" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // U+03D2 U+0301 canonically composes to U+03D3.
    var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer result.deinit();

    try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
}

test "nfkc" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // U+03A5 U+0301 composes to U+038E under NFKC.
    var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer result.deinit();

    try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
}
| 556 | |||
/// Tests for equality of `a` and `b` after normalizing to NFC.
pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
    var nfc_a = try self.nfc(allocator, a);
    defer nfc_a.deinit();

    var nfc_b = try self.nfc(allocator, b);
    defer nfc_b.deinit();

    return mem.eql(u8, nfc_a.slice, nfc_b.slice);
}
| 566 | |||
test "eql" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    // Precomposed vs. decomposed forms compare equal after NFC.
    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
    try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}
| 576 | |||
// FCD
// Returns the combining class of the first code point of `cp`'s
// canonical decomposition (or of `cp` itself when it has none).
fn getLeadCcc(self: Self, cp: u21) u8 {
    const dc = self.mapping(cp, .nfd);
    const dcp = if (dc.form == .same) cp else dc.cps[0];
    return self.norm_data.ccc_data.ccc(dcp);
}

// Returns the combining class of the last code point of `cp`'s
// canonical decomposition (or of `cp` itself when it has none).
fn getTrailCcc(self: Self, cp: u21) u8 {
    const dc = self.mapping(cp, .nfd);
    const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1];
    return self.norm_data.ccc_data.ccc(dcp);
}
| 589 | |||
/// Fast check to detect if a string is already in NFC or NFD form.
/// True when no code point's lead combining class drops below the
/// previous code point's trail combining class — i.e. no combining
/// marks are out of canonical order across decomposition boundaries.
pub fn isFcd(self: Self, str: []const u8) bool {
    var prev_ccc: u8 = 0;
    var cp_iter = CodePointIterator{ .bytes = str };

    return while (cp_iter.next()) |cp| {
        const ccc = self.getLeadCcc(cp.code);
        // A non-starter whose class is lower than the preceding trail
        // class breaks canonical order.
        if (ccc != 0 and ccc < prev_ccc) break false;
        prev_ccc = self.getTrailCcc(cp.code);
    } else true;
}
| 601 | |||
test "isFcd" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    const is_nfc = "José \u{3D3}";
    try testing.expect(n.isFcd(is_nfc));

    const is_nfd = "Jose\u{301} \u{3d2}\u{301}";
    try testing.expect(n.isFcd(is_nfd));

    // U+0315 (ccc 232) before U+0301 (ccc 230) is out of canonical order.
    const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}";
    try testing.expect(!n.isFcd(not_fcd));
}
| 617 | |||
// Parses one NormalizationTest.txt field — space-separated hex code
// points — into UTF-8 bytes. Caller owns the returned slice.
// (Previously this parse loop was copy-pasted five times below.)
fn parseTestField(allocator: mem.Allocator, field: []const u8) ![]u8 {
    var bytes = std.ArrayList(u8).init(allocator);
    defer bytes.deinit();
    var cp_buf: [4]u8 = undefined;

    var cp_strs = mem.split(u8, field, " ");
    while (cp_strs.next()) |s| {
        const cp = try fmt.parseInt(u21, s, 16);
        const len = try unicode.utf8Encode(cp, &cp_buf);
        try bytes.appendSlice(cp_buf[0..len]);
    }

    return try bytes.toOwnedSlice();
}

test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var data = try NormData.init(allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip comments, empty lines, and part headers (@Part...).
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // Line layout: source;NFC;NFD;NFKC;NFKD;# comment
        var fields = mem.split(u8, line, ";");
        var field_index: usize = 0;
        var input: []u8 = undefined;
        defer allocator.free(input);

        while (fields.next()) |field| : (field_index += 1) {
            switch (field_index) {
                // Field 0: the source string to normalize.
                0 => input = try parseTestField(allocator, field),
                // Field 1: expected NFC.
                1 => {
                    const want = try parseTestField(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfc(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                // Field 2: expected NFD.
                2 => {
                    const want = try parseTestField(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfd(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                // Field 3: expected NFKC.
                3 => {
                    const want = try parseTestField(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfkc(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                // Field 4: expected NFKD.
                4 => {
                    const want = try parseTestField(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfkd(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                // Trailing comment field: ignore.
                else => {},
            }
        }
    }
}
| 734 | |||
/// Returns true if `str` only contains Latin-1 Supplement
/// code points (U+0000..U+00FF). Uses SIMD if possible.
pub fn isLatin1Only(str: []const u8) bool {
    // Highest Latin-1 code point. The previous code compared against
    // 256, which wrongly accepted U+0100 (one past the Latin-1 range).
    const max_latin1: u21 = 0xff;

    var cp_iter = CodePointIterator{ .bytes = str };

    // No SIMD support on this target: plain scalar scan.
    const vec_len = simd.suggestVectorLength(u21) orelse return blk: {
        break :blk while (cp_iter.next()) |cp| {
            if (cp.code > max_latin1) break false;
        } else true;
    };

    const Vec = @Vector(vec_len, u21);

    // Process vec_len code points at a time.
    outer: while (true) {
        var v1: Vec = undefined;
        const saved_cp_i = cp_iter.i;

        for (0..vec_len) |i| {
            if (cp_iter.next()) |cp| {
                v1[i] = cp.code;
            } else {
                // Not enough code points left to fill a vector; rewind
                // and finish with the scalar loop below.
                cp_iter.i = saved_cp_i;
                break :outer;
            }
        }
        const v2: Vec = @splat(max_latin1);
        if (@reduce(.Or, v1 > v2)) return false;
    }

    // Scalar tail.
    return while (cp_iter.next()) |cp| {
        if (cp.code > max_latin1) break false;
    } else true;
}
| 768 | |||
test "isLatin1Only" {
    // U+00FE and U+00FF are within Latin-1.
    const latin1_only = "Hello, World! \u{fe} \u{ff}";
    try testing.expect(isLatin1Only(latin1_only));
    // U+03D3 is well outside Latin-1.
    const not_latin1_only = "Héllo, World! \u{3d3}";
    try testing.expect(!isLatin1Only(not_latin1_only));
}