| author | 2024-02-20 09:13:36 -0400 |
|---|---|
| committer | 2024-02-20 09:13:36 -0400 |
| commit | 134a7df8206aa66ca7b0abbc7f31f08410b502d2 (patch) |
| tree | e3c520dce4f529de5290bc145847e78fb5583bee /src/Normalizer.zig |
| parent | Cleaned up directory structure (diff) |
| download | zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.tar.gz, zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.tar.xz, zg-134a7df8206aa66ca7b0abbc7f31f08410b502d2.zip |
Replaced ccc_map with table. 20ms faster
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 863 |
1 file changed, 863 insertions, 0 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
new file mode 100644
index 0000000..1b4a2d5
--- /dev/null
+++ b/src/Normalizer.zig
@@ -0,0 +1,863 @@
| 1 | //! Normalizer contains functions and methods that implement Unicode Normalization algorithms. You can normalize strings | ||
| 2 | //! into NFC, NFKC, NFD, and NFKD normalization forms (see `nfc`, `nfkc`, `nfd`, and `nfkd`). You can also test for | ||
| 3 | //! string equality under different parameters related to normalization (see `eql`, `eqlCaseless`, `eqlIdentifiers`). | ||
| 4 | |||
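A minimal usage sketch of the API described above, written in the file's own inline-test style (it assumes `std.testing.allocator`, as the tests further down do):

test "usage sketch" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    // "e" followed by U+0301 (combining acute) composes to the single code point "é".
    var result = try n.nfc(allocator, "e\u{301}");
    defer result.deinit();

    try std.testing.expectEqualStrings("é", result.slice);
}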
| 5 | const std = @import("std"); | ||
| 6 | |||
| 7 | const CodePointIterator = @import("code_point").Iterator; | ||
| 8 | const case_fold_map = @import("ziglyph").case_folding; | ||
| 9 | const hangul_map = @import("ziglyph").hangul; | ||
| 10 | const norm_props = @import("ziglyph").normalization_props; | ||
| 11 | const normp = @import("normp"); | ||
| 12 | |||
| 13 | const Self = @This(); | ||
| 14 | |||
| 15 | nfc_map: std.AutoHashMap([2]u21, u21), | ||
| 16 | nfd_map: std.AutoHashMap(u21, [2]u21), | ||
| 17 | nfkd_map: std.AutoHashMap(u21, [18]u21), | ||
| 18 | |||
| 19 | pub fn init(allocator: std.mem.Allocator) !Self { | ||
| 20 | var self = Self{ | ||
| 21 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | ||
| 22 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), | ||
| 23 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | ||
| 24 | }; | ||
| 25 | errdefer self.deinit(); | ||
| 26 | |||
| 27 | // Canonical compositions | ||
| 28 | const decompressor = std.compress.deflate.decompressor; | ||
| 29 | const comp_file = @embedFile("autogen/canonical_compositions.txt.deflate"); | ||
| 30 | var comp_stream = std.io.fixedBufferStream(comp_file); | ||
| 31 | var comp_decomp = try decompressor(allocator, comp_stream.reader(), null); | ||
| 32 | defer comp_decomp.deinit(); | ||
| 33 | |||
| 34 | var comp_buf = std.io.bufferedReader(comp_decomp.reader()); | ||
| 35 | const comp_reader = comp_buf.reader(); | ||
| 36 | var buf: [4096]u8 = undefined; | ||
| 37 | |||
| 38 | while (try comp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 39 | if (line.len == 0) continue; | ||
| 40 | var fields = std.mem.split(u8, line, ";"); | ||
| 41 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 42 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 43 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 44 | try self.nfc_map.put(.{ cp_a, cp_b }, cp_c); | ||
| 45 | } | ||
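For reference, a hypothetical sample entry; the on-disk format is assumed from the parser above, not taken from the actual autogen data:

// Hypothetical sample line, assuming the autogen format matches the parser
// above: three semicolon-separated hex code points "A;B;C", meaning A and B
// compose canonically to C.
//
//     0041;0301;00C1
//
// This would yield nfc_map.put(.{ 0x0041, 0x0301 }, 0x00C1), i.e.
// 'A' + combining acute accent -> 'Á'.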
| 46 | |||
| 47 | // Canonical decompositions | ||
| 48 | const decomp_file = @embedFile("autogen/canonical_decompositions.txt.deflate"); | ||
| 49 | var decomp_stream = std.io.fixedBufferStream(decomp_file); | ||
| 50 | var decomp_decomp = try decompressor(allocator, decomp_stream.reader(), null); | ||
| 51 | defer decomp_decomp.deinit(); | ||
| 52 | |||
| 53 | var decomp_buf = std.io.bufferedReader(decomp_decomp.reader()); | ||
| 54 | const decomp_reader = decomp_buf.reader(); | ||
| 55 | |||
| 56 | while (try decomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 57 | if (line.len == 0) continue; | ||
| 58 | var fields = std.mem.split(u8, line, ";"); | ||
| 59 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 60 | const cp_b = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 61 | const cp_c = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 62 | try self.nfd_map.put(cp_a, .{ cp_b, cp_c }); | ||
| 63 | } | ||
| 64 | |||
| 65 | // Compatibility decompositions | ||
| 66 | const dekomp_file = @embedFile("autogen/compatibility_decompositions.txt.deflate"); | ||
| 67 | var dekomp_stream = std.io.fixedBufferStream(dekomp_file); | ||
| 68 | var dekomp_decomp = try decompressor(allocator, dekomp_stream.reader(), null); | ||
| 69 | defer dekomp_decomp.deinit(); | ||
| 70 | |||
| 71 | var dekomp_buf = std.io.bufferedReader(dekomp_decomp.reader()); | ||
| 72 | const dekomp_reader = dekomp_buf.reader(); | ||
| 73 | |||
| 74 | while (try dekomp_reader.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 75 | if (line.len == 0) continue; | ||
| 76 | var fields = std.mem.split(u8, line, ";"); | ||
| 77 | const cp_a = try std.fmt.parseInt(u21, fields.next().?, 16); | ||
| 78 | var cps = [_]u21{0} ** 18; | ||
| 79 | var i: usize = 0; | ||
| 80 | |||
| 81 | while (fields.next()) |cp| : (i += 1) { | ||
| 82 | cps[i] = try std.fmt.parseInt(u21, cp, 16); | ||
| 83 | } | ||
| 84 | |||
| 85 | try self.nfkd_map.put(cp_a, cps); | ||
| 86 | } | ||
| 87 | |||
| 88 | return self; | ||
| 89 | } | ||
| 90 | |||
| 91 | pub fn deinit(self: *Self) void { | ||
| 92 | self.nfc_map.deinit(); | ||
| 93 | self.nfd_map.deinit(); | ||
| 94 | self.nfkd_map.deinit(); | ||
| 95 | } | ||
| 96 | |||
| 97 | test "init / deinit" { | ||
| 98 | var n = try init(std.testing.allocator); | ||
| 99 | defer n.deinit(); | ||
| 100 | } | ||
| 101 | |||
| 102 | // Hangul processing utilities. | ||
| 103 | fn isHangulPrecomposed(cp: u21) bool { | ||
| 104 | if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT; | ||
| 105 | return false; | ||
| 106 | } | ||
| 107 | |||
| 108 | const SBase: u21 = 0xAC00; | ||
| 109 | const LBase: u21 = 0x1100; | ||
| 110 | const VBase: u21 = 0x1161; | ||
| 111 | const TBase: u21 = 0x11A7; | ||
| 112 | const LCount: u21 = 19; | ||
| 113 | const VCount: u21 = 21; | ||
| 114 | const TCount: u21 = 28; | ||
| 115 | const NCount: u21 = 588; // VCount * TCount | ||
| 116 | const SCount: u21 = 11172; // LCount * NCount | ||
| 117 | |||
| 118 | fn decomposeHangul(cp: u21) [3]u21 { | ||
| 119 | const SIndex: u21 = cp - SBase; | ||
| 120 | const LIndex: u21 = SIndex / NCount; | ||
| 121 | const VIndex: u21 = (SIndex % NCount) / TCount; | ||
| 122 | const TIndex: u21 = SIndex % TCount; | ||
| 123 | const LPart: u21 = LBase + LIndex; | ||
| 124 | const VPart: u21 = VBase + VIndex; | ||
| 125 | var TPart: u21 = 0; | ||
| 126 | if (TIndex != 0) TPart = TBase + TIndex; | ||
| 127 | |||
| 128 | return [3]u21{ LPart, VPart, TPart }; | ||
| 129 | } | ||
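A worked example of the arithmetic above for U+AC01 (각), as an illustrative test sketch:

test "decomposeHangul worked example" {
    // U+AC01: SIndex = 1, so LIndex = 0, VIndex = 0, TIndex = 1, giving
    // L = U+1100, V = U+1161, T = U+11A7 + 1 = U+11A8.
    const parts = decomposeHangul(0xAC01);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 0x1100, 0x1161, 0x11A8 }, &parts);
}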
| 130 | |||
| 131 | fn composeHangulCanon(lv: u21, t: u21) u21 { | ||
| 132 | std.debug.assert(0x11A8 <= t and t <= 0x11C2); | ||
| 133 | return lv + (t - TBase); | ||
| 134 | } | ||
| 135 | |||
| 136 | fn composeHangulFull(l: u21, v: u21, t: u21) u21 { | ||
| 137 | std.debug.assert(0x1100 <= l and l <= 0x1112); | ||
| 138 | std.debug.assert(0x1161 <= v and v <= 0x1175); | ||
| 139 | const LIndex = l - LBase; | ||
| 140 | const VIndex = v - VBase; | ||
| 141 | const LVIndex = LIndex * NCount + VIndex * TCount; | ||
| 142 | |||
| 143 | if (t == 0) return SBase + LVIndex; | ||
| 144 | |||
| 145 | std.debug.assert(0x11A8 <= t and t <= 0x11C2); | ||
| 146 | const TIndex = t - TBase; | ||
| 147 | |||
| 148 | return SBase + LVIndex + TIndex; | ||
| 149 | } | ||
| 150 | |||
| 151 | const Form = enum { | ||
| 152 | nfc, | ||
| 153 | nfd, | ||
| 154 | nfkc, | ||
| 155 | nfkd, | ||
| 156 | same, | ||
| 157 | }; | ||
| 158 | |||
| 159 | const Decomp = struct { | ||
| 160 | form: Form = .nfd, | ||
| 161 | cps: [18]u21 = [_]u21{0} ** 18, | ||
| 162 | }; | ||
| 163 | |||
| 164 | /// `mapping` retrieves the single-level decomposition mapping for a code point as per the UCD; see `decompose` for full, recursive decomposition. | ||
| 165 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { | ||
| 166 | std.debug.assert(form == .nfd or form == .nfkd); | ||
| 167 | |||
| 168 | var dc = Decomp{ .form = .same }; | ||
| 169 | dc.cps[0] = cp; | ||
| 170 | |||
| 171 | if (self.nfkd_map.get(cp)) |array| { | ||
| 172 | if (form != .nfd) { | ||
| 173 | dc.form = .nfkd; | ||
| 174 | @memcpy(dc.cps[0..array.len], &array); | ||
| 175 | } | ||
| 176 | } else if (self.nfd_map.get(cp)) |array| { | ||
| 177 | dc.form = .nfd; | ||
| 178 | @memcpy(dc.cps[0..array.len], &array); | ||
| 179 | } | ||
| 180 | |||
| 181 | return dc; | ||
| 182 | } | ||
| 183 | |||
| 184 | /// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. | ||
| 185 | pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | ||
| 186 | std.debug.assert(form == .nfd or form == .nfkd); | ||
| 187 | |||
| 188 | var dc = Decomp{ .form = form }; | ||
| 189 | |||
| 190 | // ASCII or NFD / NFKD quick checks. | ||
| 191 | if (cp <= 127 or (form == .nfd and norm_props.isNfd(cp)) or (form == .nfkd and norm_props.isNfkd(cp))) { | ||
| 192 | dc.cps[0] = cp; | ||
| 193 | return dc; | ||
| 194 | } | ||
| 195 | |||
| 196 | // Hangul precomposed syllable full decomposition. | ||
| 197 | if (isHangulPrecomposed(cp)) { | ||
| 198 | const cps = decomposeHangul(cp); | ||
| 199 | @memcpy(dc.cps[0..cps.len], &cps); | ||
| 200 | return dc; | ||
| 201 | } | ||
| 202 | |||
| 203 | // Full decomposition. | ||
| 204 | var result_index: usize = 0; | ||
| 205 | var work_index: usize = 1; | ||
| 206 | |||
| 207 | // Start work with argument code point. | ||
| 208 | var work = [_]u21{cp} ++ [_]u21{0} ** 17; | ||
| 209 | |||
| 210 | while (work_index > 0) { | ||
| 211 | // Look at previous code point in work queue. | ||
| 212 | work_index -= 1; | ||
| 213 | const next = work[work_index]; | ||
| 214 | const m = self.mapping(next, form); | ||
| 215 | |||
| 216 | // No more decompositions for this code point. | ||
| 217 | if (m.form == .same) { | ||
| 218 | dc.cps[result_index] = m.cps[0]; | ||
| 219 | result_index += 1; | ||
| 220 | continue; | ||
| 221 | } | ||
| 222 | |||
| 223 | // Find last index of decomposition. | ||
| 224 | const m_last = for (m.cps, 0..) |mcp, i| { | ||
| 225 | if (mcp == 0) break i; | ||
| 226 | } else m.cps.len; | ||
| 227 | |||
| 228 | // Work backwards through decomposition. | ||
| 229 | // `i` starts at 1 because m_last is 1 past the last code point. | ||
| 230 | var i: usize = 1; | ||
| 231 | while (i <= m_last) : ({ | ||
| 232 | i += 1; | ||
| 233 | work_index += 1; | ||
| 234 | }) { | ||
| 235 | work[work_index] = m.cps[m_last - i]; | ||
| 236 | } | ||
| 237 | } | ||
| 238 | |||
| 239 | return dc; | ||
| 240 | } | ||
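An illustrative test sketch contrasting the single-level `mapping` with the recursive `decompose` (it assumes, per the data layout above, that a code point with a purely canonical decomposition such as U+01D5 is present only in `nfd_map`):

test "mapping vs decompose" {
    var n = try init(std.testing.allocator);
    defer n.deinit();

    // Single-level UCD mapping: U+01D5 -> U+00DC U+0304.
    const m = n.mapping('\u{1d5}', .nfd);
    try std.testing.expect(m.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{dc}', '\u{304}' }, m.cps[0..2]);

    // Full decomposition also expands U+00DC, yielding U+0055 U+0308 U+0304.
    const d = n.decompose('\u{1d5}', .nfd);
    try std.testing.expect(d.form == .nfd);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'U', '\u{308}', '\u{304}' }, d.cps[0..3]);
}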
| 241 | |||
| 242 | test "decompose" { | ||
| 243 | const allocator = std.testing.allocator; | ||
| 244 | var n = try init(allocator); | ||
| 245 | defer n.deinit(); | ||
| 246 | |||
| 247 | var dc = n.decompose('é', .nfd); | ||
| 248 | try std.testing.expect(dc.form == .nfd); | ||
| 249 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); | ||
| 250 | |||
| 251 | dc = n.decompose('\u{1e0a}', .nfd); | ||
| 252 | try std.testing.expect(dc.form == .nfd); | ||
| 253 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | ||
| 254 | |||
| 255 | dc = n.decompose('\u{1e0a}', .nfkd); | ||
| 256 | try std.testing.expect(dc.form == .nfkd); | ||
| 257 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | ||
| 258 | |||
| 259 | dc = n.decompose('\u{3189}', .nfd); | ||
| 260 | try std.testing.expect(dc.form == .nfd); | ||
| 261 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]); | ||
| 262 | |||
| 263 | dc = n.decompose('\u{3189}', .nfkd); | ||
| 264 | try std.testing.expect(dc.form == .nfkd); | ||
| 265 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); | ||
| 266 | |||
| 267 | dc = n.decompose('\u{ace1}', .nfd); | ||
| 268 | try std.testing.expect(dc.form == .nfd); | ||
| 269 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | ||
| 270 | |||
| 271 | dc = n.decompose('\u{ace1}', .nfkd); | ||
| 272 | try std.testing.expect(dc.form == .nfkd); | ||
| 273 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | ||
| 274 | |||
| 275 | dc = n.decompose('\u{3d3}', .nfd); | ||
| 276 | try std.testing.expect(dc.form == .nfd); | ||
| 277 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); | ||
| 278 | |||
| 279 | dc = n.decompose('\u{3d3}', .nfkd); | ||
| 280 | try std.testing.expect(dc.form == .nfkd); | ||
| 281 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); | ||
| 282 | } | ||
| 283 | |||
| 284 | // Some quick checks. | ||
| 285 | |||
| 286 | fn onlyAscii(str: []const u8) bool { | ||
| 287 | return for (str) |b| { | ||
| 288 | if (b > 127) break false; | ||
| 289 | } else true; | ||
| 290 | } | ||
| 291 | |||
| 292 | fn onlyLatin1(str: []const u8) bool { | ||
| 293 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 294 | return while (cp_iter.next()) |cp| { | ||
| 295 | if (cp.code > 255) break false; | ||
| 296 | } else true; | ||
| 297 | } | ||
| 298 | |||
| 299 | /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. | ||
| 300 | pub const Result = struct { | ||
| 301 | allocator: ?std.mem.Allocator = null, | ||
| 302 | slice: []const u8, | ||
| 303 | |||
| 304 | pub fn deinit(self: *Result) void { | ||
| 305 | if (self.allocator) |allocator| allocator.free(self.slice); | ||
| 306 | } | ||
| 307 | }; | ||
| 308 | |||
| 309 | // Compares code points by Canonical Combining Class order. | ||
| 310 | fn cccLess(_: void, lhs: u21, rhs: u21) bool { | ||
| 311 | const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; | ||
| 312 | const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)]; | ||
| 313 | return lcc < rcc; | ||
| 314 | } | ||
| 315 | |||
| 316 | // Applies the Canonical Sorting Algorithm. | ||
| 317 | fn canonicalSort(cps: []u21) void { | ||
| 318 | var i: usize = 0; | ||
| 319 | while (i < cps.len) : (i += 1) { | ||
| 320 | const start: usize = i; | ||
| 321 | while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} | ||
| 322 | std.mem.sort(u21, cps[start..i], {}, cccLess); | ||
| 323 | } | ||
| 324 | } | ||
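A small illustrative test of the sort, relying only on the standard combining class values 230 for U+0301 and 220 for U+0316:

test "canonicalSort" {
    // U+0301 (ccc 230) and U+0316 (ccc 220) are reordered into ascending
    // ccc order; the starter 'a' (ccc 0) stays put.
    var cps = [_]u21{ 'a', 0x301, 0x316 };
    canonicalSort(&cps);
    try std.testing.expectEqualSlices(u21, &[_]u21{ 'a', 0x316, 0x301 }, &cps);
}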
| 325 | |||
| 326 | /// Normalize `str` to NFD. | ||
| 327 | pub fn nfd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 328 | return self.nfxd(allocator, str, .nfd); | ||
| 329 | } | ||
| 330 | |||
| 331 | /// Normalize `str` to NFKD. | ||
| 332 | pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 333 | return self.nfxd(allocator, str, .nfkd); | ||
| 334 | } | ||
| 335 | |||
| 336 | fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | ||
| 337 | // Quick checks. | ||
| 338 | if (onlyAscii(str)) return Result{ .slice = str }; | ||
| 339 | |||
| 340 | var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2); | ||
| 341 | defer dcp_list.deinit(); | ||
| 342 | |||
| 343 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 344 | while (cp_iter.next()) |cp| { | ||
| 345 | const dc = self.decompose(cp.code, form); | ||
| 346 | const slice = for (dc.cps, 0..) |dcp, i| { | ||
| 347 | if (dcp == 0) break dc.cps[0..i]; | ||
| 348 | } else dc.cps[0..]; | ||
| 349 | try dcp_list.appendSlice(slice); | ||
| 350 | } | ||
| 351 | |||
| 352 | canonicalSort(dcp_list.items); | ||
| 353 | |||
| 354 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); | ||
| 355 | defer dstr_list.deinit(); | ||
| 356 | |||
| 357 | var buf: [4]u8 = undefined; | ||
| 358 | for (dcp_list.items) |dcp| { | ||
| 359 | const len = try std.unicode.utf8Encode(dcp, &buf); | ||
| 360 | dstr_list.appendSliceAssumeCapacity(buf[0..len]); | ||
| 361 | } | ||
| 362 | |||
| 363 | return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() }; | ||
| 364 | } | ||
| 365 | |||
| 366 | test "nfd ASCII / no-alloc" { | ||
| 367 | const allocator = std.testing.allocator; | ||
| 368 | var n = try init(allocator); | ||
| 369 | defer n.deinit(); | ||
| 370 | |||
| 371 | var result = try n.nfd(allocator, "Hello World!"); | ||
| 372 | defer result.deinit(); | ||
| 373 | |||
| 374 | try std.testing.expectEqualStrings("Hello World!", result.slice); | ||
| 375 | } | ||
| 376 | |||
| 377 | test "nfd !ASCII / alloc" { | ||
| 378 | const allocator = std.testing.allocator; | ||
| 379 | var n = try init(allocator); | ||
| 380 | defer n.deinit(); | ||
| 381 | |||
| 382 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | ||
| 383 | defer result.deinit(); | ||
| 384 | |||
| 385 | try std.testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); | ||
| 386 | } | ||
| 387 | |||
| 388 | test "nfkd ASCII / no-alloc" { | ||
| 389 | const allocator = std.testing.allocator; | ||
| 390 | var n = try init(allocator); | ||
| 391 | defer n.deinit(); | ||
| 392 | |||
| 393 | var result = try n.nfkd(allocator, "Hello World!"); | ||
| 394 | defer result.deinit(); | ||
| 395 | |||
| 396 | try std.testing.expectEqualStrings("Hello World!", result.slice); | ||
| 397 | } | ||
| 398 | |||
| 399 | test "nfkd !ASCII / alloc" { | ||
| 400 | const allocator = std.testing.allocator; | ||
| 401 | var n = try init(allocator); | ||
| 402 | defer n.deinit(); | ||
| 403 | |||
| 404 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | ||
| 405 | defer result.deinit(); | ||
| 406 | |||
| 407 | try std.testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | ||
| 408 | } | ||
| 409 | |||
| 410 | // Composition utilities. | ||
| 411 | |||
| 412 | fn isHangul(cp: u21) bool { | ||
| 413 | return cp >= 0x1100 and hangul_map.syllableType(cp) != null; | ||
| 414 | } | ||
| 415 | |||
| 416 | fn isStarter(cp: u21) bool { | ||
| 417 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; | ||
| 418 | } | ||
| 419 | |||
| 420 | fn isCombining(cp: u21) bool { | ||
| 421 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0; | ||
| 422 | } | ||
| 423 | |||
| 424 | fn isNonHangulStarter(cp: u21) bool { | ||
| 425 | return !isHangul(cp) and isStarter(cp); | ||
| 426 | } | ||
| 427 | |||
| 428 | /// Normalizes `str` to NFC. | ||
| 429 | pub fn nfc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 430 | return self.nfxc(allocator, str, .nfc); | ||
| 431 | } | ||
| 432 | |||
| 433 | /// Normalizes `str` to NFKC. | ||
| 434 | pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | ||
| 435 | return self.nfxc(allocator, str, .nfkc); | ||
| 436 | } | ||
| 437 | |||
| 438 | fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | ||
| 439 | // Quick checks. | ||
| 440 | if (onlyAscii(str)) return Result{ .slice = str }; | ||
| 441 | if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str }; | ||
| 442 | |||
| 443 | // Decompose first. | ||
| 444 | var d_result = if (form == .nfc) | ||
| 445 | try self.nfd(allocator, str) | ||
| 446 | else | ||
| 447 | try self.nfkd(allocator, str); | ||
| 448 | defer d_result.deinit(); | ||
| 449 | |||
| 450 | // Get code points. | ||
| 451 | var cp_iter = CodePointIterator{ .bytes = d_result.slice }; | ||
| 452 | |||
| 453 | var d_list = try std.ArrayList(u21).initCapacity(allocator, d_result.slice.len); | ||
| 454 | defer d_list.deinit(); | ||
| 455 | |||
| 456 | while (cp_iter.next()) |cp| d_list.appendAssumeCapacity(cp.code); | ||
| 457 | |||
| 458 | // Compose | ||
| 459 | const tombstone = 0xe000; // Start of BMP Private Use Area | ||
| 460 | |||
| 461 | while (true) { | ||
| 462 | var i: usize = 1; // start at second code point. | ||
| 463 | var deleted: usize = 0; | ||
| 464 | |||
| 465 | block_check: while (i < d_list.items.len) : (i += 1) { | ||
| 466 | const C = d_list.items[i]; | ||
| 467 | const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; | ||
| 468 | var starter_index: ?usize = null; | ||
| 469 | var j: usize = i; | ||
| 470 | |||
| 471 | while (true) { | ||
| 472 | j -= 1; | ||
| 473 | |||
| 474 | // Check for starter. | ||
| 475 | if (isStarter(d_list.items[j])) { | ||
| 476 | if (i - j > 1) { // If there's distance between the starting point and the current position. | ||
| 477 | for (d_list.items[(j + 1)..i]) |B| { | ||
| 478 | // Check for blocking conditions. | ||
| 479 | if (isHangul(C)) { | ||
| 480 | if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; | ||
| 481 | } | ||
| 482 | const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)]; | ||
| 483 | if (cc_B >= cc_C) continue :block_check; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | |||
| 487 | // Found starter at j. | ||
| 488 | starter_index = j; | ||
| 489 | break; | ||
| 490 | } | ||
| 491 | |||
| 492 | if (j == 0) break; | ||
| 493 | } | ||
| 494 | |||
| 495 | if (starter_index) |sidx| { | ||
| 496 | const L = d_list.items[sidx]; | ||
| 497 | var processed_hangul = false; | ||
| 498 | |||
| 499 | if (isHangul(L) and isHangul(C)) { | ||
| 500 | const l_stype = hangul_map.syllableType(L).?; | ||
| 501 | const c_stype = hangul_map.syllableType(C).?; | ||
| 502 | |||
| 503 | if (l_stype == .LV and c_stype == .T) { | ||
| 504 | // LV, T | ||
| 505 | d_list.items[sidx] = composeHangulCanon(L, C); | ||
| 506 | d_list.items[i] = tombstone; // Mark for deletion. | ||
| 507 | processed_hangul = true; | ||
| 508 | } | ||
| 509 | |||
| 510 | if (l_stype == .L and c_stype == .V) { | ||
| 511 | // Handle L, V. L, V, T is handled via main loop. | ||
| 512 | d_list.items[sidx] = composeHangulFull(L, C, 0); | ||
| 513 | d_list.items[i] = tombstone; // Mark for deletion. | ||
| 514 | processed_hangul = true; | ||
| 515 | } | ||
| 516 | |||
| 517 | if (processed_hangul) deleted += 1; | ||
| 518 | } | ||
| 519 | |||
| 520 | if (!processed_hangul) { | ||
| 521 | // L -> C not Hangul. | ||
| 522 | if (self.nfc_map.get(.{ L, C })) |P| { | ||
| 523 | if (!norm_props.isFcx(P)) { | ||
| 524 | d_list.items[sidx] = P; | ||
| 525 | d_list.items[i] = tombstone; // Mark for deletion. | ||
| 526 | deleted += 1; | ||
| 527 | } | ||
| 528 | } | ||
| 529 | } | ||
| 530 | } | ||
| 531 | } | ||
| 532 | |||
| 533 | // Check if finished. | ||
| 534 | if (deleted == 0) { | ||
| 535 | var cstr_list = try std.ArrayList(u8).initCapacity(allocator, d_list.items.len * 4); | ||
| 536 | defer cstr_list.deinit(); | ||
| 537 | var buf: [4]u8 = undefined; | ||
| 538 | |||
| 539 | for (d_list.items) |cp| { | ||
| 540 | if (cp == tombstone) continue; // "Delete" | ||
| 541 | const len = try std.unicode.utf8Encode(cp, &buf); | ||
| 542 | cstr_list.appendSliceAssumeCapacity(buf[0..len]); | ||
| 543 | } | ||
| 544 | |||
| 545 | return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() }; | ||
| 546 | } | ||
| 547 | |||
| 548 | // Otherwise update code points list. | ||
| 549 | var tmp_d_list = try std.ArrayList(u21).initCapacity(allocator, d_list.items.len - deleted); | ||
| 550 | defer tmp_d_list.deinit(); | ||
| 551 | |||
| 552 | for (d_list.items) |cp| { | ||
| 553 | if (cp != tombstone) tmp_d_list.appendAssumeCapacity(cp); | ||
| 554 | } | ||
| 555 | |||
| 556 | d_list.clearRetainingCapacity(); | ||
| 557 | d_list.appendSliceAssumeCapacity(tmp_d_list.items); | ||
| 558 | } | ||
| 559 | } | ||
| 560 | |||
| 561 | test "nfc" { | ||
| 562 | const allocator = std.testing.allocator; | ||
| 563 | var n = try init(allocator); | ||
| 564 | defer n.deinit(); | ||
| 565 | |||
| 566 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | ||
| 567 | defer result.deinit(); | ||
| 568 | |||
| 569 | try std.testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); | ||
| 570 | } | ||
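The Hangul L+V and LV+T branches above are the inverse of the U+ACE1 case in the `decompose` test; an illustrative sketch of that round trip:

test "nfc Hangul jamo" {
    const allocator = std.testing.allocator;
    var n = try init(allocator);
    defer n.deinit();

    var result = try n.nfc(allocator, "\u{1100}\u{1169}\u{11a8}");
    defer result.deinit();

    try std.testing.expectEqualStrings("\u{ace1}", result.slice);
}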
| 571 | |||
| 572 | test "nfkc" { | ||
| 573 | const allocator = std.testing.allocator; | ||
| 574 | var n = try init(allocator); | ||
| 575 | defer n.deinit(); | ||
| 576 | |||
| 577 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | ||
| 578 | defer result.deinit(); | ||
| 579 | |||
| 580 | try std.testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | ||
| 581 | } | ||
| 582 | |||
| 583 | /// Tests for equality as per Unicode rules for Identifiers. | ||
| 584 | pub fn eqlIdentifiers(allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | ||
| 585 | var list_a = try std.ArrayList(u21).initCapacity(allocator, a.len); | ||
| 586 | defer list_a.deinit(); | ||
| 587 | var list_b = try std.ArrayList(u21).initCapacity(allocator, b.len); | ||
| 588 | defer list_b.deinit(); | ||
| 589 | |||
| 590 | const Item = struct { | ||
| 591 | str: []const u8, | ||
| 592 | list: *std.ArrayList(u21), | ||
| 593 | }; | ||
| 594 | |||
| 595 | const items = [_]Item{ | ||
| 596 | .{ .str = a, .list = &list_a }, | ||
| 597 | .{ .str = b, .list = &list_b }, | ||
| 598 | }; | ||
| 599 | |||
| 600 | for (items) |item| { | ||
| 601 | var cp_iter = CodePointIterator{ .bytes = item.str }; | ||
| 602 | while (cp_iter.next()) |cp| { | ||
| 603 | if (norm_props.toNfkcCaseFold(cp.code)) |nfkcf| { | ||
| 604 | for (nfkcf) |c| { | ||
| 605 | if (c == 0) break; | ||
| 606 | item.list.appendAssumeCapacity(c); | ||
| 607 | } | ||
| 608 | } else { | ||
| 609 | item.list.appendAssumeCapacity(cp.code); // maps to itself | ||
| 610 | } | ||
| 611 | } | ||
| 612 | } | ||
| 613 | |||
| 614 | return std.mem.eql(u21, list_a.items, list_b.items); | ||
| 615 | } | ||
| 616 | |||
| 617 | test "eqlIdentifiers" { | ||
| 618 | try std.testing.expect(try eqlIdentifiers(std.testing.allocator, "Foé", "foé")); | ||
| 619 | } | ||
| 620 | |||
| 621 | /// Tests for equality of `a` and `b` after normalizing to NFD. | ||
| 622 | pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | ||
| 623 | var norm_result_a = try self.nfd(allocator, a); | ||
| 624 | defer norm_result_a.deinit(); | ||
| 625 | var norm_result_b = try self.nfd(allocator, b); | ||
| 626 | defer norm_result_b.deinit(); | ||
| 627 | |||
| 628 | return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | ||
| 629 | } | ||
| 630 | |||
| 631 | test "eql" { | ||
| 632 | const allocator = std.testing.allocator; | ||
| 633 | var n = try init(allocator); | ||
| 634 | defer n.deinit(); | ||
| 635 | |||
| 636 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | ||
| 637 | try std.testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | ||
| 638 | } | ||
| 639 | |||
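// U+0345 COMBINING GREEK YPOGEGRAMMENI and the precomposed Greek letters
// whose canonical decompositions contain it: case folding maps U+0345 to
// U+03B9, so these must be decomposed before folding for a stable
// caseless comparison.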
| 640 | fn requiresNfdBeforeCaseFold(cp: u21) bool { | ||
| 641 | return switch (cp) { | ||
| 642 | 0x0345 => true, | ||
| 643 | 0x1F80...0x1FAF => true, | ||
| 644 | 0x1FB2...0x1FB4 => true, | ||
| 645 | 0x1FB7 => true, | ||
| 646 | 0x1FBC => true, | ||
| 647 | 0x1FC2...0x1FC4 => true, | ||
| 648 | 0x1FC7 => true, | ||
| 649 | 0x1FCC => true, | ||
| 650 | 0x1FF2...0x1FF4 => true, | ||
| 651 | 0x1FF7 => true, | ||
| 652 | 0x1FFC => true, | ||
| 653 | else => false, | ||
| 654 | }; | ||
| 655 | } | ||
| 656 | |||
| 657 | fn requiresPreNfd(str: []const u8) bool { | ||
| 658 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 659 | |||
| 660 | return while (cp_iter.next()) |cp| { | ||
| 661 | if (requiresNfdBeforeCaseFold(cp.code)) break true; | ||
| 662 | } else false; | ||
| 663 | } | ||
| 664 | |||
| 665 | /// `eqlCaseless` tests for equality of `a` and `b` after normalizing to NFD and ignoring letter case. | ||
| 666 | pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u8) !bool { | ||
| 667 | // The long winding road of normalized caseless matching... | ||
| 668 | // NFD(CaseFold(NFD(str))) or NFD(CaseFold(str)) | ||
| 669 | var norm_result_a: Result = Result{ .slice = a }; | ||
| 670 | if (requiresPreNfd(a)) { | ||
| 671 | if (!self.isFcd(a)) { | ||
| 672 | norm_result_a = try self.nfd(allocator, a); | ||
| 673 | } | ||
| 674 | } | ||
| 675 | defer norm_result_a.deinit(); | ||
| 676 | |||
| 677 | const cf_a = try case_fold_map.caseFoldStr(allocator, norm_result_a.slice); | ||
| 678 | defer allocator.free(cf_a); | ||
| 679 | norm_result_a.deinit(); | ||
| 680 | norm_result_a = try self.nfd(allocator, cf_a); | ||
| 681 | |||
| 682 | var norm_result_b: Result = Result{ .slice = b }; | ||
| 683 | if (requiresPreNfd(b)) { | ||
| 684 | if (!self.isFcd(b)) { | ||
| 685 | norm_result_b = try self.nfd(allocator, b); | ||
| 686 | } | ||
| 687 | } | ||
| 688 | defer norm_result_b.deinit(); | ||
| 689 | |||
| 690 | const cf_b = try case_fold_map.caseFoldStr(allocator, norm_result_b.slice); | ||
| 691 | defer allocator.free(cf_b); | ||
| 692 | norm_result_b.deinit(); | ||
| 693 | norm_result_b = try self.nfd(allocator, cf_b); | ||
| 694 | |||
| 695 | return std.mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | ||
| 696 | } | ||
| 697 | |||
| 698 | test "eqlCaseless" { | ||
| 699 | const allocator = std.testing.allocator; | ||
| 700 | var n = try init(allocator); | ||
| 701 | defer n.deinit(); | ||
| 702 | |||
| 703 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | ||
| 704 | try std.testing.expect(try n.eqlCaseless(allocator, "FOÉ", "foe\u{0301}")); // FOÉ == foé | ||
| 705 | } | ||
| 706 | |||
| 707 | // FCD | ||
| 708 | fn getLeadCcc(self: Self, cp: u21) u8 { | ||
| 709 | const dc = self.mapping(cp, .nfd); | ||
| 710 | return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; | ||
| 711 | } | ||
| 712 | |||
| 713 | fn getTrailCcc(self: Self, cp: u21) u8 { | ||
| 714 | const dc = self.mapping(cp, .nfd); | ||
| 715 | const len = for (dc.cps, 0..) |dcp, i| { | ||
| 716 | if (dcp == 0) break i; | ||
| 717 | } else dc.cps.len; | ||
| 718 | const tcp = dc.cps[len -| 1]; | ||
| 719 | return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)]; | ||
| 720 | } | ||
| 721 | |||
| 722 | /// Fast check to detect whether a string is in FCD form, meaning it can be compared as if it were NFC or NFD without further canonical reordering. | ||
| 723 | pub fn isFcd(self: Self, str: []const u8) bool { | ||
| 724 | var prev_ccc: u8 = 0; | ||
| 725 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 726 | |||
| 727 | return while (cp_iter.next()) |cp| { | ||
| 728 | const ccc = self.getLeadCcc(cp.code); | ||
| 729 | if (ccc != 0 and ccc < prev_ccc) break false; | ||
| 730 | prev_ccc = self.getTrailCcc(cp.code); | ||
| 731 | } else true; | ||
| 732 | } | ||
| 733 | |||
| 734 | test "isFcd" { | ||
| 735 | const allocator = std.testing.allocator; | ||
| 736 | var n = try init(allocator); | ||
| 737 | defer n.deinit(); | ||
| 738 | |||
| 739 | const is_nfc = "José \u{3D3}"; | ||
| 740 | try std.testing.expect(n.isFcd(is_nfc)); | ||
| 741 | |||
| 742 | const is_nfd = "Jose\u{301} \u{3d2}\u{301}"; | ||
| 743 | try std.testing.expect(n.isFcd(is_nfd)); | ||
| 744 | |||
| 745 | const not_fcd = "Jose\u{301} \u{3d2}\u{315}\u{301}"; | ||
| 746 | try std.testing.expect(!n.isFcd(not_fcd)); | ||
| 747 | } | ||
| 748 | |||
| 749 | test "Unicode normalization tests" { | ||
| 750 | var arena = std.heap.ArenaAllocator.init(std.testing.allocator); | ||
| 751 | defer arena.deinit(); | ||
| 752 | var allocator = arena.allocator(); | ||
| 753 | |||
| 754 | var n = try init(allocator); | ||
| 755 | defer n.deinit(); | ||
| 756 | |||
| 757 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | ||
| 758 | defer file.close(); | ||
| 759 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 760 | const input_stream = buf_reader.reader(); | ||
| 761 | |||
| 762 | var line_no: usize = 0; | ||
| 763 | var buf: [4096]u8 = undefined; | ||
| 764 | var cp_buf: [4]u8 = undefined; | ||
| 765 | |||
| 766 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| { | ||
| 767 | line_no += 1; | ||
| 768 | // Skip comments or empty lines. | ||
| 769 | if (line.len == 0 or line[0] == '#' or line[0] == '@') continue; | ||
| 770 | // Iterate over fields. | ||
| 771 | var fields = std.mem.split(u8, line, ";"); | ||
| 772 | var field_index: usize = 0; | ||
| 773 | var input: []u8 = undefined; | ||
| 774 | defer allocator.free(input); | ||
| 775 | |||
| 776 | while (fields.next()) |field| : (field_index += 1) { | ||
| 777 | if (field_index == 0) { | ||
| 778 | var i_buf = std.ArrayList(u8).init(allocator); | ||
| 779 | defer i_buf.deinit(); | ||
| 780 | |||
| 781 | var i_fields = std.mem.split(u8, field, " "); | ||
| 782 | while (i_fields.next()) |s| { | ||
| 783 | const icp = try std.fmt.parseInt(u21, s, 16); | ||
| 784 | const len = try std.unicode.utf8Encode(icp, &cp_buf); | ||
| 785 | try i_buf.appendSlice(cp_buf[0..len]); | ||
| 786 | } | ||
| 787 | |||
| 788 | input = try i_buf.toOwnedSlice(); | ||
| 789 | } else if (field_index == 1) { | ||
| 790 | //std.debug.print("\n*** {s} ***\n", .{line}); | ||
| 791 | // NFC, time to test. | ||
| 792 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 793 | defer w_buf.deinit(); | ||
| 794 | |||
| 795 | var w_fields = std.mem.split(u8, field, " "); | ||
| 796 | while (w_fields.next()) |s| { | ||
| 797 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 798 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 799 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 800 | } | ||
| 801 | |||
| 802 | const want = w_buf.items; | ||
| 803 | var got = try n.nfc(allocator, input); | ||
| 804 | defer got.deinit(); | ||
| 805 | |||
| 806 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 807 | } else if (field_index == 2) { | ||
| 808 | // NFD, time to test. | ||
| 809 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 810 | defer w_buf.deinit(); | ||
| 811 | |||
| 812 | var w_fields = std.mem.split(u8, field, " "); | ||
| 813 | while (w_fields.next()) |s| { | ||
| 814 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 815 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 816 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 817 | } | ||
| 818 | |||
| 819 | const want = w_buf.items; | ||
| 820 | var got = try n.nfd(allocator, input); | ||
| 821 | defer got.deinit(); | ||
| 822 | |||
| 823 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 824 | } else if (field_index == 3) { | ||
| 825 | // NFKC, time to test. | ||
| 826 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 827 | defer w_buf.deinit(); | ||
| 828 | |||
| 829 | var w_fields = std.mem.split(u8, field, " "); | ||
| 830 | while (w_fields.next()) |s| { | ||
| 831 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 832 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 833 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 834 | } | ||
| 835 | |||
| 836 | const want = w_buf.items; | ||
| 837 | var got = try n.nfkc(allocator, input); | ||
| 838 | defer got.deinit(); | ||
| 839 | |||
| 840 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 841 | } else if (field_index == 4) { | ||
| 842 | // NFKD, time to test. | ||
| 843 | var w_buf = std.ArrayList(u8).init(allocator); | ||
| 844 | defer w_buf.deinit(); | ||
| 845 | |||
| 846 | var w_fields = std.mem.split(u8, field, " "); | ||
| 847 | while (w_fields.next()) |s| { | ||
| 848 | const wcp = try std.fmt.parseInt(u21, s, 16); | ||
| 849 | const len = try std.unicode.utf8Encode(wcp, &cp_buf); | ||
| 850 | try w_buf.appendSlice(cp_buf[0..len]); | ||
| 851 | } | ||
| 852 | |||
| 853 | const want = w_buf.items; | ||
| 854 | var got = try n.nfkd(allocator, input); | ||
| 855 | defer got.deinit(); | ||
| 856 | |||
| 857 | try std.testing.expectEqualStrings(want, got.slice); | ||
| 858 | } else { | ||
| 859 | continue; | ||
| 860 | } | ||
| 861 | } | ||
| 862 | } | ||
| 863 | } | ||