diff options
| author | 2026-02-05 07:07:40 -0500 | |
|---|---|---|
| committer | 2026-02-05 07:07:40 -0500 | |
| commit | 95f9487f6a7bde2d7266399bdf6843b97cc1b301 (patch) | |
| tree | 122cd20fa574861e807844974b49eb2f91285d3c /src/Normalize.zig | |
| parent | Teasing out canonicalization (diff) | |
| download | zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.gz zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.xz zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.zip | |
Base units do not allocate
CanonData included. I may still sort out caseless matching without
allocation, but that's a stretch goal.
Closes #86
Closes #85
Diffstat (limited to 'src/Normalize.zig')
| -rw-r--r-- | src/Normalize.zig | 143 |
1 files changed, 48 insertions, 95 deletions
diff --git a/src/Normalize.zig b/src/Normalize.zig index 3191a8c..865318f 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -2,25 +2,8 @@ | |||
| 2 | //! Unicode Normalization. You can normalize strings into NFC, | 2 | //! Unicode Normalization. You can normalize strings into NFC, |
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | canon_data: CanonData = undefined, | ||
| 6 | |||
| 7 | const Normalize = @This(); | 5 | const Normalize = @This(); |
| 8 | 6 | ||
| 9 | pub fn init(allocator: Allocator) !Normalize { | ||
| 10 | var norm: Normalize = undefined; | ||
| 11 | try norm.setup(allocator); | ||
| 12 | return norm; | ||
| 13 | } | ||
| 14 | |||
| 15 | pub fn setup(self: *Normalize, allocator: Allocator) !void { | ||
| 16 | self.canon_data = try CanonData.init(allocator); | ||
| 17 | } | ||
| 18 | |||
| 19 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { | ||
| 20 | const mut_norm = @constCast(norm); | ||
| 21 | mut_norm.canon_data.deinit(allocator); | ||
| 22 | } | ||
| 23 | |||
| 24 | const SBase: u21 = 0xAC00; | 7 | const SBase: u21 = 0xAC00; |
| 25 | const LBase: u21 = 0x1100; | 8 | const LBase: u21 = 0x1100; |
| 26 | const VBase: u21 = 0x1161; | 9 | const VBase: u21 = 0x1161; |
| @@ -91,12 +74,12 @@ const Decomp = struct { | |||
| 91 | }; | 74 | }; |
| 92 | 75 | ||
| 93 | // `mapping` retrieves the decomposition mapping for a code point as per the UCD. | 76 | // `mapping` retrieves the decomposition mapping for a code point as per the UCD. |
| 94 | fn mapping(self: Normalize, cp: u21, form: Form) Decomp { | 77 | fn mapping(cp: u21, form: Form) Decomp { |
| 95 | var dc = Decomp{}; | 78 | var dc = Decomp{}; |
| 96 | 79 | ||
| 97 | switch (form) { | 80 | switch (form) { |
| 98 | .nfd => { | 81 | .nfd => { |
| 99 | dc.cps = self.canon_data.toNfd(cp); | 82 | dc.cps = CanonData.toNfd(cp); |
| 100 | if (dc.cps.len != 0) dc.form = .nfd; | 83 | if (dc.cps.len != 0) dc.form = .nfd; |
| 101 | }, | 84 | }, |
| 102 | 85 | ||
| @@ -105,7 +88,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { | |||
| 105 | if (dc.cps.len != 0) { | 88 | if (dc.cps.len != 0) { |
| 106 | dc.form = .nfkd; | 89 | dc.form = .nfkd; |
| 107 | } else { | 90 | } else { |
| 108 | dc.cps = self.canon_data.toNfd(cp); | 91 | dc.cps = CanonData.toNfd(cp); |
| 109 | if (dc.cps.len != 0) dc.form = .nfkd; | 92 | if (dc.cps.len != 0) dc.form = .nfkd; |
| 110 | } | 93 | } |
| 111 | }, | 94 | }, |
| @@ -117,12 +100,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { | |||
| 117 | } | 100 | } |
| 118 | 101 | ||
| 119 | // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. | 102 | // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. |
| 120 | fn decompose( | 103 | fn decompose(cp: u21, form: Form, buf: []u21) Decomp { |
| 121 | self: Normalize, | ||
| 122 | cp: u21, | ||
| 123 | form: Form, | ||
| 124 | buf: []u21, | ||
| 125 | ) Decomp { | ||
| 126 | // ASCII | 104 | // ASCII |
| 127 | if (cp < 128) return .{}; | 105 | if (cp < 128) return .{}; |
| 128 | 106 | ||
| @@ -149,7 +127,7 @@ fn decompose( | |||
| 149 | // Look at previous code point in work queue. | 127 | // Look at previous code point in work queue. |
| 150 | work_index -= 1; | 128 | work_index -= 1; |
| 151 | const next = work[work_index]; | 129 | const next = work[work_index]; |
| 152 | const m = self.mapping(next, form); | 130 | const m = Normalize.mapping(next, form); |
| 153 | 131 | ||
| 154 | // No more decompositions for this code point. | 132 | // No more decompositions for this code point. |
| 155 | if (m.form == .same) { | 133 | if (m.form == .same) { |
| @@ -175,44 +153,41 @@ fn decompose( | |||
| 175 | } | 153 | } |
| 176 | 154 | ||
| 177 | test "decompose" { | 155 | test "decompose" { |
| 178 | const allocator = testing.allocator; | ||
| 179 | var n = try Normalize.init(allocator); | ||
| 180 | defer n.deinit(allocator); | ||
| 181 | var buf: [18]u21 = undefined; | 156 | var buf: [18]u21 = undefined; |
| 182 | 157 | ||
| 183 | var dc = n.decompose('é', .nfd, &buf); | 158 | var dc = Normalize.decompose('é', .nfd, &buf); |
| 184 | try testing.expect(dc.form == .nfd); | 159 | try testing.expect(dc.form == .nfd); |
| 185 | try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); | 160 | try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); |
| 186 | 161 | ||
| 187 | dc = n.decompose('\u{1e0a}', .nfd, &buf); | 162 | dc = Normalize.decompose('\u{1e0a}', .nfd, &buf); |
| 188 | try testing.expect(dc.form == .nfd); | 163 | try testing.expect(dc.form == .nfd); |
| 189 | try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | 164 | try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); |
| 190 | 165 | ||
| 191 | dc = n.decompose('\u{1e0a}', .nfkd, &buf); | 166 | dc = Normalize.decompose('\u{1e0a}', .nfkd, &buf); |
| 192 | try testing.expect(dc.form == .nfkd); | 167 | try testing.expect(dc.form == .nfkd); |
| 193 | try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | 168 | try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); |
| 194 | 169 | ||
| 195 | dc = n.decompose('\u{3189}', .nfd, &buf); | 170 | dc = Normalize.decompose('\u{3189}', .nfd, &buf); |
| 196 | try testing.expect(dc.form == .same); | 171 | try testing.expect(dc.form == .same); |
| 197 | try testing.expect(dc.cps.len == 0); | 172 | try testing.expect(dc.cps.len == 0); |
| 198 | 173 | ||
| 199 | dc = n.decompose('\u{3189}', .nfkd, &buf); | 174 | dc = Normalize.decompose('\u{3189}', .nfkd, &buf); |
| 200 | try testing.expect(dc.form == .nfkd); | 175 | try testing.expect(dc.form == .nfkd); |
| 201 | try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); | 176 | try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); |
| 202 | 177 | ||
| 203 | dc = n.decompose('\u{ace1}', .nfd, &buf); | 178 | dc = Normalize.decompose('\u{ace1}', .nfd, &buf); |
| 204 | try testing.expect(dc.form == .nfd); | 179 | try testing.expect(dc.form == .nfd); |
| 205 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | 180 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); |
| 206 | 181 | ||
| 207 | dc = n.decompose('\u{ace1}', .nfkd, &buf); | 182 | dc = Normalize.decompose('\u{ace1}', .nfkd, &buf); |
| 208 | try testing.expect(dc.form == .nfd); | 183 | try testing.expect(dc.form == .nfd); |
| 209 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | 184 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); |
| 210 | 185 | ||
| 211 | dc = n.decompose('\u{3d3}', .nfd, &buf); | 186 | dc = Normalize.decompose('\u{3d3}', .nfd, &buf); |
| 212 | try testing.expect(dc.form == .nfd); | 187 | try testing.expect(dc.form == .nfd); |
| 213 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); | 188 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); |
| 214 | 189 | ||
| 215 | dc = n.decompose('\u{3d3}', .nfkd, &buf); | 190 | dc = Normalize.decompose('\u{3d3}', .nfkd, &buf); |
| 216 | try testing.expect(dc.form == .nfkd); | 191 | try testing.expect(dc.form == .nfkd); |
| 217 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); | 192 | try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); |
| 218 | } | 193 | } |
| @@ -231,8 +206,8 @@ pub const Result = struct { | |||
| 231 | return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; | 206 | return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; |
| 232 | } | 207 | } |
| 233 | 208 | ||
| 234 | pub fn deinit(self: *const Result, allocator: Allocator) void { | 209 | pub fn deinit(result: *const Result, allocator: Allocator) void { |
| 235 | if (self.allocated) allocator.free(self.slice); | 210 | if (result.allocated) allocator.free(result.slice); |
| 236 | } | 211 | } |
| 237 | }; | 212 | }; |
| 238 | 213 | ||
| @@ -252,16 +227,16 @@ fn canonicalSort(cps: []u21) void { | |||
| 252 | } | 227 | } |
| 253 | 228 | ||
| 254 | /// Normalize `str` to NFD. | 229 | /// Normalize `str` to NFD. |
| 255 | pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { | 230 | pub fn nfd(allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 256 | return self.nfxd(allocator, str, .nfd); | 231 | return Normalize.nfxd(allocator, str, .nfd); |
| 257 | } | 232 | } |
| 258 | 233 | ||
| 259 | /// Normalize `str` to NFKD. | 234 | /// Normalize `str` to NFKD. |
| 260 | pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { | 235 | pub fn nfkd(allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 261 | return self.nfxd(allocator, str, .nfkd); | 236 | return Normalize.nfxd(allocator, str, .nfkd); |
| 262 | } | 237 | } |
| 263 | 238 | ||
| 264 | pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { | 239 | pub fn nfxdCodePoints(allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { |
| 265 | var dcp_list = std.array_list.Managed(u21).init(allocator); | 240 | var dcp_list = std.array_list.Managed(u21).init(allocator); |
| 266 | defer dcp_list.deinit(); | 241 | defer dcp_list.deinit(); |
| 267 | 242 | ||
| @@ -269,7 +244,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo | |||
| 269 | var dc_buf: [18]u21 = undefined; | 244 | var dc_buf: [18]u21 = undefined; |
| 270 | 245 | ||
| 271 | while (cp_iter.next()) |cp| { | 246 | while (cp_iter.next()) |cp| { |
| 272 | const dc = self.decompose(cp.code, form, &dc_buf); | 247 | const dc = Normalize.decompose(cp.code, form, &dc_buf); |
| 273 | if (dc.form == .same) { | 248 | if (dc.form == .same) { |
| 274 | try dcp_list.append(cp.code); | 249 | try dcp_list.append(cp.code); |
| 275 | } else { | 250 | } else { |
| @@ -282,11 +257,11 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo | |||
| 282 | return try dcp_list.toOwnedSlice(); | 257 | return try dcp_list.toOwnedSlice(); |
| 283 | } | 258 | } |
| 284 | 259 | ||
| 285 | fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { | 260 | fn nfxd(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { |
| 286 | // Quick checks. | 261 | // Quick checks. |
| 287 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 262 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 288 | 263 | ||
| 289 | const dcps = try self.nfxdCodePoints(allocator, str, form); | 264 | const dcps = try Normalize.nfxdCodePoints(allocator, str, form); |
| 290 | defer allocator.free(dcps); | 265 | defer allocator.free(dcps); |
| 291 | 266 | ||
| 292 | var dstr_list = std.array_list.Managed(u8).init(allocator); | 267 | var dstr_list = std.array_list.Managed(u8).init(allocator); |
| @@ -303,10 +278,8 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 303 | 278 | ||
| 304 | test "nfd ASCII / no-alloc" { | 279 | test "nfd ASCII / no-alloc" { |
| 305 | const allocator = testing.allocator; | 280 | const allocator = testing.allocator; |
| 306 | var n = try Normalize.init(allocator); | ||
| 307 | defer n.deinit(allocator); | ||
| 308 | 281 | ||
| 309 | const result = try n.nfd(allocator, "Hello World!"); | 282 | const result = try Normalize.nfd(allocator, "Hello World!"); |
| 310 | defer result.deinit(allocator); | 283 | defer result.deinit(allocator); |
| 311 | 284 | ||
| 312 | try testing.expectEqualStrings("Hello World!", result.slice); | 285 | try testing.expectEqualStrings("Hello World!", result.slice); |
| @@ -314,10 +287,8 @@ test "nfd ASCII / no-alloc" { | |||
| 314 | 287 | ||
| 315 | test "nfd !ASCII / alloc" { | 288 | test "nfd !ASCII / alloc" { |
| 316 | const allocator = testing.allocator; | 289 | const allocator = testing.allocator; |
| 317 | var n = try Normalize.init(allocator); | ||
| 318 | defer n.deinit(allocator); | ||
| 319 | 290 | ||
| 320 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 291 | const result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}"); |
| 321 | defer result.deinit(allocator); | 292 | defer result.deinit(allocator); |
| 322 | 293 | ||
| 323 | try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); | 294 | try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); |
| @@ -325,10 +296,8 @@ test "nfd !ASCII / alloc" { | |||
| 325 | 296 | ||
| 326 | test "nfkd ASCII / no-alloc" { | 297 | test "nfkd ASCII / no-alloc" { |
| 327 | const allocator = testing.allocator; | 298 | const allocator = testing.allocator; |
| 328 | var n = try Normalize.init(allocator); | ||
| 329 | defer n.deinit(allocator); | ||
| 330 | 299 | ||
| 331 | const result = try n.nfkd(allocator, "Hello World!"); | 300 | const result = try Normalize.nfkd(allocator, "Hello World!"); |
| 332 | defer result.deinit(allocator); | 301 | defer result.deinit(allocator); |
| 333 | 302 | ||
| 334 | try testing.expectEqualStrings("Hello World!", result.slice); | 303 | try testing.expectEqualStrings("Hello World!", result.slice); |
| @@ -336,27 +305,21 @@ test "nfkd ASCII / no-alloc" { | |||
| 336 | 305 | ||
| 337 | test "nfkd !ASCII / alloc" { | 306 | test "nfkd !ASCII / alloc" { |
| 338 | const allocator = testing.allocator; | 307 | const allocator = testing.allocator; |
| 339 | var n = try Normalize.init(allocator); | ||
| 340 | defer n.deinit(allocator); | ||
| 341 | 308 | ||
| 342 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 309 | const result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}"); |
| 343 | defer result.deinit(allocator); | 310 | defer result.deinit(allocator); |
| 344 | 311 | ||
| 345 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); | 312 | try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); |
| 346 | } | 313 | } |
| 347 | 314 | ||
| 348 | pub fn nfdCodePoints( | 315 | pub fn nfdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 { |
| 349 | self: Normalize, | ||
| 350 | allocator: Allocator, | ||
| 351 | cps: []const u21, | ||
| 352 | ) Allocator.Error![]u21 { | ||
| 353 | var dcp_list = std.array_list.Managed(u21).init(allocator); | 316 | var dcp_list = std.array_list.Managed(u21).init(allocator); |
| 354 | defer dcp_list.deinit(); | 317 | defer dcp_list.deinit(); |
| 355 | 318 | ||
| 356 | var dc_buf: [18]u21 = undefined; | 319 | var dc_buf: [18]u21 = undefined; |
| 357 | 320 | ||
| 358 | for (cps) |cp| { | 321 | for (cps) |cp| { |
| 359 | const dc = self.decompose(cp, .nfd, &dc_buf); | 322 | const dc = Normalize.decompose(cp, .nfd, &dc_buf); |
| 360 | 323 | ||
| 361 | if (dc.form == .same) { | 324 | if (dc.form == .same) { |
| 362 | try dcp_list.append(cp); | 325 | try dcp_list.append(cp); |
| @@ -370,18 +333,14 @@ pub fn nfdCodePoints( | |||
| 370 | return try dcp_list.toOwnedSlice(); | 333 | return try dcp_list.toOwnedSlice(); |
| 371 | } | 334 | } |
| 372 | 335 | ||
| 373 | pub fn nfkdCodePoints( | 336 | pub fn nfkdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 { |
| 374 | self: Normalize, | ||
| 375 | allocator: Allocator, | ||
| 376 | cps: []const u21, | ||
| 377 | ) Allocator.Error![]u21 { | ||
| 378 | var dcp_list = std.array_list.Managed(u21).init(allocator); | 337 | var dcp_list = std.array_list.Managed(u21).init(allocator); |
| 379 | defer dcp_list.deinit(); | 338 | defer dcp_list.deinit(); |
| 380 | 339 | ||
| 381 | var dc_buf: [18]u21 = undefined; | 340 | var dc_buf: [18]u21 = undefined; |
| 382 | 341 | ||
| 383 | for (cps) |cp| { | 342 | for (cps) |cp| { |
| 384 | const dc = self.decompose(cp, .nfkd, &dc_buf); | 343 | const dc = Normalize.decompose(cp, .nfkd, &dc_buf); |
| 385 | 344 | ||
| 386 | if (dc.form == .same) { | 345 | if (dc.form == .same) { |
| 387 | try dcp_list.append(cp); | 346 | try dcp_list.append(cp); |
| @@ -402,29 +361,29 @@ fn isHangul(cp: u21) bool { | |||
| 402 | } | 361 | } |
| 403 | 362 | ||
| 404 | /// Normalizes `str` to NFC. | 363 | /// Normalizes `str` to NFC. |
| 405 | pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { | 364 | pub fn nfc(allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 406 | return self.nfxc(allocator, str, .nfc); | 365 | return Normalize.nfxc(allocator, str, .nfc); |
| 407 | } | 366 | } |
| 408 | 367 | ||
| 409 | /// Normalizes `str` to NFKC. | 368 | /// Normalizes `str` to NFKC. |
| 410 | pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { | 369 | pub fn nfkc(allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 411 | return self.nfxc(allocator, str, .nfkc); | 370 | return Normalize.nfxc(allocator, str, .nfkc); |
| 412 | } | 371 | } |
| 413 | 372 | ||
| 414 | fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { | 373 | fn nfxc(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { |
| 415 | // Quick checks. | 374 | // Quick checks. |
| 416 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 375 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 417 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; | 376 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; |
| 418 | 377 | ||
| 419 | // Decompose first. | 378 | // Decompose first. |
| 420 | var dcps = if (form == .nfc) | 379 | var dcps = if (form == .nfc) |
| 421 | try self.nfxdCodePoints(allocator, str, .nfd) | 380 | try Normalize.nfxdCodePoints(allocator, str, .nfd) |
| 422 | else | 381 | else |
| 423 | try self.nfxdCodePoints(allocator, str, .nfkd); | 382 | try Normalize.nfxdCodePoints(allocator, str, .nfkd); |
| 424 | defer allocator.free(dcps); | 383 | defer allocator.free(dcps); |
| 425 | 384 | ||
| 426 | // Compose | 385 | // Compose |
| 427 | const tombstone = 0xe000; // Start of BMP Private Use Area | 386 | const tombstone = 0x1FFFF; // Convenient Cn noncharacter point |
| 428 | 387 | ||
| 429 | // Loop over all decomposed code points. | 388 | // Loop over all decomposed code points. |
| 430 | while (true) { | 389 | while (true) { |
| @@ -498,7 +457,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 498 | if (!processed_hangul) { | 457 | if (!processed_hangul) { |
| 499 | // L, C are not Hangul, so check for primary composite | 458 | // L, C are not Hangul, so check for primary composite |
| 500 | // in the Unicode Character Database. | 459 | // in the Unicode Character Database. |
| 501 | if (self.canon_data.toNfc(.{ L, C })) |P| { | 460 | if (CanonData.toNfc(.{ L, C })) |P| { |
| 502 | // We have a primary composite P for L, C. | 461 | // We have a primary composite P for L, C. |
| 503 | // We must check if P is not in the Full | 462 | // We must check if P is not in the Full |
| 504 | // Composition Exclusions (FCX) list, | 463 | // Composition Exclusions (FCX) list, |
| @@ -534,10 +493,8 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 534 | 493 | ||
| 535 | test "nfc" { | 494 | test "nfc" { |
| 536 | const allocator = testing.allocator; | 495 | const allocator = testing.allocator; |
| 537 | var n = try Normalize.init(allocator); | ||
| 538 | defer n.deinit(allocator); | ||
| 539 | 496 | ||
| 540 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 497 | const result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| 541 | defer result.deinit(allocator); | 498 | defer result.deinit(allocator); |
| 542 | 499 | ||
| 543 | try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); | 500 | try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); |
| @@ -545,20 +502,18 @@ test "nfc" { | |||
| 545 | 502 | ||
| 546 | test "nfkc" { | 503 | test "nfkc" { |
| 547 | const allocator = testing.allocator; | 504 | const allocator = testing.allocator; |
| 548 | var n = try Normalize.init(allocator); | ||
| 549 | defer n.deinit(allocator); | ||
| 550 | 505 | ||
| 551 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 506 | const result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| 552 | defer result.deinit(allocator); | 507 | defer result.deinit(allocator); |
| 553 | 508 | ||
| 554 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); | 509 | try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); |
| 555 | } | 510 | } |
| 556 | 511 | ||
| 557 | /// Tests for equality of `a` and `b` after normalizing to NFC. | 512 | /// Tests for equality of `a` and `b` after normalizing to NFC. |
| 558 | pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { | 513 | pub fn eql(allocator: Allocator, a: []const u8, b: []const u8) !bool { |
| 559 | const norm_result_a = try self.nfc(allocator, a); | 514 | const norm_result_a = try Normalize.nfc(allocator, a); |
| 560 | defer norm_result_a.deinit(allocator); | 515 | defer norm_result_a.deinit(allocator); |
| 561 | const norm_result_b = try self.nfc(allocator, b); | 516 | const norm_result_b = try Normalize.nfc(allocator, b); |
| 562 | defer norm_result_b.deinit(allocator); | 517 | defer norm_result_b.deinit(allocator); |
| 563 | 518 | ||
| 564 | return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); | 519 | return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); |
| @@ -566,11 +521,9 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) | |||
| 566 | 521 | ||
| 567 | test "eql" { | 522 | test "eql" { |
| 568 | const allocator = testing.allocator; | 523 | const allocator = testing.allocator; |
| 569 | var n = try Normalize.init(allocator); | ||
| 570 | defer n.deinit(allocator); | ||
| 571 | 524 | ||
| 572 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 525 | try testing.expect(try Normalize.eql(allocator, "foé", "foe\u{0301}")); |
| 573 | try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 526 | try testing.expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| 574 | } | 527 | } |
| 575 | 528 | ||
| 576 | /// Returns true if `str` only contains Latin-1 Supplement | 529 | /// Returns true if `str` only contains Latin-1 Supplement |