From 95f9487f6a7bde2d7266399bdf6843b97cc1b301 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Thu, 5 Feb 2026 07:07:40 -0500 Subject: Base units do not allocate CanonData included. I may still sort out caseless matching without allocation, but that's a stretch goal. Closes #86 Closes #85 --- src/Normalize.zig | 143 ++++++++++++++++++------------------------------------ 1 file changed, 48 insertions(+), 95 deletions(-) (limited to 'src/Normalize.zig') diff --git a/src/Normalize.zig b/src/Normalize.zig index 3191a8c..865318f 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -2,25 +2,8 @@ //! Unicode Normalization. You can normalize strings into NFC, //! NFKC, NFD, and NFKD normalization forms. -canon_data: CanonData = undefined, - const Normalize = @This(); -pub fn init(allocator: Allocator) !Normalize { - var norm: Normalize = undefined; - try norm.setup(allocator); - return norm; -} - -pub fn setup(self: *Normalize, allocator: Allocator) !void { - self.canon_data = try CanonData.init(allocator); -} - -pub fn deinit(norm: *const Normalize, allocator: Allocator) void { - const mut_norm = @constCast(norm); - mut_norm.canon_data.deinit(allocator); -} - const SBase: u21 = 0xAC00; const LBase: u21 = 0x1100; const VBase: u21 = 0x1161; @@ -91,12 +74,12 @@ const Decomp = struct { }; // `mapping` retrieves the decomposition mapping for a code point as per the UCD. -fn mapping(self: Normalize, cp: u21, form: Form) Decomp { +fn mapping(cp: u21, form: Form) Decomp { var dc = Decomp{}; switch (form) { .nfd => { - dc.cps = self.canon_data.toNfd(cp); + dc.cps = CanonData.toNfd(cp); if (dc.cps.len != 0) dc.form = .nfd; }, @@ -105,7 +88,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { if (dc.cps.len != 0) { dc.form = .nfkd; } else { - dc.cps = self.canon_data.toNfd(cp); + dc.cps = CanonData.toNfd(cp); if (dc.cps.len != 0) dc.form = .nfkd; } }, @@ -117,12 +100,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { } // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. -fn decompose( - self: Normalize, - cp: u21, - form: Form, - buf: []u21, -) Decomp { +fn decompose(cp: u21, form: Form, buf: []u21) Decomp { // ASCII if (cp < 128) return .{}; @@ -149,7 +127,7 @@ fn decompose( // Look at previous code point in work queue. work_index -= 1; const next = work[work_index]; - const m = self.mapping(next, form); + const m = Normalize.mapping(next, form); // No more of decompositions for this code point. if (m.form == .same) { @@ -175,44 +153,41 @@ fn decompose( } test "decompose" { - const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); var buf: [18]u21 = undefined; - var dc = n.decompose('é', .nfd, &buf); + var dc = Normalize.decompose('é', .nfd, &buf); try testing.expect(dc.form == .nfd); try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); - dc = n.decompose('\u{1e0a}', .nfd, &buf); + dc = Normalize.decompose('\u{1e0a}', .nfd, &buf); try testing.expect(dc.form == .nfd); try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); - dc = n.decompose('\u{1e0a}', .nfkd, &buf); + dc = Normalize.decompose('\u{1e0a}', .nfkd, &buf); try testing.expect(dc.form == .nfkd); try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); - dc = n.decompose('\u{3189}', .nfd, &buf); + dc = Normalize.decompose('\u{3189}', .nfd, &buf); try testing.expect(dc.form == .same); try testing.expect(dc.cps.len == 0); - dc = n.decompose('\u{3189}', .nfkd, &buf); + dc = Normalize.decompose('\u{3189}', .nfkd, &buf); try testing.expect(dc.form == .nfkd); try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); - dc = n.decompose('\u{ace1}', .nfd, &buf); + dc = Normalize.decompose('\u{ace1}', .nfd, &buf); try testing.expect(dc.form == .nfd); try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); - dc = n.decompose('\u{ace1}', .nfkd, &buf); + dc = Normalize.decompose('\u{ace1}', .nfkd, &buf); try testing.expect(dc.form == .nfd); try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); - dc = n.decompose('\u{3d3}', .nfd, &buf); + dc = Normalize.decompose('\u{3d3}', .nfd, &buf); try testing.expect(dc.form == .nfd); try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); - dc = n.decompose('\u{3d3}', .nfkd, &buf); + dc = Normalize.decompose('\u{3d3}', .nfkd, &buf); try testing.expect(dc.form == .nfkd); try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); } @@ -231,8 +206,8 @@ pub const Result = struct { return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; } - pub fn deinit(self: *const Result, allocator: Allocator) void { - if (self.allocated) allocator.free(self.slice); + pub fn deinit(result: *const Result, allocator: Allocator) void { + if (result.allocated) allocator.free(result.slice); } }; @@ -252,16 +227,16 @@ fn canonicalSort(cps: []u21) void { } /// Normalize `str` to NFD. -pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { - return self.nfxd(allocator, str, .nfd); +pub fn nfd(allocator: Allocator, str: []const u8) Allocator.Error!Result { + return Normalize.nfxd(allocator, str, .nfd); } /// Normalize `str` to NFKD. -pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { - return self.nfxd(allocator, str, .nfkd); +pub fn nfkd(allocator: Allocator, str: []const u8) Allocator.Error!Result { + return Normalize.nfxd(allocator, str, .nfkd); } -pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { +pub fn nfxdCodePoints(allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { var dcp_list = std.array_list.Managed(u21).init(allocator); defer dcp_list.deinit(); @@ -269,7 +244,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo var dc_buf: [18]u21 = undefined; while (cp_iter.next()) |cp| { - const dc = self.decompose(cp.code, form, &dc_buf); + const dc = Normalize.decompose(cp.code, form, &dc_buf); if (dc.form == .same) { try dcp_list.append(cp.code); } else { @@ -282,11 +257,11 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo return try dcp_list.toOwnedSlice(); } -fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { +fn nfxd(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { // Quick checks. if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; - const dcps = try self.nfxdCodePoints(allocator, str, form); + const dcps = try Normalize.nfxdCodePoints(allocator, str, form); defer allocator.free(dcps); var dstr_list = std.array_list.Managed(u8).init(allocator); @@ -303,10 +278,8 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo test "nfd ASCII / no-alloc" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - const result = try n.nfd(allocator, "Hello World!"); + const result = try Normalize.nfd(allocator, "Hello World!"); defer result.deinit(allocator); try testing.expectEqualStrings("Hello World!", result.slice); @@ -314,10 +287,8 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); + const result = try Normalize.nfd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(allocator); try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); @@ -325,10 +296,8 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - const result = try n.nfkd(allocator, "Hello World!"); + const result = try Normalize.nfkd(allocator, "Hello World!"); defer result.deinit(allocator); try testing.expectEqualStrings("Hello World!", result.slice); @@ -336,27 +305,21 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); + const result = try Normalize.nfkd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(allocator); try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); } -pub fn nfdCodePoints( - self: Normalize, - allocator: Allocator, - cps: []const u21, -) Allocator.Error![]u21 { +pub fn nfdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 { var dcp_list = std.array_list.Managed(u21).init(allocator); defer dcp_list.deinit(); var dc_buf: [18]u21 = undefined; for (cps) |cp| { - const dc = self.decompose(cp, .nfd, &dc_buf); + const dc = Normalize.decompose(cp, .nfd, &dc_buf); if (dc.form == .same) { try dcp_list.append(cp); @@ -370,18 +333,14 @@ pub fn nfdCodePoints( return try dcp_list.toOwnedSlice(); } -pub fn nfkdCodePoints( - self: Normalize, - allocator: Allocator, - cps: []const u21, -) Allocator.Error![]u21 { +pub fn nfkdCodePoints(allocator: Allocator, cps: []const u21) Allocator.Error![]u21 { var dcp_list = std.array_list.Managed(u21).init(allocator); defer dcp_list.deinit(); var dc_buf: [18]u21 = undefined; for (cps) |cp| { - const dc = self.decompose(cp, .nfkd, &dc_buf); + const dc = Normalize.decompose(cp, .nfkd, &dc_buf); if (dc.form == .same) { try dcp_list.append(cp); @@ -402,29 +361,29 @@ fn isHangul(cp: u21) bool { } /// Normalizes `str` to NFC. -pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { - return self.nfxc(allocator, str, .nfc); +pub fn nfc(allocator: Allocator, str: []const u8) Allocator.Error!Result { + return Normalize.nfxc(allocator, str, .nfc); } /// Normalizes `str` to NFKC. -pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { - return self.nfxc(allocator, str, .nfkc); +pub fn nfkc(allocator: Allocator, str: []const u8) Allocator.Error!Result { + return Normalize.nfxc(allocator, str, .nfkc); } -fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { +fn nfxc(allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { // Quick checks. if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; // Decompose first. var dcps = if (form == .nfc) - try self.nfxdCodePoints(allocator, str, .nfd) + try Normalize.nfxdCodePoints(allocator, str, .nfd) else - try self.nfxdCodePoints(allocator, str, .nfkd); + try Normalize.nfxdCodePoints(allocator, str, .nfkd); defer allocator.free(dcps); // Compose - const tombstone = 0xe000; // Start of BMP Private Use Area + const tombstone = 0x1FFFF; // Convenient Cn noncharacter point // Loop over all decomposed code points. while (true) { @@ -498,7 +457,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo if (!processed_hangul) { // L, C are not Hangul, so check for primary composite // in the Unicode Character Database. - if (self.canon_data.toNfc(.{ L, C })) |P| { + if (CanonData.toNfc(.{ L, C })) |P| { // We have a primary composite P for L, C. // We must check if P is not in the Full // Composition Exclusions (FCX) list, @@ -534,10 +493,8 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo test "nfc" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); + const result = try Normalize.nfc(allocator, "Complex char: \u{3D2}\u{301}"); defer result.deinit(allocator); try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); @@ -545,20 +502,18 @@ test "nfc" { test "nfkc" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); + const result = try Normalize.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); defer result.deinit(allocator); try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); } /// Tests for equality of `a` and `b` after normalizing to NFC. -pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { - const norm_result_a = try self.nfc(allocator, a); +pub fn eql(allocator: Allocator, a: []const u8, b: []const u8) !bool { + const norm_result_a = try Normalize.nfc(allocator, a); defer norm_result_a.deinit(allocator); - const norm_result_b = try self.nfc(allocator, b); + const norm_result_b = try Normalize.nfc(allocator, b); defer norm_result_b.deinit(allocator); return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); @@ -566,11 +521,9 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) test "eql" { const allocator = testing.allocator; - var n = try Normalize.init(allocator); - defer n.deinit(allocator); - try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); - try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); + try testing.expect(try Normalize.eql(allocator, "foé", "foe\u{0301}")); + try testing.expect(try Normalize.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); } /// Returns true if `str` only contains Latin-1 Supplement -- cgit v1.2.3