From 3c2c30bfbe861c6c48acd8d7507886787197a788 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 30 Apr 2025 12:58:26 -0400 Subject: Merge NormData with Normalize --- src/Normalize.zig | 193 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 108 insertions(+), 85 deletions(-) (limited to 'src/Normalize.zig') diff --git a/src/Normalize.zig b/src/Normalize.zig index b738b27..4f014cf 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -2,23 +2,41 @@ //! Unicode Normalization. You can normalize strings into NFC, //! NFKC, NFD, and NFKD normalization forms. -const std = @import("std"); -const debug = std.debug; -const assert = debug.assert; -const fmt = std.fmt; -const heap = std.heap; -const mem = std.mem; -const simd = std.simd; -const testing = std.testing; -const unicode = std.unicode; - -const ascii = @import("ascii"); -const CodePointIterator = @import("code_point").Iterator; -pub const NormData = @import("NormData"); +canon_data: CanonData = undefined, +ccc_data: CccData = undefined, +compat_data: CompatData = undefined, +hangul_data: HangulData = undefined, +normp_data: NormPropsData = undefined, + +const Normalize = @This(); + +pub fn init(allocator: Allocator) !Normalize { + var norm: Normalize = undefined; + try norm.setup(allocator); + return norm; +} -norm_data: *const NormData, +pub fn setup(self: *Normalize, allocator: Allocator) !void { + self.canon_data = try CanonData.init(allocator); + errdefer self.canon_data.deinit(allocator); + self.ccc_data = try CccData.init(allocator); + errdefer self.ccc_data.deinit(allocator); + self.compat_data = try CompatData.init(allocator); + errdefer self.compat_data.deinit(allocator); + self.hangul_data = try HangulData.init(allocator); + errdefer self.hangul_data.deinit(allocator); + self.normp_data = try NormPropsData.init(allocator); +} -const Self = @This(); +pub fn deinit(norm: *const Normalize, allocator: Allocator) void { + // Reasonably safe (?) + var mut_norm = @constCast(norm); + mut_norm.canon_data.deinit(allocator); + mut_norm.ccc_data.deinit(allocator); + mut_norm.compat_data.deinit(allocator); + mut_norm.hangul_data.deinit(allocator); + mut_norm.normp_data.deinit(allocator); +} const SBase: u21 = 0xAC00; const LBase: u21 = 0x1100; @@ -30,8 +48,8 @@ const TCount: u21 = 28; const NCount: u21 = 588; // VCount * TCount const SCount: u21 = 11172; // LCount * NCount -fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { - const kind = self.norm_data.hangul_data.syllable(cp); +fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { + const kind = self.hangul_data.syllable(cp); if (kind != .LV and kind != .LVT) return null; const SIndex: u21 = cp - SBase; @@ -90,21 +108,21 @@ const Decomp = struct { }; // `mapping` retrieves the decomposition mapping for a code point as per the UCD. 
-fn mapping(self: Self, cp: u21, form: Form) Decomp { +fn mapping(self: Normalize, cp: u21, form: Form) Decomp { var dc = Decomp{}; switch (form) { .nfd => { - dc.cps = self.norm_data.canon_data.toNfd(cp); + dc.cps = self.canon_data.toNfd(cp); if (dc.cps.len != 0) dc.form = .nfd; }, .nfkd => { - dc.cps = self.norm_data.compat_data.toNfkd(cp); + dc.cps = self.compat_data.toNfkd(cp); if (dc.cps.len != 0) { dc.form = .nfkd; } else { - dc.cps = self.norm_data.canon_data.toNfd(cp); + dc.cps = self.canon_data.toNfd(cp); if (dc.cps.len != 0) dc.form = .nfkd; } }, @@ -117,7 +135,7 @@ fn mapping(self: Self, cp: u21, form: Form) Decomp { // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. fn decompose( - self: Self, + self: Normalize, cp: u21, form: Form, buf: []u21, @@ -127,8 +145,8 @@ fn decompose( // NFD / NFKD quick checks. switch (form) { - .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{}, - .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{}, + .nfd => if (self.normp_data.isNfd(cp)) return .{}, + .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), } @@ -175,10 +193,8 @@ fn decompose( test "decompose" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - var n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); var buf: [18]u21 = undefined; @@ -228,42 +244,42 @@ pub const Result = struct { slice: []const u8, /// Ensures that the slice result is a copy of the input, by making a copy if it was not. - pub fn toOwned(result: Result, allocator: mem.Allocator) error{OutOfMemory}!Result { + pub fn toOwned(result: Result, allocator: Allocator) error{OutOfMemory}!Result { if (result.allocated) return result; return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; } - pub fn deinit(self: *const Result, allocator: mem.Allocator) void { + pub fn deinit(self: *const Result, allocator: Allocator) void { if (self.allocated) allocator.free(self.slice); } }; // Compares code points by Canonical Combining Class order. -fn cccLess(self: Self, lhs: u21, rhs: u21) bool { - return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); +fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { + return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); } // Applies the Canonical Sorting Algorithm. -fn canonicalSort(self: Self, cps: []u21) void { +fn canonicalSort(self: Normalize, cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; - while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} + while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} mem.sort(u21, cps[start..i], self, cccLess); } } /// Normalize `str` to NFD. -pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxd(allocator, str, .nfd); } /// Normalize `str` to NFKD. 
-pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxd(allocator, str, .nfkd); } -pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 { +pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -284,7 +300,7 @@ pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, for return try dcp_list.toOwnedSlice(); } -fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { +fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { // Quick checks. if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; @@ -305,10 +321,8 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A test "nfd ASCII / no-alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfd(allocator, "Hello World!"); defer result.deinit(allocator); @@ -318,10 +332,8 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(allocator); @@ -331,10 +343,8 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfkd(allocator, "Hello World!"); defer result.deinit(allocator); @@ -344,10 +354,8 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfkd(allocator, "Héllo World! 
\u{3d3}"); defer result.deinit(allocator); @@ -356,10 +364,10 @@ test "nfkd !ASCII / alloc" { } pub fn nfdCodePoints( - self: Self, - allocator: mem.Allocator, + self: Normalize, + allocator: Allocator, cps: []const u21, -) mem.Allocator.Error![]u21 { +) Allocator.Error![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -381,10 +389,10 @@ pub fn nfdCodePoints( } pub fn nfkdCodePoints( - self: Self, - allocator: mem.Allocator, + self: Normalize, + allocator: Allocator, cps: []const u21, -) mem.Allocator.Error![]u21 { +) Allocator.Error![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -407,21 +415,21 @@ pub fn nfkdCodePoints( // Composition (NFC, NFKC) -fn isHangul(self: Self, cp: u21) bool { - return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; +fn isHangul(self: Normalize, cp: u21) bool { + return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; } /// Normalizes `str` to NFC. -pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxc(allocator, str, .nfc); } /// Normalizes `str` to NFKC. -pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxc(allocator, str, .nfkc); } -fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { +fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { // Quick checks. if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; @@ -446,7 +454,7 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A block_check: while (i < dcps.len) : (i += 1) { const C = dcps[i]; if (C == tombstone) continue :block_check; - const cc_C = self.norm_data.ccc_data.ccc(C); + const cc_C = self.ccc_data.ccc(C); var starter_index: ?usize = null; var j: usize = i; @@ -456,11 +464,11 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A if (dcps[j] == tombstone) continue; // Check for starter. - if (self.norm_data.ccc_data.isStarter(dcps[j])) { + if (self.ccc_data.isStarter(dcps[j])) { // Check for blocking conditions. for (dcps[(j + 1)..i]) |B| { if (B == tombstone) continue; - const cc_B = self.norm_data.ccc_data.ccc(B); + const cc_B = self.ccc_data.ccc(B); if (cc_B != 0 and self.isHangul(C)) continue :block_check; if (cc_B >= cc_C) continue :block_check; } @@ -484,8 +492,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A // them algorithmically if possible. if (self.isHangul(L) and self.isHangul(C)) { // Get Hangul syllable types. - const l_stype = self.norm_data.hangul_data.syllable(L); - const c_stype = self.norm_data.hangul_data.syllable(C); + const l_stype = self.hangul_data.syllable(L); + const c_stype = self.hangul_data.syllable(C); if (l_stype == .LV and c_stype == .T) { // LV, T canonical composition. @@ -508,13 +516,13 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A if (!processed_hangul) { // L, C are not Hangul, so check for primary composite // in the Unicode Character Database. 
- if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { + if (self.canon_data.toNfc(.{ L, C })) |P| { // We have a primary composite P for L, C. // We must check if P is not in the Full // Composition Exclusions (FCX) list, // preventing it from appearing in any // composed form (NFC, NFKC). - if (!self.norm_data.normp_data.isFcx(P)) { + if (!self.normp_data.isFcx(P)) { dcps[sidx] = P; dcps[i] = tombstone; // Mark for deletion. deleted += 1; @@ -544,10 +552,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A test "nfc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); defer result.deinit(allocator); @@ -557,10 +563,8 @@ test "nfc" { test "nfkc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); defer result.deinit(allocator); @@ -569,7 +573,7 @@ test "nfkc" { } /// Tests for equality of `a` and `b` after normalizing to NFC. -pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { +pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { const norm_result_a = try self.nfc(allocator, a); defer norm_result_a.deinit(allocator); const norm_result_b = try self.nfc(allocator, b); @@ -580,10 +584,8 @@ pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) ! test "eql" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); @@ -629,3 +631,24 @@ test "isLatin1Only" { const not_latin1_only = "Héllo, World! \u{3d3}"; try testing.expect(!isLatin1Only(not_latin1_only)); } + +const std = @import("std"); +const debug = std.debug; +const assert = debug.assert; +const fmt = std.fmt; +const heap = std.heap; +const mem = std.mem; +const simd = std.simd; +const testing = std.testing; +const unicode = std.unicode; +const Allocator = std.mem.Allocator; + +const ascii = @import("ascii"); +const CodePointIterator = @import("code_point").Iterator; + +const CanonData = @import("CanonData"); +const CccData = @import("CombiningData"); +const CompatData = @import("CompatData"); +const FoldData = @import("FoldData"); +const HangulData = @import("HangulData"); +const NormPropsData = @import("NormPropsData"); -- cgit v1.2.3
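Below is a minimal usage sketch of the merged API introduced by this patch. It is not part of the commit: the allocator setup and the module import name "Normalize" are assumptions (following this repo's convention of importing modules by file name, e.g. @import("CanonData")); the init, deinit, and nfc signatures are taken directly from the diff above, which replaces the old two-step NormData.init(&data, allocator) plus Normalize{ .norm_data = &data } construction with a single Normalize.init call.

// Usage sketch (assumed import name and allocator choice, see note above).
const std = @import("std");
const Normalize = @import("Normalize");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // One init call now owns all five data tables
    // (canon, ccc, compat, hangul, normp); deinit releases them.
    const n = try Normalize.init(allocator);
    defer n.deinit(allocator);

    // Compose U+03D2 U+0301 into U+03D3, as in the "nfc" test above.
    // Result.deinit frees the slice only if a copy was allocated.
    const result = try n.nfc(allocator, "\u{3D2}\u{301}");
    defer result.deinit(allocator);

    std.debug.print("{s}\n", .{result.slice});
}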