From 3c2c30bfbe861c6c48acd8d7507886787197a788 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 30 Apr 2025 12:58:26 -0400 Subject: Merge NormData with Normalize --- src/CanonData.zig | 50 ++++++------- src/CaseFold.zig | 12 ++-- src/CombiningData.zig | 44 ++++++------ src/CompatData.zig | 36 +++++----- src/FoldData.zig | 78 ++++++++++---------- src/HangulData.zig | 42 +++++------ src/NormData.zig | 37 ---------- src/NormPropsData.zig | 50 ++++++------- src/Normalize.zig | 193 ++++++++++++++++++++++++++++---------------------- src/unicode_tests.zig | 5 +- 10 files changed, 269 insertions(+), 278 deletions(-) delete mode 100644 src/NormData.zig (limited to 'src') diff --git a/src/CanonData.zig b/src/CanonData.zig index 794748c..c67d1d6 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -1,14 +1,11 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; +//! Canonicalization Data nfc: std.AutoHashMapUnmanaged([2]u21, u21), nfd: [][]u21 = undefined, -const Self = @This(); +const CanonData = @This(); -pub fn init(allocator: mem.Allocator) !Self { +pub fn init(allocator: mem.Allocator) !CanonData { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("canon"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -16,49 +13,54 @@ pub fn init(allocator: mem.Allocator) !Self { var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); - var self = Self{ - .nfc = .{}, + var cdata = CanonData{ + .nfc = .empty, .nfd = try allocator.alloc([]u21, 0x110000), }; var slices: usize = 0; errdefer { - self.nfc.deinit(allocator); - for (self.nfd[0..slices]) |slice| allocator.free(slice); - allocator.free(self.nfd); + cdata.nfc.deinit(allocator); + for (cdata.nfd[0..slices]) |slice| allocator.free(slice); + allocator.free(cdata.nfd); } - @memset(self.nfd, &.{}); + @memset(cdata.nfd, &.{}); while (true) { const len: u8 = try reader.readInt(u8, endian); if (len == 0) break; const cp = try reader.readInt(u24, endian); - self.nfd[cp] = try allocator.alloc(u21, len - 1); + cdata.nfd[cp] = try allocator.alloc(u21, len - 1); slices += 1; for (0..len - 1) |i| { - self.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); + cdata.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); } if (len == 3) { - try self.nfc.put(allocator, self.nfd[cp][0..2].*, @intCast(cp)); + try cdata.nfc.put(allocator, cdata.nfd[cp][0..2].*, @intCast(cp)); } } - return self; + return cdata; } -pub fn deinit(self: *Self, allocator: mem.Allocator) void { - self.nfc.deinit(allocator); - for (self.nfd) |slice| allocator.free(slice); - allocator.free(self.nfd); +pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { + cdata.nfc.deinit(allocator); + for (cdata.nfd) |slice| allocator.free(slice); + allocator.free(cdata.nfd); } /// Returns canonical decomposition for `cp`. -pub fn toNfd(self: Self, cp: u21) []const u21 { - return self.nfd[cp]; +pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { + return cdata.nfd[cp]; } // Returns the primary composite for the codepoints in `cp`. -pub fn toNfc(self: Self, cps: [2]u21) ?u21 { - return self.nfc.get(cps); +pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { + return cdata.nfc.get(cps); } + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; diff --git a/src/CaseFold.zig b/src/CaseFold.zig index c84a420..6490aea 100644 --- a/src/CaseFold.zig +++ b/src/CaseFold.zig @@ -95,10 +95,8 @@ pub fn compatCaselessMatch( test "compatCaselessMatch" { const allocator = testing.allocator; - var norm_data = Normalize.NormData{}; - try norm_data.init(allocator); - defer norm_data.deinit(allocator); - const n = Normalize{ .norm_data = &norm_data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const fold_data = try FoldData.init(allocator); defer fold_data.deinit(allocator); @@ -171,10 +169,8 @@ pub fn canonCaselessMatch( test "canonCaselessMatch" { const allocator = testing.allocator; - var norm_data = Normalize.NormData{}; - try norm_data.init(allocator); - defer norm_data.deinit(allocator); - const n = Normalize{ .norm_data = &norm_data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const fold_data = try FoldData.init(allocator); defer fold_data.deinit(allocator); diff --git a/src/CombiningData.zig b/src/CombiningData.zig index b5e227a..fd64a3b 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig @@ -1,14 +1,11 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; +//! Combining Class Data s1: []u16 = undefined, s2: []u8 = undefined, -const Self = @This(); +const CombiningData = @This(); -pub fn init(allocator: mem.Allocator) !Self { +pub fn init(allocator: mem.Allocator) !CombiningData { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("ccc"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -17,32 +14,37 @@ pub fn init(allocator: mem.Allocator) !Self { const endian = builtin.cpu.arch.endian(); - var self = Self{}; + var cbdata = CombiningData{}; const stage_1_len: u16 = try reader.readInt(u16, endian); - self.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(self.s1); - for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + cbdata.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(cbdata.s1); + for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); - self.s2 = try allocator.alloc(u8, stage_2_len); - errdefer allocator.free(self.s2); - _ = try reader.readAll(self.s2); + cbdata.s2 = try allocator.alloc(u8, stage_2_len); + errdefer allocator.free(cbdata.s2); + _ = try reader.readAll(cbdata.s2); - return self; + return cbdata; } -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); +pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void { + allocator.free(cbdata.s1); + allocator.free(cbdata.s2); } /// Returns the canonical combining class for a code point. -pub fn ccc(self: Self, cp: u21) u8 { - return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; +pub fn ccc(cbdata: CombiningData, cp: u21) u8 { + return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)]; } /// True if `cp` is a starter code point, not a combining character. -pub fn isStarter(self: Self, cp: u21) bool { - return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0; +pub fn isStarter(cbdata: CombiningData, cp: u21) bool { + return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0; } + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; diff --git a/src/CompatData.zig b/src/CompatData.zig index ac08048..d787103 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig @@ -1,13 +1,10 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; +//! Compatibility Data nfkd: [][]u21 = undefined, -const Self = @This(); +const CompatData = @This(); -pub fn init(allocator: mem.Allocator) !Self { +pub fn init(allocator: mem.Allocator) !CompatData { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("compat"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -15,34 +12,39 @@ pub fn init(allocator: mem.Allocator) !Self { var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); - var self = Self{ + var cpdata = CompatData{ .nfkd = try allocator.alloc([]u21, 0x110000), }; - errdefer self.deinit(allocator); + errdefer cpdata.deinit(allocator); - @memset(self.nfkd, &.{}); + @memset(cpdata.nfkd, &.{}); while (true) { const len: u8 = try reader.readInt(u8, endian); if (len == 0) break; const cp = try reader.readInt(u24, endian); - self.nfkd[cp] = try allocator.alloc(u21, len - 1); + cpdata.nfkd[cp] = try allocator.alloc(u21, len - 1); for (0..len - 1) |i| { - self.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian)); + cpdata.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian)); } } - return self; + return cpdata; } -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - for (self.nfkd) |slice| { +pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void { + for (cpdata.nfkd) |slice| { if (slice.len != 0) allocator.free(slice); } - allocator.free(self.nfkd); + allocator.free(cpdata.nfkd); } /// Returns compatibility decomposition for `cp`. -pub fn toNfkd(self: Self, cp: u21) []u21 { - return self.nfkd[cp]; +pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 { + return cpdata.nfkd[cp]; } + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; diff --git a/src/FoldData.zig b/src/FoldData.zig index e44e714..b7fdceb 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig @@ -12,9 +12,9 @@ stage1: []u8 = undefined, stage2: []u8 = undefined, stage3: []i24 = undefined, -const Self = @This(); +const FoldData = @This(); -pub fn init(allocator: mem.Allocator) !Self { +pub fn init(allocator: mem.Allocator) !FoldData { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("fold"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -23,61 +23,61 @@ pub fn init(allocator: mem.Allocator) !Self { const endian = builtin.cpu.arch.endian(); - var self = Self{}; - self.cutoff = @intCast(try reader.readInt(u24, endian)); - self.multiple_start = @intCast(try reader.readInt(u24, endian)); + var fdata = FoldData{}; + fdata.cutoff = @intCast(try reader.readInt(u24, endian)); + fdata.multiple_start = @intCast(try reader.readInt(u24, endian)); var len = try reader.readInt(u16, endian); - self.stage1 = try allocator.alloc(u8, len); - errdefer allocator.free(self.stage1); - for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian); + fdata.stage1 = try allocator.alloc(u8, len); + errdefer allocator.free(fdata.stage1); + for (0..len) |i| fdata.stage1[i] = try reader.readInt(u8, endian); len = try reader.readInt(u16, endian); - self.stage2 = try allocator.alloc(u8, len); - errdefer allocator.free(self.stage2); - for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian); + fdata.stage2 = try allocator.alloc(u8, len); + errdefer allocator.free(fdata.stage2); + for (0..len) |i| fdata.stage2[i] = try reader.readInt(u8, endian); len = try reader.readInt(u16, endian); - self.stage3 = try allocator.alloc(i24, len); - errdefer allocator.free(self.stage3); - for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); + fdata.stage3 = try allocator.alloc(i24, len); + errdefer allocator.free(fdata.stage3); + for (0..len) |i| fdata.stage3[i] = try reader.readInt(i24, endian); - self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); - self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); + fdata.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); + fdata.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); len = try reader.readInt(u16, endian); - self.cwcf_exceptions = try allocator.alloc(u21, len); - errdefer allocator.free(self.cwcf_exceptions); - for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); + fdata.cwcf_exceptions = try allocator.alloc(u21, len); + errdefer allocator.free(fdata.cwcf_exceptions); + for (0..len) |i| fdata.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); - return self; + return fdata; } -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.stage1); - allocator.free(self.stage2); - allocator.free(self.stage3); - allocator.free(self.cwcf_exceptions); +pub fn deinit(fdata: *const FoldData, allocator: mem.Allocator) void { + allocator.free(fdata.stage1); + allocator.free(fdata.stage2); + allocator.free(fdata.stage3); + allocator.free(fdata.cwcf_exceptions); } /// Returns the case fold for `cp`. -pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { - if (cp >= self.cutoff) return &.{}; +pub fn caseFold(fdata: *const FoldData, cp: u21, buf: []u21) []const u21 { + if (cp >= fdata.cutoff) return &.{}; - const stage1_val = self.stage1[cp >> 8]; + const stage1_val = fdata.stage1[cp >> 8]; if (stage1_val == 0) return &.{}; const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); - const stage3_index = self.stage2[stage2_index]; + const stage3_index = fdata.stage2[stage2_index]; if (stage3_index & 0x80 != 0) { - const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3; - const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0); + const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; + const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); for (mapping, 0..) |c, i| buf[i] = @intCast(c); return buf[0..mapping.len]; } - const offset = self.stage3[stage3_index]; + const offset = fdata.stage3[stage3_index]; if (offset == 0) return &.{}; buf[0] = @intCast(@as(i32, cp) + offset); @@ -86,14 +86,14 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { } /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). -pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { +pub fn changesWhenCaseFolded(fdata: *const FoldData, cp: u21) bool { var buf: [3]u21 = undefined; - const has_mapping = self.caseFold(cp, &buf).len != 0; - return has_mapping and !self.isCwcfException(cp); + const has_mapping = fdata.caseFold(cp, &buf).len != 0; + return has_mapping and !fdata.isCwcfException(cp); } -fn isCwcfException(self: Self, cp: u21) bool { - return cp >= self.cwcf_exceptions_min and - cp <= self.cwcf_exceptions_max and - std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null; +fn isCwcfException(fdata: *const FoldData, cp: u21) bool { + return cp >= fdata.cwcf_exceptions_min and + cp <= fdata.cwcf_exceptions_max and + std.mem.indexOfScalar(u21, fdata.cwcf_exceptions, cp) != null; } diff --git a/src/HangulData.zig b/src/HangulData.zig index 4bccbe6..8c5f3ad 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig @@ -1,8 +1,4 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; +//! Hangul Data pub const Syllable = enum { none, @@ -16,9 +12,9 @@ pub const Syllable = enum { s1: []u16 = undefined, s2: []u3 = undefined, -const Self = @This(); +const Hangul = @This(); -pub fn init(allocator: mem.Allocator) !Self { +pub fn init(allocator: mem.Allocator) !Hangul { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("hangul"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -26,27 +22,33 @@ pub fn init(allocator: mem.Allocator) !Self { var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); - var self = Self{}; + var hangul = Hangul{}; const stage_1_len: u16 = try reader.readInt(u16, endian); - self.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(self.s1); - for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + hangul.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(hangul.s1); + for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); - self.s2 = try allocator.alloc(u3, stage_2_len); - errdefer allocator.free(self.s2); - for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); + hangul.s2 = try allocator.alloc(u3, stage_2_len); + errdefer allocator.free(hangul.s2); + for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian)); - return self; + return hangul; } -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); +pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void { + allocator.free(hangul.s1); + allocator.free(hangul.s2); } /// Returns the Hangul syllable type for `cp`. -pub fn syllable(self: Self, cp: u21) Syllable { - return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]); +pub fn syllable(hangul: *const Hangul, cp: u21) Syllable { + return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]); } + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; diff --git a/src/NormData.zig b/src/NormData.zig deleted file mode 100644 index a123860..0000000 --- a/src/NormData.zig +++ /dev/null @@ -1,37 +0,0 @@ -const std = @import("std"); -const mem = std.mem; - -const CanonData = @import("CanonData"); -const CccData = @import("CombiningData"); -const CompatData = @import("CompatData"); -const FoldData = @import("FoldData"); -const HangulData = @import("HangulData"); -const NormPropsData = @import("NormPropsData"); - -canon_data: CanonData = undefined, -ccc_data: CccData = undefined, -compat_data: CompatData = undefined, -hangul_data: HangulData = undefined, -normp_data: NormPropsData = undefined, - -const Self = @This(); - -pub fn init(self: *Self, allocator: std.mem.Allocator) !void { - self.canon_data = try CanonData.init(allocator); - errdefer self.canon_data.deinit(allocator); - self.ccc_data = try CccData.init(allocator); - errdefer self.ccc_data.deinit(allocator); - self.compat_data = try CompatData.init(allocator); - errdefer self.compat_data.deinit(allocator); - self.hangul_data = try HangulData.init(allocator); - errdefer self.hangul_data.deinit(allocator); - self.normp_data = try NormPropsData.init(allocator); -} - -pub fn deinit(self: *Self, allocator: mem.Allocator) void { - self.canon_data.deinit(allocator); - self.ccc_data.deinit(allocator); - self.compat_data.deinit(allocator); - self.hangul_data.deinit(allocator); - self.normp_data.deinit(allocator); -} diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index e79ae91..ca69569 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig @@ -1,15 +1,11 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; +//! Normalization Properties Data s1: []u16 = undefined, s2: []u4 = undefined, -const Self = @This(); +const NormProps = @This(); -pub fn init(allocator: mem.Allocator) !Self { +pub fn init(allocator: mem.Allocator) !NormProps { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("normp"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -17,37 +13,43 @@ pub fn init(allocator: mem.Allocator) !Self { var reader = in_decomp.reader(); const endian = builtin.cpu.arch.endian(); - var self = Self{}; + var norms = NormProps{}; const stage_1_len: u16 = try reader.readInt(u16, endian); - self.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(self.s1); - for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + norms.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(norms.s1); + for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); - self.s2 = try allocator.alloc(u4, stage_2_len); - errdefer allocator.free(self.s2); - for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); + norms.s2 = try allocator.alloc(u4, stage_2_len); + errdefer allocator.free(norms.s2); + for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian)); - return self; + return norms; } -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); +pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void { + allocator.free(norms.s1); + allocator.free(norms.s2); } /// Returns true if `cp` is already in NFD form. -pub fn isNfd(self: Self, cp: u21) bool { - return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; +pub fn isNfd(norms: *const NormProps, cp: u21) bool { + return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; } /// Returns true if `cp` is already in NFKD form. -pub fn isNfkd(self: Self, cp: u21) bool { - return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; +pub fn isNfkd(norms: *const NormProps, cp: u21) bool { + return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; } /// Returns true if `cp` is not allowed in any normalized form. -pub fn isFcx(self: Self, cp: u21) bool { - return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +pub fn isFcx(norms: *const NormProps, cp: u21) bool { + return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; } + +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; diff --git a/src/Normalize.zig b/src/Normalize.zig index b738b27..4f014cf 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -2,23 +2,41 @@ //! Unicode Normalization. You can normalize strings into NFC, //! NFKC, NFD, and NFKD normalization forms. -const std = @import("std"); -const debug = std.debug; -const assert = debug.assert; -const fmt = std.fmt; -const heap = std.heap; -const mem = std.mem; -const simd = std.simd; -const testing = std.testing; -const unicode = std.unicode; - -const ascii = @import("ascii"); -const CodePointIterator = @import("code_point").Iterator; -pub const NormData = @import("NormData"); +canon_data: CanonData = undefined, +ccc_data: CccData = undefined, +compat_data: CompatData = undefined, +hangul_data: HangulData = undefined, +normp_data: NormPropsData = undefined, + +const Normalize = @This(); + +pub fn init(allocator: Allocator) !Normalize { + var norm: Normalize = undefined; + try norm.setup(allocator); + return norm; +} -norm_data: *const NormData, +pub fn setup(self: *Normalize, allocator: Allocator) !void { + self.canon_data = try CanonData.init(allocator); + errdefer self.canon_data.deinit(allocator); + self.ccc_data = try CccData.init(allocator); + errdefer self.ccc_data.deinit(allocator); + self.compat_data = try CompatData.init(allocator); + errdefer self.compat_data.deinit(allocator); + self.hangul_data = try HangulData.init(allocator); + errdefer self.hangul_data.deinit(allocator); + self.normp_data = try NormPropsData.init(allocator); +} -const Self = @This(); +pub fn deinit(norm: *const Normalize, allocator: Allocator) void { + // Reasonably safe (?) + var mut_norm = @constCast(norm); + mut_norm.canon_data.deinit(allocator); + mut_norm.ccc_data.deinit(allocator); + mut_norm.compat_data.deinit(allocator); + mut_norm.hangul_data.deinit(allocator); + mut_norm.normp_data.deinit(allocator); +} const SBase: u21 = 0xAC00; const LBase: u21 = 0x1100; @@ -30,8 +48,8 @@ const TCount: u21 = 28; const NCount: u21 = 588; // VCount * TCount const SCount: u21 = 11172; // LCount * NCount -fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { - const kind = self.norm_data.hangul_data.syllable(cp); +fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { + const kind = self.hangul_data.syllable(cp); if (kind != .LV and kind != .LVT) return null; const SIndex: u21 = cp - SBase; @@ -90,21 +108,21 @@ const Decomp = struct { }; // `mapping` retrieves the decomposition mapping for a code point as per the UCD. -fn mapping(self: Self, cp: u21, form: Form) Decomp { +fn mapping(self: Normalize, cp: u21, form: Form) Decomp { var dc = Decomp{}; switch (form) { .nfd => { - dc.cps = self.norm_data.canon_data.toNfd(cp); + dc.cps = self.canon_data.toNfd(cp); if (dc.cps.len != 0) dc.form = .nfd; }, .nfkd => { - dc.cps = self.norm_data.compat_data.toNfkd(cp); + dc.cps = self.compat_data.toNfkd(cp); if (dc.cps.len != 0) { dc.form = .nfkd; } else { - dc.cps = self.norm_data.canon_data.toNfd(cp); + dc.cps = self.canon_data.toNfd(cp); if (dc.cps.len != 0) dc.form = .nfkd; } }, @@ -117,7 +135,7 @@ fn mapping(self: Self, cp: u21, form: Form) Decomp { // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. fn decompose( - self: Self, + self: Normalize, cp: u21, form: Form, buf: []u21, @@ -127,8 +145,8 @@ fn decompose( // NFD / NFKD quick checks. switch (form) { - .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{}, - .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{}, + .nfd => if (self.normp_data.isNfd(cp)) return .{}, + .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), } @@ -175,10 +193,8 @@ fn decompose( test "decompose" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - var n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); var buf: [18]u21 = undefined; @@ -228,42 +244,42 @@ pub const Result = struct { slice: []const u8, /// Ensures that the slice result is a copy of the input, by making a copy if it was not. - pub fn toOwned(result: Result, allocator: mem.Allocator) error{OutOfMemory}!Result { + pub fn toOwned(result: Result, allocator: Allocator) error{OutOfMemory}!Result { if (result.allocated) return result; return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; } - pub fn deinit(self: *const Result, allocator: mem.Allocator) void { + pub fn deinit(self: *const Result, allocator: Allocator) void { if (self.allocated) allocator.free(self.slice); } }; // Compares code points by Canonical Combining Class order. -fn cccLess(self: Self, lhs: u21, rhs: u21) bool { - return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); +fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { + return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); } // Applies the Canonical Sorting Algorithm. -fn canonicalSort(self: Self, cps: []u21) void { +fn canonicalSort(self: Normalize, cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; - while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} + while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} mem.sort(u21, cps[start..i], self, cccLess); } } /// Normalize `str` to NFD. -pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxd(allocator, str, .nfd); } /// Normalize `str` to NFKD. -pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxd(allocator, str, .nfkd); } -pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 { +pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -284,7 +300,7 @@ pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, for return try dcp_list.toOwnedSlice(); } -fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { +fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { // Quick checks. if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; @@ -305,10 +321,8 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A test "nfd ASCII / no-alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfd(allocator, "Hello World!"); defer result.deinit(allocator); @@ -318,10 +332,8 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(allocator); @@ -331,10 +343,8 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfkd(allocator, "Hello World!"); defer result.deinit(allocator); @@ -344,10 +354,8 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); defer result.deinit(allocator); @@ -356,10 +364,10 @@ test "nfkd !ASCII / alloc" { } pub fn nfdCodePoints( - self: Self, - allocator: mem.Allocator, + self: Normalize, + allocator: Allocator, cps: []const u21, -) mem.Allocator.Error![]u21 { +) Allocator.Error![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -381,10 +389,10 @@ pub fn nfdCodePoints( } pub fn nfkdCodePoints( - self: Self, - allocator: mem.Allocator, + self: Normalize, + allocator: Allocator, cps: []const u21, -) mem.Allocator.Error![]u21 { +) Allocator.Error![]u21 { var dcp_list = std.ArrayList(u21).init(allocator); defer dcp_list.deinit(); @@ -407,21 +415,21 @@ pub fn nfkdCodePoints( // Composition (NFC, NFKC) -fn isHangul(self: Self, cp: u21) bool { - return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; +fn isHangul(self: Normalize, cp: u21) bool { + return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; } /// Normalizes `str` to NFC. -pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxc(allocator, str, .nfc); } /// Normalizes `str` to NFKC. -pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { +pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { return self.nfxc(allocator, str, .nfkc); } -fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { +fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { // Quick checks. if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; @@ -446,7 +454,7 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A block_check: while (i < dcps.len) : (i += 1) { const C = dcps[i]; if (C == tombstone) continue :block_check; - const cc_C = self.norm_data.ccc_data.ccc(C); + const cc_C = self.ccc_data.ccc(C); var starter_index: ?usize = null; var j: usize = i; @@ -456,11 +464,11 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A if (dcps[j] == tombstone) continue; // Check for starter. - if (self.norm_data.ccc_data.isStarter(dcps[j])) { + if (self.ccc_data.isStarter(dcps[j])) { // Check for blocking conditions. for (dcps[(j + 1)..i]) |B| { if (B == tombstone) continue; - const cc_B = self.norm_data.ccc_data.ccc(B); + const cc_B = self.ccc_data.ccc(B); if (cc_B != 0 and self.isHangul(C)) continue :block_check; if (cc_B >= cc_C) continue :block_check; } @@ -484,8 +492,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A // them algorithmically if possible. if (self.isHangul(L) and self.isHangul(C)) { // Get Hangul syllable types. - const l_stype = self.norm_data.hangul_data.syllable(L); - const c_stype = self.norm_data.hangul_data.syllable(C); + const l_stype = self.hangul_data.syllable(L); + const c_stype = self.hangul_data.syllable(C); if (l_stype == .LV and c_stype == .T) { // LV, T canonical composition. @@ -508,13 +516,13 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A if (!processed_hangul) { // L, C are not Hangul, so check for primary composite // in the Unicode Character Database. - if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { + if (self.canon_data.toNfc(.{ L, C })) |P| { // We have a primary composite P for L, C. // We must check if P is not in the Full // Composition Exclusions (FCX) list, // preventing it from appearing in any // composed form (NFC, NFKC). - if (!self.norm_data.normp_data.isFcx(P)) { + if (!self.normp_data.isFcx(P)) { dcps[sidx] = P; dcps[i] = tombstone; // Mark for deletion. deleted += 1; @@ -544,10 +552,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A test "nfc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); defer result.deinit(allocator); @@ -557,10 +563,8 @@ test "nfc" { test "nfkc" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); defer result.deinit(allocator); @@ -569,7 +573,7 @@ test "nfkc" { } /// Tests for equality of `a` and `b` after normalizing to NFC. -pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { +pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { const norm_result_a = try self.nfc(allocator, a); defer norm_result_a.deinit(allocator); const norm_result_b = try self.nfc(allocator, b); @@ -580,10 +584,8 @@ pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) ! test "eql" { const allocator = testing.allocator; - var data: NormData = undefined; - try NormData.init(&data, allocator); - defer data.deinit(allocator); - const n = Self{ .norm_data = &data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); @@ -629,3 +631,24 @@ test "isLatin1Only" { const not_latin1_only = "Héllo, World! \u{3d3}"; try testing.expect(!isLatin1Only(not_latin1_only)); } + +const std = @import("std"); +const debug = std.debug; +const assert = debug.assert; +const fmt = std.fmt; +const heap = std.heap; +const mem = std.mem; +const simd = std.simd; +const testing = std.testing; +const unicode = std.unicode; +const Allocator = std.mem.Allocator; + +const ascii = @import("ascii"); +const CodePointIterator = @import("code_point").Iterator; + +const CanonData = @import("CanonData"); +const CccData = @import("CombiningData"); +const CompatData = @import("CompatData"); +const FoldData = @import("FoldData"); +const HangulData = @import("HangulData"); +const NormPropsData = @import("NormPropsData"); diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 3cb5df5..8b9069a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -43,9 +43,8 @@ test "Unicode normalization tests" { defer arena.deinit(); var allocator = arena.allocator(); - var norm_data: Normalize.NormData = undefined; - try Normalize.NormData.init(&norm_data, allocator); - const n = Normalize{ .norm_data = &norm_data }; + const n = try Normalize.init(allocator); + defer n.deinit(allocator); var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); defer file.close(); -- cgit v1.2.3