From 3c2c30bfbe861c6c48acd8d7507886787197a788 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Wed, 30 Apr 2025 12:58:26 -0400
Subject: Merge NormData with Normalize

---
 src/CanonData.zig     |  50 ++++++-------
 src/CaseFold.zig      |  12 ++--
 src/CombiningData.zig |  44 ++++++------
 src/CompatData.zig    |  36 +++++-----
 src/FoldData.zig      |  78 ++++++++++----------
 src/HangulData.zig    |  42 +++++------
 src/NormData.zig      |  37 ----------
 src/NormPropsData.zig |  50 ++++++-------
 src/Normalize.zig     | 193 ++++++++++++++++++++++++++++----------------------
 src/unicode_tests.zig |   5 +-
 10 files changed, 269 insertions(+), 278 deletions(-)
 delete mode 100644 src/NormData.zig

(limited to 'src')

diff --git a/src/CanonData.zig b/src/CanonData.zig
index 794748c..c67d1d6 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,14 +1,11 @@
-const std = @import("std");
-const builtin = @import("builtin");
-const compress = std.compress;
-const mem = std.mem;
+//! Canonicalization Data
 
 nfc: std.AutoHashMapUnmanaged([2]u21, u21),
 nfd: [][]u21 = undefined,
 
-const Self = @This();
+const CanonData = @This();
 
-pub fn init(allocator: mem.Allocator) !Self {
+pub fn init(allocator: mem.Allocator) !CanonData {
     const decompressor = compress.flate.inflate.decompressor;
     const in_bytes = @embedFile("canon");
     var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -16,49 +13,54 @@ pub fn init(allocator: mem.Allocator) !Self {
     var reader = in_decomp.reader();
 
     const endian = builtin.cpu.arch.endian();
-    var self = Self{
-        .nfc = .{},
+    var cdata = CanonData{
+        .nfc = .empty,
         .nfd = try allocator.alloc([]u21, 0x110000),
     };
 
     var slices: usize = 0;
     errdefer {
-        self.nfc.deinit(allocator);
-        for (self.nfd[0..slices]) |slice| allocator.free(slice);
-        allocator.free(self.nfd);
+        cdata.nfc.deinit(allocator);
+        for (cdata.nfd) |slice| allocator.free(slice); // nfd is indexed by code point, not allocation order; mirror deinit
+        allocator.free(cdata.nfd);
     }
 
-    @memset(self.nfd, &.{});
+    @memset(cdata.nfd, &.{});
 
     while (true) {
         const len: u8 = try reader.readInt(u8, endian);
         if (len == 0) break;
         const cp = try reader.readInt(u24, endian);
-        self.nfd[cp] = try allocator.alloc(u21, len - 1);
+        cdata.nfd[cp] = try allocator.alloc(u21, len - 1);
         slices += 1;
         for (0..len - 1) |i| {
-            self.nfd[cp][i] = @intCast(try reader.readInt(u24, endian));
+            cdata.nfd[cp][i] = @intCast(try reader.readInt(u24, endian));
         }
         if (len == 3) {
-            try self.nfc.put(allocator, self.nfd[cp][0..2].*, @intCast(cp));
+            try cdata.nfc.put(allocator, cdata.nfd[cp][0..2].*, @intCast(cp));
         }
     }
 
-    return self;
+    return cdata;
 }
 
-pub fn deinit(self: *Self, allocator: mem.Allocator) void {
-    self.nfc.deinit(allocator);
-    for (self.nfd) |slice| allocator.free(slice);
-    allocator.free(self.nfd);
+pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
+    cdata.nfc.deinit(allocator);
+    for (cdata.nfd) |slice| allocator.free(slice);
+    allocator.free(cdata.nfd);
 }
 
 /// Returns canonical decomposition for `cp`.
-pub fn toNfd(self: Self, cp: u21) []const u21 {
-    return self.nfd[cp];
+pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
+    return cdata.nfd[cp];
 }
 
 // Returns the primary composite for the codepoints in `cp`.
-pub fn toNfc(self: Self, cps: [2]u21) ?u21 {
-    return self.nfc.get(cps);
+pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
+    return cdata.nfc.get(cps);
 }
+
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index c84a420..6490aea 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -95,10 +95,8 @@ pub fn compatCaselessMatch(
 test "compatCaselessMatch" {
     const allocator = testing.allocator;
 
-    var norm_data = Normalize.NormData{};
-    try norm_data.init(allocator);
-    defer norm_data.deinit(allocator);
-    const n = Normalize{ .norm_data = &norm_data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const fold_data = try FoldData.init(allocator);
     defer fold_data.deinit(allocator);
@@ -171,10 +169,8 @@ pub fn canonCaselessMatch(
 test "canonCaselessMatch" {
     const allocator = testing.allocator;
 
-    var norm_data = Normalize.NormData{};
-    try norm_data.init(allocator);
-    defer norm_data.deinit(allocator);
-    const n = Normalize{ .norm_data = &norm_data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const fold_data = try FoldData.init(allocator);
     defer fold_data.deinit(allocator);
diff --git a/src/CombiningData.zig b/src/CombiningData.zig
index b5e227a..fd64a3b 100644
--- a/src/CombiningData.zig
+++ b/src/CombiningData.zig
@@ -1,14 +1,11 @@
-const std = @import("std");
-const builtin = @import("builtin");
-const compress = std.compress;
-const mem = std.mem;
+//! Combining Class Data
 
 s1: []u16 = undefined,
 s2: []u8 = undefined,
 
-const Self = @This();
+const CombiningData = @This();
 
-pub fn init(allocator: mem.Allocator) !Self {
+pub fn init(allocator: mem.Allocator) !CombiningData {
     const decompressor = compress.flate.inflate.decompressor;
     const in_bytes = @embedFile("ccc");
     var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -17,32 +14,37 @@ pub fn init(allocator: mem.Allocator) !Self {
 
     const endian = builtin.cpu.arch.endian();
 
-    var self = Self{};
+    var cbdata = CombiningData{};
 
     const stage_1_len: u16 = try reader.readInt(u16, endian);
-    self.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(self.s1);
-    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
+    cbdata.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(cbdata.s1);
+    for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian);
 
     const stage_2_len: u16 = try reader.readInt(u16, endian);
-    self.s2 = try allocator.alloc(u8, stage_2_len);
-    errdefer allocator.free(self.s2);
-    _ = try reader.readAll(self.s2);
+    cbdata.s2 = try allocator.alloc(u8, stage_2_len);
+    errdefer allocator.free(cbdata.s2);
+    _ = try reader.readAll(cbdata.s2);
 
-    return self;
+    return cbdata;
 }
 
-pub fn deinit(self: *const Self, allocator: mem.Allocator) void {
-    allocator.free(self.s1);
-    allocator.free(self.s2);
+pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void {
+    allocator.free(cbdata.s1);
+    allocator.free(cbdata.s2);
 }
 
 /// Returns the canonical combining class for a code point.
-pub fn ccc(self: Self, cp: u21) u8 {
-    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
+pub fn ccc(cbdata: CombiningData, cp: u21) u8 {
+    return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)];
 }
 
 /// True if `cp` is a starter code point, not a combining character.
-pub fn isStarter(self: Self, cp: u21) bool {
-    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0;
+pub fn isStarter(cbdata: CombiningData, cp: u21) bool {
+    return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0;
 }
+
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
diff --git a/src/CompatData.zig b/src/CompatData.zig
index ac08048..d787103 100644
--- a/src/CompatData.zig
+++ b/src/CompatData.zig
@@ -1,13 +1,10 @@
-const std = @import("std");
-const builtin = @import("builtin");
-const compress = std.compress;
-const mem = std.mem;
+//! Compatibility Data
 
 nfkd: [][]u21 = undefined,
 
-const Self = @This();
+const CompatData = @This();
 
-pub fn init(allocator: mem.Allocator) !Self {
+pub fn init(allocator: mem.Allocator) !CompatData {
     const decompressor = compress.flate.inflate.decompressor;
     const in_bytes = @embedFile("compat");
     var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -15,34 +12,39 @@ pub fn init(allocator: mem.Allocator) !Self {
     var reader = in_decomp.reader();
 
     const endian = builtin.cpu.arch.endian();
-    var self = Self{
+    var cpdata = CompatData{
         .nfkd = try allocator.alloc([]u21, 0x110000),
     };
-    errdefer self.deinit(allocator);
+    errdefer cpdata.deinit(allocator);
 
-    @memset(self.nfkd, &.{});
+    @memset(cpdata.nfkd, &.{});
 
     while (true) {
         const len: u8 = try reader.readInt(u8, endian);
         if (len == 0) break;
         const cp = try reader.readInt(u24, endian);
-        self.nfkd[cp] = try allocator.alloc(u21, len - 1);
+        cpdata.nfkd[cp] = try allocator.alloc(u21, len - 1);
         for (0..len - 1) |i| {
-            self.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian));
+            cpdata.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian));
         }
     }
 
-    return self;
+    return cpdata;
 }
 
-pub fn deinit(self: *const Self, allocator: mem.Allocator) void {
-    for (self.nfkd) |slice| {
+pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void {
+    for (cpdata.nfkd) |slice| {
         if (slice.len != 0) allocator.free(slice);
     }
-    allocator.free(self.nfkd);
+    allocator.free(cpdata.nfkd);
 }
 
 /// Returns compatibility decomposition for `cp`.
-pub fn toNfkd(self: Self, cp: u21) []u21 {
-    return self.nfkd[cp];
+pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 {
+    return cpdata.nfkd[cp];
 }
+
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
diff --git a/src/FoldData.zig b/src/FoldData.zig
index e44e714..b7fdceb 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -12,9 +12,9 @@ stage1: []u8 = undefined,
 stage2: []u8 = undefined,
 stage3: []i24 = undefined,
 
-const Self = @This();
+const FoldData = @This();
 
-pub fn init(allocator: mem.Allocator) !Self {
+pub fn init(allocator: mem.Allocator) !FoldData {
     const decompressor = compress.flate.inflate.decompressor;
     const in_bytes = @embedFile("fold");
     var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -23,61 +23,61 @@ pub fn init(allocator: mem.Allocator) !Self {
 
     const endian = builtin.cpu.arch.endian();
 
-    var self = Self{};
-    self.cutoff = @intCast(try reader.readInt(u24, endian));
-    self.multiple_start = @intCast(try reader.readInt(u24, endian));
+    var fdata = FoldData{};
+    fdata.cutoff = @intCast(try reader.readInt(u24, endian));
+    fdata.multiple_start = @intCast(try reader.readInt(u24, endian));
 
     var len = try reader.readInt(u16, endian);
-    self.stage1 = try allocator.alloc(u8, len);
-    errdefer allocator.free(self.stage1);
-    for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian);
+    fdata.stage1 = try allocator.alloc(u8, len);
+    errdefer allocator.free(fdata.stage1);
+    for (0..len) |i| fdata.stage1[i] = try reader.readInt(u8, endian);
 
     len = try reader.readInt(u16, endian);
-    self.stage2 = try allocator.alloc(u8, len);
-    errdefer allocator.free(self.stage2);
-    for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian);
+    fdata.stage2 = try allocator.alloc(u8, len);
+    errdefer allocator.free(fdata.stage2);
+    for (0..len) |i| fdata.stage2[i] = try reader.readInt(u8, endian);
 
     len = try reader.readInt(u16, endian);
-    self.stage3 = try allocator.alloc(i24, len);
-    errdefer allocator.free(self.stage3);
-    for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);
+    fdata.stage3 = try allocator.alloc(i24, len);
+    errdefer allocator.free(fdata.stage3);
+    for (0..len) |i| fdata.stage3[i] = try reader.readInt(i24, endian);
 
-    self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
-    self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
+    fdata.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
+    fdata.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
     len = try reader.readInt(u16, endian);
-    self.cwcf_exceptions = try allocator.alloc(u21, len);
-    errdefer allocator.free(self.cwcf_exceptions);
-    for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
+    fdata.cwcf_exceptions = try allocator.alloc(u21, len);
+    errdefer allocator.free(fdata.cwcf_exceptions);
+    for (0..len) |i| fdata.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));
 
-    return self;
+    return fdata;
 }
 
-pub fn deinit(self: *const Self, allocator: mem.Allocator) void {
-    allocator.free(self.stage1);
-    allocator.free(self.stage2);
-    allocator.free(self.stage3);
-    allocator.free(self.cwcf_exceptions);
+pub fn deinit(fdata: *const FoldData, allocator: mem.Allocator) void {
+    allocator.free(fdata.stage1);
+    allocator.free(fdata.stage2);
+    allocator.free(fdata.stage3);
+    allocator.free(fdata.cwcf_exceptions);
 }
 
 /// Returns the case fold for `cp`.
-pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
-    if (cp >= self.cutoff) return &.{};
+pub fn caseFold(fdata: *const FoldData, cp: u21, buf: []u21) []const u21 {
+    if (cp >= fdata.cutoff) return &.{};
 
-    const stage1_val = self.stage1[cp >> 8];
+    const stage1_val = fdata.stage1[cp >> 8];
     if (stage1_val == 0) return &.{};
 
     const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
-    const stage3_index = self.stage2[stage2_index];
+    const stage3_index = fdata.stage2[stage2_index];
 
     if (stage3_index & 0x80 != 0) {
-        const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3;
-        const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0);
+        const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3;
+        const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0);
         for (mapping, 0..) |c, i| buf[i] = @intCast(c);
 
         return buf[0..mapping.len];
     }
 
-    const offset = self.stage3[stage3_index];
+    const offset = fdata.stage3[stage3_index];
     if (offset == 0) return &.{};
 
     buf[0] = @intCast(@as(i32, cp) + offset);
@@ -86,14 +86,14 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
 }
 
 /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
-pub fn changesWhenCaseFolded(self: Self, cp: u21) bool {
+pub fn changesWhenCaseFolded(fdata: *const FoldData, cp: u21) bool {
     var buf: [3]u21 = undefined;
-    const has_mapping = self.caseFold(cp, &buf).len != 0;
-    return has_mapping and !self.isCwcfException(cp);
+    const has_mapping = fdata.caseFold(cp, &buf).len != 0;
+    return has_mapping and !fdata.isCwcfException(cp);
 }
 
-fn isCwcfException(self: Self, cp: u21) bool {
-    return cp >= self.cwcf_exceptions_min and
-        cp <= self.cwcf_exceptions_max and
-        std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null;
+fn isCwcfException(fdata: *const FoldData, cp: u21) bool {
+    return cp >= fdata.cwcf_exceptions_min and
+        cp <= fdata.cwcf_exceptions_max and
+        std.mem.indexOfScalar(u21, fdata.cwcf_exceptions, cp) != null;
 }
diff --git a/src/HangulData.zig b/src/HangulData.zig
index 4bccbe6..8c5f3ad 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -1,8 +1,4 @@
-const std = @import("std");
-const builtin = @import("builtin");
-const compress = std.compress;
-const mem = std.mem;
-const testing = std.testing;
+//! Hangul Data
 
 pub const Syllable = enum {
     none,
@@ -16,9 +12,9 @@ pub const Syllable = enum {
 s1: []u16 = undefined,
 s2: []u3 = undefined,
 
-const Self = @This();
+const Hangul = @This();
 
-pub fn init(allocator: mem.Allocator) !Self {
+pub fn init(allocator: mem.Allocator) !Hangul {
     const decompressor = compress.flate.inflate.decompressor;
     const in_bytes = @embedFile("hangul");
     var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -26,27 +22,33 @@ pub fn init(allocator: mem.Allocator) !Self {
     var reader = in_decomp.reader();
 
     const endian = builtin.cpu.arch.endian();
-    var self = Self{};
+    var hangul = Hangul{};
 
     const stage_1_len: u16 = try reader.readInt(u16, endian);
-    self.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(self.s1);
-    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
+    hangul.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(hangul.s1);
+    for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian);
 
     const stage_2_len: u16 = try reader.readInt(u16, endian);
-    self.s2 = try allocator.alloc(u3, stage_2_len);
-    errdefer allocator.free(self.s2);
-    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));
+    hangul.s2 = try allocator.alloc(u3, stage_2_len);
+    errdefer allocator.free(hangul.s2);
+    for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian));
 
-    return self;
+    return hangul;
 }
 
-pub fn deinit(self: *const Self, allocator: mem.Allocator) void {
-    allocator.free(self.s1);
-    allocator.free(self.s2);
+pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void {
+    allocator.free(hangul.s1);
+    allocator.free(hangul.s2);
 }
 
 /// Returns the Hangul syllable type for `cp`.
-pub fn syllable(self: Self, cp: u21) Syllable {
-    return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]);
+pub fn syllable(hangul: *const Hangul, cp: u21) Syllable {
+    return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]);
 }
+
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+const testing = std.testing;
diff --git a/src/NormData.zig b/src/NormData.zig
deleted file mode 100644
index a123860..0000000
--- a/src/NormData.zig
+++ /dev/null
@@ -1,37 +0,0 @@
-const std = @import("std");
-const mem = std.mem;
-
-const CanonData = @import("CanonData");
-const CccData = @import("CombiningData");
-const CompatData = @import("CompatData");
-const FoldData = @import("FoldData");
-const HangulData = @import("HangulData");
-const NormPropsData = @import("NormPropsData");
-
-canon_data: CanonData = undefined,
-ccc_data: CccData = undefined,
-compat_data: CompatData = undefined,
-hangul_data: HangulData = undefined,
-normp_data: NormPropsData = undefined,
-
-const Self = @This();
-
-pub fn init(self: *Self, allocator: std.mem.Allocator) !void {
-    self.canon_data = try CanonData.init(allocator);
-    errdefer self.canon_data.deinit(allocator);
-    self.ccc_data = try CccData.init(allocator);
-    errdefer self.ccc_data.deinit(allocator);
-    self.compat_data = try CompatData.init(allocator);
-    errdefer self.compat_data.deinit(allocator);
-    self.hangul_data = try HangulData.init(allocator);
-    errdefer self.hangul_data.deinit(allocator);
-    self.normp_data = try NormPropsData.init(allocator);
-}
-
-pub fn deinit(self: *Self, allocator: mem.Allocator) void {
-    self.canon_data.deinit(allocator);
-    self.ccc_data.deinit(allocator);
-    self.compat_data.deinit(allocator);
-    self.hangul_data.deinit(allocator);
-    self.normp_data.deinit(allocator);
-}
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig
index e79ae91..ca69569 100644
--- a/src/NormPropsData.zig
+++ b/src/NormPropsData.zig
@@ -1,15 +1,11 @@
-const std = @import("std");
-const builtin = @import("builtin");
-const compress = std.compress;
-const mem = std.mem;
-const testing = std.testing;
+//! Normalization Properties Data
 
 s1: []u16 = undefined,
 s2: []u4 = undefined,
 
-const Self = @This();
+const NormProps = @This();
 
-pub fn init(allocator: mem.Allocator) !Self {
+pub fn init(allocator: mem.Allocator) !NormProps {
     const decompressor = compress.flate.inflate.decompressor;
     const in_bytes = @embedFile("normp");
     var in_fbs = std.io.fixedBufferStream(in_bytes);
@@ -17,37 +13,43 @@ pub fn init(allocator: mem.Allocator) !Self {
     var reader = in_decomp.reader();
 
     const endian = builtin.cpu.arch.endian();
-    var self = Self{};
+    var norms = NormProps{};
 
     const stage_1_len: u16 = try reader.readInt(u16, endian);
-    self.s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(self.s1);
-    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
+    norms.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(norms.s1);
+    for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian);
 
     const stage_2_len: u16 = try reader.readInt(u16, endian);
-    self.s2 = try allocator.alloc(u4, stage_2_len);
-    errdefer allocator.free(self.s2);
-    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));
+    norms.s2 = try allocator.alloc(u4, stage_2_len);
+    errdefer allocator.free(norms.s2);
+    for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian));
 
-    return self;
+    return norms;
 }
 
-pub fn deinit(self: *const Self, allocator: mem.Allocator) void {
-    allocator.free(self.s1);
-    allocator.free(self.s2);
+pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void {
+    allocator.free(norms.s1);
+    allocator.free(norms.s2);
 }
 
 /// Returns true if `cp` is already in NFD form.
-pub fn isNfd(self: Self, cp: u21) bool {
-    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0;
+pub fn isNfd(norms: *const NormProps, cp: u21) bool {
+    return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0;
 }
 
 /// Returns true if `cp` is already in NFKD form.
-pub fn isNfkd(self: Self, cp: u21) bool {
-    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0;
+pub fn isNfkd(norms: *const NormProps, cp: u21) bool {
+    return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0;
 }
 
 /// Returns true if `cp` is not allowed in any normalized form.
-pub fn isFcx(self: Self, cp: u21) bool {
-    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
+pub fn isFcx(norms: *const NormProps, cp: u21) bool {
+    return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
 }
+
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+const testing = std.testing;
diff --git a/src/Normalize.zig b/src/Normalize.zig
index b738b27..4f014cf 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -2,23 +2,41 @@
 //! Unicode Normalization. You can normalize strings into NFC,
 //! NFKC, NFD, and NFKD normalization forms.
 
-const std = @import("std");
-const debug = std.debug;
-const assert = debug.assert;
-const fmt = std.fmt;
-const heap = std.heap;
-const mem = std.mem;
-const simd = std.simd;
-const testing = std.testing;
-const unicode = std.unicode;
-
-const ascii = @import("ascii");
-const CodePointIterator = @import("code_point").Iterator;
-pub const NormData = @import("NormData");
+canon_data: CanonData = undefined,
+ccc_data: CccData = undefined,
+compat_data: CompatData = undefined,
+hangul_data: HangulData = undefined,
+normp_data: NormPropsData = undefined,
+
+const Normalize = @This();
+
+pub fn init(allocator: Allocator) !Normalize {
+    var norm: Normalize = undefined;
+    try norm.setup(allocator);
+    return norm;
+}
 
-norm_data: *const NormData,
+pub fn setup(self: *Normalize, allocator: Allocator) !void {
+    self.canon_data = try CanonData.init(allocator);
+    errdefer self.canon_data.deinit(allocator);
+    self.ccc_data = try CccData.init(allocator);
+    errdefer self.ccc_data.deinit(allocator);
+    self.compat_data = try CompatData.init(allocator);
+    errdefer self.compat_data.deinit(allocator);
+    self.hangul_data = try HangulData.init(allocator);
+    errdefer self.hangul_data.deinit(allocator);
+    self.normp_data = try NormPropsData.init(allocator);
+}
 
-const Self = @This();
+pub fn deinit(norm: *const Normalize, allocator: Allocator) void {
+    // CanonData.deinit needs a mutable receiver; deinit a local copy rather than casting away const on `norm`.
+    var canon_data = norm.canon_data;
+    canon_data.deinit(allocator);
+    norm.ccc_data.deinit(allocator);
+    norm.compat_data.deinit(allocator);
+    norm.hangul_data.deinit(allocator);
+    norm.normp_data.deinit(allocator);
+}
 
 const SBase: u21 = 0xAC00;
 const LBase: u21 = 0x1100;
@@ -30,8 +48,8 @@ const TCount: u21 = 28;
 const NCount: u21 = 588; // VCount * TCount
 const SCount: u21 = 11172; // LCount * NCount
 
-fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
-    const kind = self.norm_data.hangul_data.syllable(cp);
+fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp {
+    const kind = self.hangul_data.syllable(cp);
     if (kind != .LV and kind != .LVT) return null;
 
     const SIndex: u21 = cp - SBase;
@@ -90,21 +108,21 @@ const Decomp = struct {
 };
 
 // `mapping` retrieves the decomposition mapping for a code point as per the UCD.
-fn mapping(self: Self, cp: u21, form: Form) Decomp {
+fn mapping(self: Normalize, cp: u21, form: Form) Decomp {
     var dc = Decomp{};
 
     switch (form) {
         .nfd => {
-            dc.cps = self.norm_data.canon_data.toNfd(cp);
+            dc.cps = self.canon_data.toNfd(cp);
             if (dc.cps.len != 0) dc.form = .nfd;
         },
 
         .nfkd => {
-            dc.cps = self.norm_data.compat_data.toNfkd(cp);
+            dc.cps = self.compat_data.toNfkd(cp);
             if (dc.cps.len != 0) {
                 dc.form = .nfkd;
             } else {
-                dc.cps = self.norm_data.canon_data.toNfd(cp);
+                dc.cps = self.canon_data.toNfd(cp);
                 if (dc.cps.len != 0) dc.form = .nfkd;
             }
         },
@@ -117,7 +135,7 @@ fn mapping(self: Self, cp: u21, form: Form) Decomp {
 
 // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
 fn decompose(
-    self: Self,
+    self: Normalize,
     cp: u21,
     form: Form,
     buf: []u21,
@@ -127,8 +145,8 @@ fn decompose(
 
     // NFD / NFKD quick checks.
     switch (form) {
-        .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
-        .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
+        .nfd => if (self.normp_data.isNfd(cp)) return .{},
+        .nfkd => if (self.normp_data.isNfkd(cp)) return .{},
         else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
     }
 
@@ -175,10 +193,8 @@ fn decompose(
 
 test "decompose" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    var n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     var buf: [18]u21 = undefined;
 
@@ -228,42 +244,42 @@ pub const Result = struct {
     slice: []const u8,
 
     /// Ensures that the slice result is a copy of the input, by making a copy if it was not.
-    pub fn toOwned(result: Result, allocator: mem.Allocator) error{OutOfMemory}!Result {
+    pub fn toOwned(result: Result, allocator: Allocator) error{OutOfMemory}!Result {
         if (result.allocated) return result;
         return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) };
     }
 
-    pub fn deinit(self: *const Result, allocator: mem.Allocator) void {
+    pub fn deinit(self: *const Result, allocator: Allocator) void {
         if (self.allocated) allocator.free(self.slice);
     }
 };
 
 // Compares code points by Canonical Combining Class order.
-fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
-    return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs);
+fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool {
+    return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
 }
 
 // Applies the Canonical Sorting Algorithm.
-fn canonicalSort(self: Self, cps: []u21) void {
+fn canonicalSort(self: Normalize, cps: []u21) void {
     var i: usize = 0;
     while (i < cps.len) : (i += 1) {
         const start: usize = i;
-        while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
+        while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
         mem.sort(u21, cps[start..i], self, cccLess);
     }
 }
 
 /// Normalize `str` to NFD.
-pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
+pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
     return self.nfxd(allocator, str, .nfd);
 }
 
 /// Normalize `str` to NFKD.
-pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
+pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
     return self.nfxd(allocator, str, .nfkd);
 }
 
-pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 {
+pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 {
     var dcp_list = std.ArrayList(u21).init(allocator);
     defer dcp_list.deinit();
 
@@ -284,7 +300,7 @@ pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, for
     return try dcp_list.toOwnedSlice();
 }
 
-fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
+fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
     // Quick checks.
     if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
 
@@ -305,10 +321,8 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A
 
 test "nfd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const result = try n.nfd(allocator, "Hello World!");
     defer result.deinit(allocator);
@@ -318,10 +332,8 @@ test "nfd ASCII / no-alloc" {
 
 test "nfd !ASCII / alloc" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
     defer result.deinit(allocator);
@@ -331,10 +343,8 @@ test "nfd !ASCII / alloc" {
 
 test "nfkd ASCII / no-alloc" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const result = try n.nfkd(allocator, "Hello World!");
     defer result.deinit(allocator);
@@ -344,10 +354,8 @@ test "nfkd ASCII / no-alloc" {
 
 test "nfkd !ASCII / alloc" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
     defer result.deinit(allocator);
@@ -356,10 +364,10 @@ test "nfkd !ASCII / alloc" {
 }
 
 pub fn nfdCodePoints(
-    self: Self,
-    allocator: mem.Allocator,
+    self: Normalize,
+    allocator: Allocator,
     cps: []const u21,
-) mem.Allocator.Error![]u21 {
+) Allocator.Error![]u21 {
     var dcp_list = std.ArrayList(u21).init(allocator);
     defer dcp_list.deinit();
 
@@ -381,10 +389,10 @@ pub fn nfdCodePoints(
 }
 
 pub fn nfkdCodePoints(
-    self: Self,
-    allocator: mem.Allocator,
+    self: Normalize,
+    allocator: Allocator,
     cps: []const u21,
-) mem.Allocator.Error![]u21 {
+) Allocator.Error![]u21 {
     var dcp_list = std.ArrayList(u21).init(allocator);
     defer dcp_list.deinit();
 
@@ -407,21 +415,21 @@ pub fn nfkdCodePoints(
 
 // Composition (NFC, NFKC)
 
-fn isHangul(self: Self, cp: u21) bool {
-    return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
+fn isHangul(self: Normalize, cp: u21) bool {
+    return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none;
 }
 
 /// Normalizes `str` to NFC.
-pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
+pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
     return self.nfxc(allocator, str, .nfc);
 }
 
 /// Normalizes `str` to NFKC.
-pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
+pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result {
     return self.nfxc(allocator, str, .nfkc);
 }
 
-fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
+fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result {
     // Quick checks.
     if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
     if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
@@ -446,7 +454,7 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A
         block_check: while (i < dcps.len) : (i += 1) {
             const C = dcps[i];
             if (C == tombstone) continue :block_check;
-            const cc_C = self.norm_data.ccc_data.ccc(C);
+            const cc_C = self.ccc_data.ccc(C);
             var starter_index: ?usize = null;
             var j: usize = i;
 
@@ -456,11 +464,11 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A
                 if (dcps[j] == tombstone) continue;
 
                 // Check for starter.
-                if (self.norm_data.ccc_data.isStarter(dcps[j])) {
+                if (self.ccc_data.isStarter(dcps[j])) {
                     // Check for blocking conditions.
                     for (dcps[(j + 1)..i]) |B| {
                         if (B == tombstone) continue;
-                        const cc_B = self.norm_data.ccc_data.ccc(B);
+                        const cc_B = self.ccc_data.ccc(B);
                         if (cc_B != 0 and self.isHangul(C)) continue :block_check;
                         if (cc_B >= cc_C) continue :block_check;
                     }
@@ -484,8 +492,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A
                 // them algorithmically if possible.
                 if (self.isHangul(L) and self.isHangul(C)) {
                     // Get Hangul syllable types.
-                    const l_stype = self.norm_data.hangul_data.syllable(L);
-                    const c_stype = self.norm_data.hangul_data.syllable(C);
+                    const l_stype = self.hangul_data.syllable(L);
+                    const c_stype = self.hangul_data.syllable(C);
 
                     if (l_stype == .LV and c_stype == .T) {
                         // LV, T canonical composition.
@@ -508,13 +516,13 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A
                 if (!processed_hangul) {
                     // L, C are not Hangul, so check for primary composite
                     // in the Unicode Character Database.
-                    if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
+                    if (self.canon_data.toNfc(.{ L, C })) |P| {
                         // We have a primary composite P for L, C.
                         // We must check if P is not in the Full
                         // Composition Exclusions  (FCX) list,
                         // preventing it from appearing in any
                         // composed form (NFC, NFKC).
-                        if (!self.norm_data.normp_data.isFcx(P)) {
+                        if (!self.normp_data.isFcx(P)) {
                             dcps[sidx] = P;
                             dcps[i] = tombstone; // Mark for deletion.
                             deleted += 1;
@@ -544,10 +552,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A
 
 test "nfc" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
     defer result.deinit(allocator);
@@ -557,10 +563,8 @@ test "nfc" {
 
 test "nfkc" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
     defer result.deinit(allocator);
@@ -569,7 +573,7 @@ test "nfkc" {
 }
 
 /// Tests for equality of `a` and `b` after normalizing to NFC.
-pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
+pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool {
     const norm_result_a = try self.nfc(allocator, a);
     defer norm_result_a.deinit(allocator);
     const norm_result_b = try self.nfc(allocator, b);
@@ -580,10 +584,8 @@ pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !
 
 test "eql" {
     const allocator = testing.allocator;
-    var data: NormData = undefined;
-    try NormData.init(&data, allocator);
-    defer data.deinit(allocator);
-    const n = Self{ .norm_data = &data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
     try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
@@ -629,3 +631,24 @@ test "isLatin1Only" {
     const not_latin1_only = "Héllo, World! \u{3d3}";
     try testing.expect(!isLatin1Only(not_latin1_only));
 }
+
+const std = @import("std");
+const debug = std.debug;
+const assert = debug.assert;
+const fmt = std.fmt;
+const heap = std.heap;
+const mem = std.mem;
+const simd = std.simd;
+const testing = std.testing;
+const unicode = std.unicode;
+const Allocator = std.mem.Allocator;
+
+const ascii = @import("ascii");
+const CodePointIterator = @import("code_point").Iterator;
+
+const CanonData = @import("CanonData");
+const CccData = @import("CombiningData");
+const CompatData = @import("CompatData");
+const FoldData = @import("FoldData");
+const HangulData = @import("HangulData");
+const NormPropsData = @import("NormPropsData");
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 3cb5df5..8b9069a 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -43,9 +43,8 @@ test "Unicode normalization tests" {
     defer arena.deinit();
     var allocator = arena.allocator();
 
-    var norm_data: Normalize.NormData = undefined;
-    try Normalize.NormData.init(&norm_data, allocator);
-    const n = Normalize{ .norm_data = &norm_data };
+    const n = try Normalize.init(allocator);
+    defer n.deinit(allocator);
 
     var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
     defer file.close();
-- 
cgit v1.2.3