From 836a4b6e63ac4bd7beb406cb20edf23f0bd342a9 Mon Sep 17 00:00:00 2001
From: Jose Colon Rodriguez
Date: Mon, 26 Feb 2024 12:24:42 -0400
Subject: Using separate data struct model.

---
 src/CombiningClassData.zig |  48 ++++++
 src/DisplayWidth.zig       | 351 +++++++++++++++++++++++++++++++++++++++++++
 src/DisplayWidthData.zig   |  82 +++++++++++
 src/GraphemeData.zig       |  86 +++++++++++
 src/Normalizer.zig         |  97 ++++++------
 src/display_width.zig      | 360 ---------------------------------------------
 src/grapheme.zig           |  73 ++++-----
 src/main.zig               |  32 +++-
 8 files changed, 680 insertions(+), 449 deletions(-)
 create mode 100644 src/CombiningClassData.zig
 create mode 100644 src/DisplayWidth.zig
 create mode 100644 src/DisplayWidthData.zig
 create mode 100644 src/GraphemeData.zig
 delete mode 100644 src/display_width.zig

(limited to 'src')

diff --git a/src/CombiningClassData.zig b/src/CombiningClassData.zig
new file mode 100644
index 0000000..95c947d
--- /dev/null
+++ b/src/CombiningClassData.zig
@@ -0,0 +1,48 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+
+allocator: mem.Allocator,
+s1: []u16 = undefined,
+s2: []u8 = undefined,
+
+const Self = @This();
+
+/// Inflates the embedded `ccc` table into two heap-allocated stages; release with `deinit`.
+pub fn init(allocator: mem.Allocator) !Self {
+    var in_fbs = std.io.fixedBufferStream(@embedFile("ccc"));
+    var in_decomp = try compress.deflate.decompressor(allocator, in_fbs.reader(), null);
+    defer in_decomp.deinit();
+    var reader = in_decomp.reader();
+    const endian = builtin.cpu.arch.endian();
+
+    var self = Self{ .allocator = allocator };
+
+    const stage_1_len: u16 = try reader.readInt(u16, endian);
+    self.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(self.s1); // later reads/allocations can fail; don't leak stage 1
+    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
+
+    const stage_2_len: u16 = try reader.readInt(u16, endian);
+    self.s2 = try allocator.alloc(u8, stage_2_len);
+    errdefer allocator.free(self.s2);
+    try reader.readNoEof(self.s2); // a short read must fail instead of leaving bytes undefined
+
+    return self;
+}
+
+pub fn deinit(self: *Self) void {
+    self.allocator.free(self.s1);
+    self.allocator.free(self.s2);
+}
+
+/// Returns the canonical combining class for a code point.
+pub inline fn ccc(self: Self, cp: u21) u8 {
+    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
+}
+
+/// Reports whether `cp` has canonical combining class zero, i.e. is a starter.
+pub inline fn isStarter(self: Self, cp: u21) bool {
+    return self.ccc(cp) == 0;
+}
diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig
new file mode 100644
index 0000000..85d04a0
--- /dev/null
+++ b/src/DisplayWidth.zig
@@ -0,0 +1,351 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const ArrayList = std.ArrayList;
+const mem = std.mem;
+const simd = std.simd;
+const testing = std.testing;
+
+const ascii = @import("ascii");
+const CodePointIterator = @import("code_point").Iterator;
+const GraphemeIterator = @import("grapheme").Iterator;
+pub const Data = @import("DisplayWidthData");
+
+data: *Data,
+
+const Self = @This();
+
+/// strWidth returns the total display width of `str` as the number of cells
+/// required in a fixed-pitch font (i.e. a terminal screen).
+pub fn strWidth(self: Self, str: []const u8) usize {
+    var total: isize = 0;
+
+    // ASCII fast path
+    if (ascii.isAsciiOnly(str)) {
+        for (str) |b| total += self.data.codePointWidth(b);
+        return @intCast(@max(0, total));
+    }
+
+    var giter = GraphemeIterator.init(str, &self.data.g_data);
+
+    while (giter.next()) |gc| {
+        var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) };
+        var gc_total: isize = 0;
+
+        while (cp_iter.next()) |cp| {
+            var w = self.data.codePointWidth(cp.code);
+
+            if (w != 0) {
+                // Handle text emoji sequence.
+                if (cp_iter.next()) |ncp| {
+                    // emoji text sequence.
+                    if (ncp.code == 0xFE0E) w = 1;
+                }
+
+                // Only adding width of first non-zero-width code point.
+                if (gc_total == 0) {
+                    gc_total = w;
+                    break;
+                }
+            }
+        }
+
+        total += gc_total;
+    }
+
+    return @intCast(@max(0, total));
+}
+
+test "strWidth" {
+    var data = try Data.init(testing.allocator);
+    defer data.deinit();
+    const self = Self{ .data = &data };
+
+    try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n"));
+    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}"));
+    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}"));
+    try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊"));
+    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊"));
+    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)"));
+    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸"));
+    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji
+    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence
+    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence
+    try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace
+    try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL
+    try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than 0
+
+    // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py
+    const empty = "";
+    try testing.expectEqual(@as(usize, 0), self.strWidth(empty));
+    const with_null = "hello\x00world";
+    try testing.expectEqual(@as(usize, 10), self.strWidth(with_null));
+    const hello_jp = "コンニチハ, セカイ!";
+    try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp));
+    const control = "\x1b[0m";
+    try testing.expectEqual(@as(usize, 3), self.strWidth(control));
+    const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}";
+    try testing.expectEqual(@as(usize, 3), self.strWidth(balinese));
+
+    // These commented out tests require a new specification for complex scripts.
+    // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
+    // const jamo = "\u{1100}\u{1160}";
+    // try testing.expectEqual(@as(usize, 3), strWidth(jamo));
+    // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}";
+    // try testing.expectEqual(@as(usize, 3), strWidth(devengari));
+    // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}";
+    // try testing.expectEqual(@as(usize, 5), strWidth(tamal));
+    // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}";
+    // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1));
+    // The following passes but as a mere coincidence.
+    const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
+    try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2));
+
+    // From Rust https://github.com/jameslanska/unicode-display-width
+    try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻"));
+    try testing.expectEqual(@as(usize, 2), self.strWidth("🦀"));
+    try testing.expectEqual(@as(usize, 2), self.strWidth("👨‍👩‍👧‍👧"));
+    try testing.expectEqual(@as(usize, 2), self.strWidth("👩‍🔬"));
+    try testing.expectEqual(@as(usize, 9), self.strWidth("sane text"));
+    try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
+    try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나"));
+    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}"));
+}
+
+/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding.
+/// If the length of `str` and `total_width` have different parity, the right side of `str` will
+/// receive one additional pad. This makes sure the returned string fills the requested width.
+/// Caller must free returned bytes with `allocator`.
+pub fn center(
+    self: Self,
+    allocator: mem.Allocator,
+    str: []const u8,
+    total_width: usize,
+    pad: []const u8,
+) ![]u8 {
+    const str_width = self.strWidth(str);
+    if (str_width > total_width) return error.StrTooLong;
+    if (str_width == total_width) return try allocator.dupe(u8, str);
+
+    const pad_width = self.strWidth(pad);
+    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
+
+    const margin_width = @divFloor((total_width - str_width), 2);
+    if (pad_width > margin_width) return error.PadTooLong;
+    const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0;
+    const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad;
+
+    var result = try allocator.alloc(u8, pads * pad.len + str.len);
+    var bytes_index: usize = 0;
+    var pads_index: usize = 0;
+
+    while (pads_index < pads / 2) : (pads_index += 1) {
+        @memcpy(result[bytes_index..][0..pad.len], pad);
+        bytes_index += pad.len;
+    }
+
+    @memcpy(result[bytes_index..][0..str.len], str);
+    bytes_index += str.len;
+
+    pads_index = 0;
+    while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) {
+        @memcpy(result[bytes_index..][0..pad.len], pad);
+        bytes_index += pad.len;
+    }
+
+    return result;
+}
+
+test "center" {
+    const allocator = testing.allocator;
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    const self = Self{ .data = &data };
+
+    // Input and width both have odd length
+    var centered = try self.center(allocator, "abc", 9, "*");
+    try testing.expectEqualSlices(u8, "***abc***", centered);
+
+    // Input and width both have even length
+    testing.allocator.free(centered);
+    centered = try self.center(allocator, "w😊w", 10, "-");
+    try testing.expectEqualSlices(u8, "---w😊w---", centered);
+
+    // Input has even length, width has odd length
+    testing.allocator.free(centered);
+    centered = try self.center(allocator, "1234", 9, "-");
+    try testing.expectEqualSlices(u8, "--1234---", centered);
+
+    // Input has odd length, width has even length
+    testing.allocator.free(centered);
+    centered = try self.center(allocator, "123", 8, "-");
+    try testing.expectEqualSlices(u8, "--123---", centered);
+
+    // Input is the same length as the width
+    testing.allocator.free(centered);
+    centered = try self.center(allocator, "123", 3, "-");
+    try testing.expectEqualSlices(u8, "123", centered);
+
+    // Input is empty
+    testing.allocator.free(centered);
+    centered = try self.center(allocator, "", 3, "-");
+    try testing.expectEqualSlices(u8, "---", centered);
+
+    // Input is empty and width is zero
+    testing.allocator.free(centered);
+    centered = try self.center(allocator, "", 0, "-");
+    try testing.expectEqualSlices(u8, "", centered);
+
+    // Input is longer than the width, which is an error
+    testing.allocator.free(centered);
+    try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-"));
+}
+
+/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding
+/// on the left side. Caller must free returned bytes with `allocator`.
+pub fn padLeft(
+    self: Self,
+    allocator: mem.Allocator,
+    str: []const u8,
+    total_width: usize,
+    pad: []const u8,
+) ![]u8 {
+    const str_width = self.strWidth(str);
+    if (str_width > total_width) return error.StrTooLong;
+
+    const pad_width = self.strWidth(pad);
+    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
+
+    const margin_width = total_width - str_width;
+    if (pad_width > margin_width) return error.PadTooLong;
+
+    const pads = @divFloor(margin_width, pad_width);
+
+    var result = try allocator.alloc(u8, pads * pad.len + str.len);
+    var bytes_index: usize = 0;
+    var pads_index: usize = 0;
+
+    while (pads_index < pads) : (pads_index += 1) {
+        @memcpy(result[bytes_index..][0..pad.len], pad);
+        bytes_index += pad.len;
+    }
+
+    @memcpy(result[bytes_index..][0..str.len], str);
+
+    return result;
+}
+
+test "padLeft" {
+    const allocator = testing.allocator;
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    const self = Self{ .data = &data };
+
+    var right_aligned = try self.padLeft(allocator, "abc", 9, "*");
+    defer testing.allocator.free(right_aligned);
+    try testing.expectEqualSlices(u8, "******abc", right_aligned);
+
+    testing.allocator.free(right_aligned);
+    right_aligned = try self.padLeft(allocator, "w😊w", 10, "-");
+    try testing.expectEqualSlices(u8, "------w😊w", right_aligned);
+}
+
+/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding
+/// on the right side.  Caller must free returned bytes with `allocator`.
+pub fn padRight(
+    self: Self,
+    allocator: mem.Allocator,
+    str: []const u8,
+    total_width: usize,
+    pad: []const u8,
+) ![]u8 {
+    const str_width = self.strWidth(str);
+    if (str_width > total_width) return error.StrTooLong;
+
+    const pad_width = self.strWidth(pad);
+    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
+
+    const margin_width = total_width - str_width;
+    if (pad_width > margin_width) return error.PadTooLong;
+
+    const pads = @divFloor(margin_width, pad_width);
+
+    var result = try allocator.alloc(u8, pads * pad.len + str.len);
+    var bytes_index: usize = 0;
+    var pads_index: usize = 0;
+
+    @memcpy(result[bytes_index..][0..str.len], str);
+    bytes_index += str.len;
+
+    while (pads_index < pads) : (pads_index += 1) {
+        @memcpy(result[bytes_index..][0..pad.len], pad);
+        bytes_index += pad.len;
+    }
+
+    return result;
+}
+
+test "padRight" {
+    const allocator = testing.allocator;
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    const self = Self{ .data = &data };
+
+    var left_aligned = try self.padRight(allocator, "abc", 9, "*");
+    defer testing.allocator.free(left_aligned);
+    try testing.expectEqualSlices(u8, "abc******", left_aligned);
+
+    testing.allocator.free(left_aligned);
+    left_aligned = try self.padRight(allocator, "w😊w", 10, "-");
+    try testing.expectEqualSlices(u8, "w😊w------", left_aligned);
+}
+
+/// Wraps `str` at approximately `columns` display cells per line; a break is
+/// emitted once a line overflows or ends within `threshold` cells of the edge.
+/// Existing line breaks are not kept. Caller frees the result with `allocator`.
+pub fn wrap(
+    self: Self,
+    allocator: mem.Allocator,
+    str: []const u8,
+    columns: usize,
+    threshold: usize,
+) ![]u8 {
+    var result = ArrayList(u8).init(allocator);
+    defer result.deinit();
+
+    var line_iter = mem.tokenizeAny(u8, str, "\r\n");
+    var line_width: usize = 0;
+
+    while (line_iter.next()) |line| {
+        var word_iter = mem.tokenizeScalar(u8, line, ' ');
+
+        while (word_iter.next()) |word| {
+            try result.appendSlice(word);
+            try result.append(' ');
+            line_width += self.strWidth(word) + 1;
+            // Break when the line overflows or ends within `threshold` of the edge.
+            if (line_width > columns or columns - line_width <= threshold) {
+                try result.append('\n');
+                line_width = 0;
+            }
+        }
+    }
+
+    // Trim only separators that are actually present; popping two bytes blindly
+    // cut into the last word (or an empty buffer) when no break had been added.
+    if (mem.endsWith(u8, result.items, "\n")) _ = result.pop();
+    if (mem.endsWith(u8, result.items, " ")) _ = result.pop();
+    return try result.toOwnedSlice();
+}
+
+test "wrap" {
+    const allocator = testing.allocator;
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    const self = Self{ .data = &data };
+
+    const input = "The quick brown fox\r\njumped over the lazy dog!";
+    const got = try self.wrap(allocator, input, 10, 3);
+    defer testing.allocator.free(got);
+    const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!";
+    try testing.expectEqualStrings(want, got);
+}
diff --git a/src/DisplayWidthData.zig b/src/DisplayWidthData.zig
new file mode 100644
index 0000000..32f8658
--- /dev/null
+++ b/src/DisplayWidthData.zig
@@ -0,0 +1,82 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+const testing = std.testing;
+
+const GraphemeData = @import("GraphemeData");
+
+allocator: mem.Allocator,
+g_data: GraphemeData,
+s1: []u16 = undefined,
+s2: []i3 = undefined,
+
+const Self = @This();
+
+/// Loads grapheme data and inflates the embedded `dwp` width table; release with `deinit`.
+pub fn init(allocator: mem.Allocator) !Self {
+    var in_fbs = std.io.fixedBufferStream(@embedFile("dwp"));
+    var in_decomp = try compress.deflate.decompressor(allocator, in_fbs.reader(), null);
+    defer in_decomp.deinit();
+    var reader = in_decomp.reader();
+    const endian = builtin.cpu.arch.endian();
+    var self = Self{
+        .allocator = allocator,
+        .g_data = try GraphemeData.init(allocator),
+    };
+    errdefer self.g_data.deinit(); // everything below can fail; unwind what was acquired
+
+    const stage_1_len: u16 = try reader.readInt(u16, endian);
+    self.s1 = try allocator.alloc(u16, stage_1_len);
+    errdefer allocator.free(self.s1);
+    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
+
+    const stage_2_len: u16 = try reader.readInt(u16, endian);
+    self.s2 = try allocator.alloc(i3, stage_2_len);
+    errdefer allocator.free(self.s2);
+    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian));
+
+    return self;
+}
+
+pub fn deinit(self: *Self) void {
+    self.allocator.free(self.s1);
+    self.allocator.free(self.s2);
+    self.g_data.deinit();
+}
+
+/// codePointWidth returns the number of cells `cp` requires when rendered
+/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
+/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
+/// control codes return 0. The width is read from the two-stage lookup
+/// table (`s1` indexed by `cp >> 8`, then `s2`) loaded by `init`.
+pub inline fn codePointWidth(self: Self, cp: u21) i3 {
+    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
+}
+
+test "codePointWidth" {
+    var data = try init(testing.allocator); // the lookup is a method now, so it needs loaded tables
+    defer data.deinit();
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x0000)); // null
+    try testing.expectEqual(@as(i3, -1), data.codePointWidth(0x8)); // \b
+    try testing.expectEqual(@as(i3, -1), data.codePointWidth(0x7f)); // DEL
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x0005)); // Cf
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x0007)); // \a BEL
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000A)); // \n LF
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000B)); // \v VT
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000C)); // \f FF
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000D)); // \r CR
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000E)); // SQ
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000F)); // SI
+
+    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x070F)); // Cf
+    try testing.expectEqual(@as(i3, 1), data.codePointWidth(0x0603)); // Cf Arabic
+    try testing.expectEqual(@as(i3, 1), data.codePointWidth(0x00AD)); // soft-hyphen
+    try testing.expectEqual(@as(i3, 2), data.codePointWidth(0x2E3A)); // two-em dash
+    try testing.expectEqual(@as(i3, 3), data.codePointWidth(0x2E3B)); // three-em dash
+    try testing.expectEqual(@as(i3, 1), data.codePointWidth(0x00BD)); // ambiguous halfwidth
+
+    try testing.expectEqual(@as(i3, 1), data.codePointWidth('é'));
+    try testing.expectEqual(@as(i3, 2), data.codePointWidth('😊'));
+    try testing.expectEqual(@as(i3, 2), data.codePointWidth('统'));
+}
diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig
new file mode 100644
index 0000000..e418dea
--- /dev/null
+++ b/src/GraphemeData.zig
@@ -0,0 +1,86 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const compress = std.compress;
+const mem = std.mem;
+
+/// Indic syllable type.
+pub const Indic = enum {
+    none,
+
+    Consonant,
+    Extend,
+    Linker,
+};
+
+/// Grapheme break property.
+pub const Gbp = enum {
+    none,
+    Control,
+    CR,
+    Extend,
+    L,
+    LF,
+    LV,
+    LVT,
+    Prepend,
+    Regional_Indicator,
+    SpacingMark,
+    T,
+    V,
+    ZWJ,
+};
+
+allocator: mem.Allocator,
+s1: []u16 = undefined,
+s2: []u16 = undefined,
+s3: []u8 = undefined,
+
+const Self = @This();
+
+/// Inflates the embedded `gbp` table into three heap-allocated stages; release with `deinit`.
+pub fn init(allocator: mem.Allocator) !Self {
+    var in_fbs = std.io.fixedBufferStream(@embedFile("gbp"));
+    var in_decomp = try compress.deflate.decompressor(allocator, in_fbs.reader(), null);
+    defer in_decomp.deinit();
+    var reader = in_decomp.reader();
+    const endian = builtin.cpu.arch.endian();
+    var self = Self{ .allocator = allocator };
+
+    const s1_len: u16 = try reader.readInt(u16, endian);
+    self.s1 = try allocator.alloc(u16, s1_len);
+    errdefer allocator.free(self.s1); // later reads/allocations can fail; don't leak stage 1
+    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
+
+    const s2_len: u16 = try reader.readInt(u16, endian);
+    self.s2 = try allocator.alloc(u16, s2_len);
+    errdefer allocator.free(self.s2);
+    for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian);
+
+    const s3_len: u16 = try reader.readInt(u16, endian);
+    self.s3 = try allocator.alloc(u8, s3_len);
+    errdefer allocator.free(self.s3);
+    try reader.readNoEof(self.s3); // a short read must fail instead of leaving bytes undefined
+
+    return self;
+}
+
+pub fn deinit(self: *Self) void {
+    self.allocator.free(self.s1);
+    self.allocator.free(self.s2);
+    self.allocator.free(self.s3);
+}
+
+/// Lookup the grapheme break property for a code point (upper four bits of its `s3` entry).
+pub inline fn gbp(self: Self, cp: u21) Gbp {
+    return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4);
+}
+
+/// Lookup the indic syllable type for a code point (bits 1 through 3 of its `s3` entry).
+pub inline fn indic(self: Self, cp: u21) Indic {
+    return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7);
+}
+
+/// True if the emoji flag (lowest bit of its `s3` entry) is set for a code point.
+pub inline fn isEmoji(self: Self, cp: u21) bool {
+    return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
+}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 1b4a2d5..6a19f47 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -8,16 +8,18 @@ const CodePointIterator = @import("code_point").Iterator;
 const case_fold_map = @import("ziglyph").case_folding;
 const hangul_map = @import("ziglyph").hangul;
 const norm_props = @import("ziglyph").normalization_props;
-const normp = @import("normp");
-
-const Self = @This();
+pub const Data = @import("CombiningClassData");
 
+ccc_data: *Data,
 nfc_map: std.AutoHashMap([2]u21, u21),
 nfd_map: std.AutoHashMap(u21, [2]u21),
 nfkd_map: std.AutoHashMap(u21, [18]u21),
 
-pub fn init(allocator: std.mem.Allocator) !Self {
+const Self = @This();
+
+pub fn init(allocator: std.mem.Allocator, data: *Data) !Self {
     var self = Self{
+        .ccc_data = data,
         .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator),
         .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator),
         .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
@@ -95,7 +97,9 @@ pub fn deinit(self: *Self) void {
 }
 
 test "init / deinit" {
-    var n = try init(std.testing.allocator);
+    var data = try Data.init(std.testing.allocator);
+    defer data.deinit();
+    var n = try init(std.testing.allocator, &data);
     defer n.deinit();
 }
 
@@ -241,7 +245,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
 
 test "decompose" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var dc = n.decompose('é', .nfd);
@@ -307,19 +313,17 @@ pub const Result = struct {
 };
 
 // Compares code points by Canonical Combining Class order.
-fn cccLess(_: void, lhs: u21, rhs: u21) bool {
-    const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)];
-    const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)];
-    return lcc < rcc;
+fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
+    return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
 }
 
 // Applies the Canonical Sorting Algorithm.
-fn canonicalSort(cps: []u21) void {
+fn canonicalSort(self: Self, cps: []u21) void {
     var i: usize = 0;
     while (i < cps.len) : (i += 1) {
         const start: usize = i;
-        while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {}
-        std.mem.sort(u21, cps[start..i], {}, cccLess);
+        while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
+        std.mem.sort(u21, cps[start..i], self, cccLess);
     }
 }
 
@@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
         try dcp_list.appendSlice(slice);
     }
 
-    canonicalSort(dcp_list.items);
+    self.canonicalSort(dcp_list.items);
 
     var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4);
     defer dstr_list.deinit();
@@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
 
 test "nfd ASCII / no-alloc" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var result = try n.nfd(allocator, "Hello World!");
@@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" {
 
 test "nfd !ASCII / alloc" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" {
 
 test "nfkd ASCII / no-alloc" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var result = try n.nfkd(allocator, "Hello World!");
@@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" {
 
 test "nfkd !ASCII / alloc" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool {
     return cp >= 0x1100 and hangul_map.syllableType(cp) != null;
 }
 
-fn isStarter(cp: u21) bool {
-    return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0;
-}
-
-fn isCombining(cp: u21) bool {
-    return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0;
-}
-
-fn isNonHangulStarter(cp: u21) bool {
-    return !isHangul(cp) and isStarter(cp);
+fn isNonHangulStarter(self: Self, cp: u21) bool {
+    return !isHangul(cp) and self.ccc_data.isStarter(cp);
 }
 
 /// Normalizes `str` to NFC.
@@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
 
         block_check: while (i < d_list.items.len) : (i += 1) {
             const C = d_list.items[i];
-            const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)];
+            const cc_C = self.ccc_data.ccc(C);
             var starter_index: ?usize = null;
             var j: usize = i;
 
@@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
                 j -= 1;
 
                 // Check for starter.
-                if (isStarter(d_list.items[j])) {
+                if (self.ccc_data.isStarter(d_list.items[j])) {
                     if (i - j > 1) { // If there's distance between the starting point and the current position.
                         for (d_list.items[(j + 1)..i]) |B| {
+                            const cc_B = self.ccc_data.ccc(B);
                             // Check for blocking conditions.
                             if (isHangul(C)) {
-                                if (isCombining(B) or isNonHangulStarter(B)) continue :block_check;
+                                if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check;
                             }
-                            const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)];
                             if (cc_B >= cc_C) continue :block_check;
                         }
                     }
@@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
 
 test "nfc" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -571,7 +577,9 @@ test "nfc" {
 
 test "nfkc" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u
 
 test "eql" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [
 
 test "eqlCaseless" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}"));
@@ -707,7 +719,7 @@ test "eqlCaseless" {
 // FCD
 fn getLeadCcc(self: Self, cp: u21) u8 {
     const dc = self.mapping(cp, .nfd);
-    return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)];
+    return self.ccc_data.ccc(dc.cps[0]);
 }
 
 fn getTrailCcc(self: Self, cp: u21) u8 {
@@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 {
     const len = for (dc.cps, 0..) |dcp, i| {
         if (dcp == 0) break i;
     } else dc.cps.len;
-    const tcp = dc.cps[len -| 1];
-    return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)];
+    return self.ccc_data.ccc(dc.cps[len -| 1]); // saturate: `len` is 0 when `cps[0]` is 0
 }
 
 /// Fast check to detect if a string is already in NFC or NFD form.
@@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool {
 
 test "isFcd" {
     const allocator = std.testing.allocator;
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     const is_nfc = "José \u{3D3}";
@@ -751,7 +764,9 @@ test "Unicode normalization tests" {
     defer arena.deinit();
     var allocator = arena.allocator();
 
-    var n = try init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+    var n = try init(allocator, &data);
     defer n.deinit();
 
     var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
diff --git a/src/display_width.zig b/src/display_width.zig
deleted file mode 100644
index a916cac..0000000
--- a/src/display_width.zig
+++ /dev/null
@@ -1,360 +0,0 @@
-const std = @import("std");
-const simd = std.simd;
-const mem = std.mem;
-const testing = std.testing;
-
-const ascii = @import("ascii");
-const CodePointIterator = @import("code_point").Iterator;
-const dwp = @import("dwp");
-const GraphemeIterator = @import("grapheme").Iterator;
-
-/// codePointWidth returns the number of cells `cp` requires when rendered
-/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
-/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
-/// control codes return 0. If `cjk` is true, ambiguous code points return 2,
-/// otherwise they return 1.
-pub fn codePointWidth(cp: u21) i3 {
-    return dwp.stage_2[dwp.stage_1[cp >> 8] + (cp & 0xff)];
-}
-
-test "codePointWidth" {
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null
-    try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b
-    try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI
-
-    try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf
-    try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic
-
-    try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen
-    try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash
-    try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash
-
-    try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth
-
-    try testing.expectEqual(@as(i3, 1), codePointWidth('é'));
-    try testing.expectEqual(@as(i3, 2), codePointWidth('😊'));
-    try testing.expectEqual(@as(i3, 2), codePointWidth('统'));
-}
-
-/// strWidth returns the total display width of `str` as the number of cells
-/// required in a fixed-pitch font (i.e. a terminal screen).
-pub fn strWidth(str: []const u8) usize {
-    var total: isize = 0;
-
-    // ASCII fast path
-    if (ascii.isAsciiOnly(str)) {
-        for (str) |b| total += codePointWidth(b);
-        return @intCast(@max(0, total));
-    }
-
-    var giter = GraphemeIterator.init(str);
-
-    while (giter.next()) |gc| {
-        var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) };
-        var gc_total: isize = 0;
-
-        while (cp_iter.next()) |cp| {
-            var w = codePointWidth(cp.code);
-
-            if (w != 0) {
-                // Handle text emoji sequence.
-                if (cp_iter.next()) |ncp| {
-                    // emoji text sequence.
-                    if (ncp.code == 0xFE0E) w = 1;
-                }
-
-                // Only adding width of first non-zero-width code point.
-                if (gc_total == 0) {
-                    gc_total = w;
-                    break;
-                }
-            }
-        }
-
-        total += gc_total;
-    }
-
-    return @intCast(@max(0, total));
-}
-
-test "strWidth" {
-    try testing.expectEqual(@as(usize, 5), strWidth("Hello\r\n"));
-    try testing.expectEqual(@as(usize, 1), strWidth("\u{0065}\u{0301}"));
-    try testing.expectEqual(@as(usize, 2), strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}"));
-    try testing.expectEqual(@as(usize, 8), strWidth("Hello 😊"));
-    try testing.expectEqual(@as(usize, 8), strWidth("Héllo 😊"));
-    try testing.expectEqual(@as(usize, 8), strWidth("Héllo :)"));
-    try testing.expectEqual(@as(usize, 8), strWidth("Héllo 🇪🇸"));
-    try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}")); // Lone emoji
-    try testing.expectEqual(@as(usize, 1), strWidth("\u{26A1}\u{FE0E}")); // Text sequence
-    try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence
-    try testing.expectEqual(@as(usize, 0), strWidth("A\x08")); // Backspace
-    try testing.expectEqual(@as(usize, 0), strWidth("\x7FA")); // DEL
-    try testing.expectEqual(@as(usize, 0), strWidth("\x7FA\x08\x08")); // never less than o
-
-    // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py
-    const empty = "";
-    try testing.expectEqual(@as(usize, 0), strWidth(empty));
-    const with_null = "hello\x00world";
-    try testing.expectEqual(@as(usize, 10), strWidth(with_null));
-    const hello_jp = "コンニチハ, セカイ!";
-    try testing.expectEqual(@as(usize, 19), strWidth(hello_jp));
-    const control = "\x1b[0m";
-    try testing.expectEqual(@as(usize, 3), strWidth(control));
-    const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}";
-    try testing.expectEqual(@as(usize, 3), strWidth(balinese));
-
-    // These commented out tests require a new specification for complex scripts.
-    // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
-    // const jamo = "\u{1100}\u{1160}";
-    // try testing.expectEqual(@as(usize, 3), strWidth(jamo));
-    // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}";
-    // try testing.expectEqual(@as(usize, 3), strWidth(devengari));
-    // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}";
-    // try testing.expectEqual(@as(usize, 5), strWidth(tamal));
-    // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}";
-    // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1));
-    // The following passes but as a mere coincidence.
-    const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
-    try testing.expectEqual(@as(usize, 2), strWidth(kannada_2));
-
-    // From Rust https://github.com/jameslanska/unicode-display-width
-    try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻"));
-    try testing.expectEqual(@as(usize, 2), strWidth("🦀"));
-    try testing.expectEqual(@as(usize, 2), strWidth("👨‍👩‍👧‍👧"));
-    try testing.expectEqual(@as(usize, 2), strWidth("👩‍🔬"));
-    try testing.expectEqual(@as(usize, 9), strWidth("sane text"));
-    try testing.expectEqual(@as(usize, 9), strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
-    try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나"));
-    try testing.expectEqual(@as(usize, 1), strWidth("\u{378}"));
-}
-
-/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding.
-/// If the length of `str` and `total_width` have different parity, the right side of `str` will
-/// receive one additional pad. This makes sure the returned string fills the requested width.
-/// Caller must free returned bytes with `allocator`.
-pub fn center(
-    allocator: mem.Allocator,
-    str: []const u8,
-    total_width: usize,
-    pad: []const u8,
-) ![]u8 {
-    const str_width = strWidth(str);
-    if (str_width > total_width) return error.StrTooLong;
-    if (str_width == total_width) return try allocator.dupe(u8, str);
-
-    const pad_width = strWidth(pad);
-    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
-
-    const margin_width = @divFloor((total_width - str_width), 2);
-    if (pad_width > margin_width) return error.PadTooLong;
-    const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0;
-    const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad;
-
-    var result = try allocator.alloc(u8, pads * pad.len + str.len);
-    var bytes_index: usize = 0;
-    var pads_index: usize = 0;
-
-    while (pads_index < pads / 2) : (pads_index += 1) {
-        @memcpy(result[bytes_index..][0..pad.len], pad);
-        bytes_index += pad.len;
-    }
-
-    @memcpy(result[bytes_index..][0..str.len], str);
-    bytes_index += str.len;
-
-    pads_index = 0;
-    while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) {
-        @memcpy(result[bytes_index..][0..pad.len], pad);
-        bytes_index += pad.len;
-    }
-
-    return result;
-}
-
-test "center" {
-    var allocator = std.testing.allocator;
-
-    // Input and width both have odd length
-    var centered = try center(allocator, "abc", 9, "*");
-    try testing.expectEqualSlices(u8, "***abc***", centered);
-
-    // Input and width both have even length
-    allocator.free(centered);
-    centered = try center(allocator, "w😊w", 10, "-");
-    try testing.expectEqualSlices(u8, "---w😊w---", centered);
-
-    // Input has even length, width has odd length
-    allocator.free(centered);
-    centered = try center(allocator, "1234", 9, "-");
-    try testing.expectEqualSlices(u8, "--1234---", centered);
-
-    // Input has odd length, width has even length
-    allocator.free(centered);
-    centered = try center(allocator, "123", 8, "-");
-    try testing.expectEqualSlices(u8, "--123---", centered);
-
-    // Input is the same length as the width
-    allocator.free(centered);
-    centered = try center(allocator, "123", 3, "-");
-    try testing.expectEqualSlices(u8, "123", centered);
-
-    // Input is empty
-    allocator.free(centered);
-    centered = try center(allocator, "", 3, "-");
-    try testing.expectEqualSlices(u8, "---", centered);
-
-    // Input is empty and width is zero
-    allocator.free(centered);
-    centered = try center(allocator, "", 0, "-");
-    try testing.expectEqualSlices(u8, "", centered);
-
-    // Input is longer than the width, which is an error
-    allocator.free(centered);
-    try testing.expectError(error.StrTooLong, center(allocator, "123", 2, "-"));
-}
-
-/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding
-/// on the left side. Caller must free returned bytes with `allocator`.
-pub fn padLeft(
-    allocator: std.mem.Allocator,
-    str: []const u8,
-    total_width: usize,
-    pad: []const u8,
-) ![]u8 {
-    const str_width = strWidth(str);
-    if (str_width > total_width) return error.StrTooLong;
-
-    const pad_width = strWidth(pad);
-    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
-
-    const margin_width = total_width - str_width;
-    if (pad_width > margin_width) return error.PadTooLong;
-
-    const pads = @divFloor(margin_width, pad_width);
-
-    var result = try allocator.alloc(u8, pads * pad.len + str.len);
-    var bytes_index: usize = 0;
-    var pads_index: usize = 0;
-
-    while (pads_index < pads) : (pads_index += 1) {
-        @memcpy(result[bytes_index..][0..pad.len], pad);
-        bytes_index += pad.len;
-    }
-
-    @memcpy(result[bytes_index..][0..str.len], str);
-
-    return result;
-}
-
-test "padLeft" {
-    var allocator = std.testing.allocator;
-
-    var right_aligned = try padLeft(allocator, "abc", 9, "*");
-    defer allocator.free(right_aligned);
-    try testing.expectEqualSlices(u8, "******abc", right_aligned);
-
-    allocator.free(right_aligned);
-    right_aligned = try padLeft(allocator, "w😊w", 10, "-");
-    try testing.expectEqualSlices(u8, "------w😊w", right_aligned);
-}
-
-/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding
-/// on the right side.  Caller must free returned bytes with `allocator`.
-pub fn padRight(
-    allocator: std.mem.Allocator,
-    str: []const u8,
-    total_width: usize,
-    pad: []const u8,
-) ![]u8 {
-    const str_width = strWidth(str);
-    if (str_width > total_width) return error.StrTooLong;
-
-    const pad_width = strWidth(pad);
-    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
-
-    const margin_width = total_width - str_width;
-    if (pad_width > margin_width) return error.PadTooLong;
-
-    const pads = @divFloor(margin_width, pad_width);
-
-    var result = try allocator.alloc(u8, pads * pad.len + str.len);
-    var bytes_index: usize = 0;
-    var pads_index: usize = 0;
-
-    @memcpy(result[bytes_index..][0..str.len], str);
-    bytes_index += str.len;
-
-    while (pads_index < pads) : (pads_index += 1) {
-        @memcpy(result[bytes_index..][0..pad.len], pad);
-        bytes_index += pad.len;
-    }
-
-    return result;
-}
-
-test "padRight" {
-    var allocator = std.testing.allocator;
-
-    var left_aligned = try padRight(allocator, "abc", 9, "*");
-    defer allocator.free(left_aligned);
-    try testing.expectEqualSlices(u8, "abc******", left_aligned);
-
-    allocator.free(left_aligned);
-    left_aligned = try padRight(allocator, "w😊w", 10, "-");
-    try testing.expectEqualSlices(u8, "w😊w------", left_aligned);
-}
-
-/// Wraps a string approximately at the given number of colums per line.
-/// `threshold` defines how far the last column of the last word can be
-/// from the edge. Caller must free returned bytes with `allocator`.
-pub fn wrap(
-    allocator: std.mem.Allocator,
-    str: []const u8,
-    columns: usize,
-    threshold: usize,
-) ![]u8 {
-    var result = std.ArrayList(u8).init(allocator);
-    defer result.deinit();
-
-    var line_iter = mem.tokenizeAny(u8, str, "\r\n");
-    var line_width: usize = 0;
-
-    while (line_iter.next()) |line| {
-        var word_iter = mem.tokenizeScalar(u8, line, ' ');
-
-        while (word_iter.next()) |word| {
-            try result.appendSlice(word);
-            try result.append(' ');
-            line_width += strWidth(word) + 1;
-
-            if (line_width > columns or columns - line_width <= threshold) {
-                try result.append('\n');
-                line_width = 0;
-            }
-        }
-    }
-
-    // Remove trailing space and newline.
-    _ = result.pop();
-    _ = result.pop();
-
-    return try result.toOwnedSlice();
-}
-
-test "wrap" {
-    var allocator = std.testing.allocator;
-    const input = "The quick brown fox\r\njumped over the lazy dog!";
-    const got = try wrap(allocator, input, 10, 3);
-    defer allocator.free(got);
-    const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!";
-    try testing.expectEqualStrings(want, got);
-}
diff --git a/src/grapheme.zig b/src/grapheme.zig
index 3fdf10b..7125b5b 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -1,9 +1,10 @@
 const std = @import("std");
+const mem = std.mem;
 const unicode = std.unicode;
 
 const CodePoint = @import("code_point").CodePoint;
 const CodePointIterator = @import("code_point").Iterator;
-const gbp = @import("gbp");
+pub const Data = @import("GraphemeData");
 
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
@@ -21,12 +22,13 @@ pub const Grapheme = struct {
 pub const Iterator = struct {
     buf: [2]?CodePoint = .{ null, null },
     cp_iter: CodePointIterator,
+    data: *Data,
 
     const Self = @This();
 
     /// Assumes `src` is valid UTF-8.
-    pub fn init(str: []const u8) Self {
-        var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } };
+    pub fn init(str: []const u8, data: *Data) Self {
+        var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
         self.advance();
         return self;
     }
@@ -55,6 +57,7 @@ pub const Iterator = struct {
         if (graphemeBreak(
             self.buf[0].?.code,
             self.buf[1].?.code,
+            self.data,
             &state,
         )) return Grapheme{ .len = gc_len, .offset = gc_start };
 
@@ -67,6 +70,7 @@ pub const Iterator = struct {
             if (graphemeBreak(
                 self.buf[0].?.code,
                 if (self.buf[1]) |ncp| ncp.code else 0,
+                self.data,
                 &state,
             )) break;
         }
@@ -76,18 +80,12 @@ pub const Iterator = struct {
 };
 
 // Predicates
-fn isBreaker(cp: u21) bool {
+fn isBreaker(cp: u21, data: *Data) bool {
     // Extract relevant properties.
-    const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
-    const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4);
+    const cp_gbp_prop = data.gbp(cp);
     return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
 }
 
-fn isIgnorable(cp: u21) bool {
-    const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
-    return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
-}
-
 // Grapheme break state.
 const State = struct {
     bits: u3 = 0,
@@ -135,18 +133,17 @@ const State = struct {
 pub fn graphemeBreak(
     cp1: u21,
     cp2: u21,
+    data: *Data,
     state: *State,
 ) bool {
     // Extract relevant properties.
-    const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
-    const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4);
-    const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7);
-    const cp1_is_emoji = cp1_props_byte & 1 == 1;
+    const cp1_gbp_prop = data.gbp(cp1);
+    const cp1_indic_prop = data.indic(cp1);
+    const cp1_is_emoji = data.isEmoji(cp1);
 
-    const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]];
-    const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4);
-    const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7);
-    const cp2_is_emoji = cp2_props_byte & 1 == 1;
+    const cp2_gbp_prop = data.gbp(cp2);
+    const cp2_indic_prop = data.indic(cp2);
+    const cp2_is_emoji = data.isEmoji(cp2);
 
     // GB11: Emoji Extend* ZWJ x Emoji
     if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
@@ -157,7 +154,7 @@ pub fn graphemeBreak(
     if (cp1 == '\r' and cp2 == '\n') return false;
 
     // GB4: Control
-    if (isBreaker(cp1)) return true;
+    if (isBreaker(cp1, data)) return true;
 
     // GB11: Emoji Extend* ZWJ x Emoji
     if (state.hasXpic() and
@@ -175,7 +172,7 @@ pub fn graphemeBreak(
     if (cp2_gbp_prop == .SpacingMark) return false;
 
     // GB9b: Prepend x
-    if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false;
+    if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false;
 
     // GB12, GB13: RI x RI
     if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
@@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" {
     var buf_reader = std.io.bufferedReader(file.reader());
     var input_stream = buf_reader.reader();
 
+    var data = try Data.init(allocator);
+    defer data.deinit();
+
     var buf: [4096]u8 = undefined;
     var line_no: usize = 1;
 
@@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" {
         }
 
         // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
-        var iter = Iterator.init(all_bytes.items);
+        var iter = Iterator.init(all_bytes.items, &data);
 
         // Chaeck.
         for (want.items) |want_gc| {
@@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" {
     }
 }
 
-test "Segmentation comptime GraphemeIterator" {
-    const want = [_][]const u8{ "H", "é", "l", "l", "o" };
-
-    comptime {
-        const src = "Héllo";
-        var ct_iter = Iterator.init(src);
-        var i = 0;
-        while (ct_iter.next()) |grapheme| : (i += 1) {
-            try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
-        }
-    }
-}
-
 test "Segmentation ZWJ and ZWSP emoji sequences" {
     const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
     const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
     const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
     const no_joiner = seq_1 ++ seq_2;
 
-    var ct_iter = Iterator.init(with_zwj);
+    var data = try Data.init(std.testing.allocator);
+    defer data.deinit();
+
+    var iter = Iterator.init(with_zwj, &data);
+
     var i: usize = 0;
-    while (ct_iter.next()) |_| : (i += 1) {}
+    while (iter.next()) |_| : (i += 1) {}
     try std.testing.expectEqual(@as(usize, 1), i);
 
-    ct_iter = Iterator.init(with_zwsp);
+    iter = Iterator.init(with_zwsp, &data);
     i = 0;
-    while (ct_iter.next()) |_| : (i += 1) {}
+    while (iter.next()) |_| : (i += 1) {}
     try std.testing.expectEqual(@as(usize, 3), i);
 
-    ct_iter = Iterator.init(no_joiner);
+    iter = Iterator.init(no_joiner, &data);
     i = 0;
-    while (ct_iter.next()) |_| : (i += 1) {}
+    while (iter.next()) |_| : (i += 1) {}
     try std.testing.expectEqual(@as(usize, 2), i);
 }
diff --git a/src/main.zig b/src/main.zig
index 946ae01..57db05b 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,29 +1,47 @@
 const std = @import("std");
 
 // const GraphemeIterator = @import("ziglyph").GraphemeIterator;
-// const GraphemeIterator = @import("Grapheme").GraphemeIterator;
+// const Data = @import("grapheme").Data;
+// const GraphemeIterator = @import("grapheme").Iterator;
+
 // const codePointWidth = @import("ziglyph").display_width.codePointWidth;
-// const codePointWidth = @import("display_width").codePointWidth;
 // const strWidth = @import("ziglyph").display_width.strWidth;
+// const Data = @import("display_width").Data;
+// const codePointWidth = @import("display_width").codePointWidth;
 // const strWidth = @import("display_width").strWidth;
-// const CodePointIterator = @import("CodePoint").CodePointIterator;
+
+// const CodePointIterator = @import("ziglyph").CodePointIterator;
+// const CodePointIterator = @import("code_point").Iterator;
+
 // const ascii = @import("ascii");
 // const ascii = std.ascii;
+
 // const norm = @import("ziglyph").Normalizer;
+const Data = @import("Normalizer").Data;
 const norm = @import("Normalizer");
 
 pub fn main() !void {
+    var args_iter = std.process.args();
+    _ = args_iter.skip();
+    const in_path = args_iter.next() orelse return error.MissingArg;
+
     var gpa = std.heap.GeneralPurposeAllocator(.{}){};
     defer _ = gpa.deinit();
     const allocator = gpa.allocator();
 
-    const input = try std.fs.cwd().readFileAlloc(allocator, "data/lang_mix.txt", std.math.maxInt(u32));
+    const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32));
     defer allocator.free(input);
 
-    var n = try norm.init(allocator);
+    var data = try Data.init(allocator);
+    defer data.deinit();
+
+    var n = try norm.init(allocator, &data);
     defer n.deinit();
+    // var n = try norm.init(allocator);
+    // defer n.deinit();
 
-    // var iter = GraphemeIterator.init(input);
+    // var iter = GraphemeIterator.init(input, &data);
+    // defer iter.deinit();
     // var iter = CodePointIterator{ .bytes = input };
     var iter = std.mem.splitScalar(u8, input, '\n');
 
@@ -33,7 +51,7 @@ pub fn main() !void {
 
     // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
     // while (iter.next()) |_| result += 1;
-    // while (iter.next()) |line| result += strWidth(line);
+    // while (iter.next()) |line| result += strWidth(line, &data);
     while (iter.next()) |line| {
         var nfc = try n.nfc(allocator, line);
         result += nfc.slice.len;
-- 
cgit v1.2.3