From 1be5e46490e061761b4b97dff5c6acb2181d6fe9 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 30 Apr 2025 11:58:19 -0400 Subject: Factor out 'Data' for grapheme and DisplayWidth In the process of refactoring the whole library, so that it doesn't expose anything called "Data" separately from user functionality. --- src/DisplayWidth.zig | 240 ++++++++++++++++++++++++++++++++++---------------- src/GraphemeData.zig | 12 +-- src/Normalize.zig | 29 +++--- src/WidthData.zig | 32 +++++-- src/grapheme.zig | 109 ++++++++++++++++++++--- src/unicode_tests.zig | 10 +-- 6 files changed, 313 insertions(+), 119 deletions(-) (limited to 'src') diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig index 8631bd4..11ec59e 100644 --- a/src/DisplayWidth.zig +++ b/src/DisplayWidth.zig @@ -2,38 +2,131 @@ const std = @import("std"); const builtin = @import("builtin"); const options = @import("options"); const ArrayList = std.ArrayList; +const compress = std.compress; const mem = std.mem; const simd = std.simd; const testing = std.testing; const ascii = @import("ascii"); const CodePointIterator = @import("code_point").Iterator; -const GraphemeIterator = @import("grapheme").Iterator; pub const DisplayWidthData = @import("DisplayWidthData"); -data: *const DisplayWidthData, +const Graphemes = @import("Graphemes"); -const Self = @This(); +g_data: Graphemes, +s1: []u16 = undefined, +s2: []i4 = undefined, +owns_gdata: bool, + +const DisplayWidth = @This(); + +pub fn init(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { + var dw: DisplayWidth = try DisplayWidth.setup(allocator); + errdefer { + allocator.free(dw.s1); + allocator.free(dw.s2); + } + dw.owns_gdata = true; + dw.g_data = try Graphemes.init(allocator); + errdefer dw.g_data.deinit(allocator); + return dw; +} + +pub fn initWithGraphemeData(allocator: mem.Allocator, g_data: Graphemes) mem.Allocator.Error!DisplayWidth { + var dw = try DisplayWidth.setup(allocator); + dw.g_data = g_data; + dw.owns_gdata = false; + return 
dw; +} + +// Sets up the DisplayWidthData, leaving the GraphemeData undefined. +fn setup(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("dwp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var dw: DisplayWidth = undefined; + + const stage_1_len: u16 = reader.readInt(u16, endian) catch unreachable; + dw.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(dw.s1); + for (0..stage_1_len) |i| dw.s1[i] = reader.readInt(u16, endian) catch unreachable; + + const stage_2_len: u16 = reader.readInt(u16, endian) catch unreachable; + dw.s2 = try allocator.alloc(i4, stage_2_len); + errdefer allocator.free(dw.s2); + for (0..stage_2_len) |i| dw.s2[i] = @intCast(reader.readInt(i8, endian) catch unreachable); + + return dw; +} + +pub fn deinit(dw: *const DisplayWidth, allocator: mem.Allocator) void { + allocator.free(dw.s1); + allocator.free(dw.s2); + if (dw.owns_gdata) dw.g_data.deinit(allocator); +} + +/// codePointWidth returns the number of cells `cp` requires when rendered +/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to +/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 +/// control codes return 0. If `cjk` is true, ambiguous code points return 2, +/// otherwise they return 1. 
+pub fn codePointWidth(dw: DisplayWidth, cp: u21) i4 { + return dw.s2[dw.s1[cp >> 8] + (cp & 0xff)]; +} + +test "codePointWidth" { + const dw = try DisplayWidth.init(std.testing.allocator); + defer dw.deinit(std.testing.allocator); + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x0000)); // null + try testing.expectEqual(@as(i4, -1), dw.codePointWidth(0x8)); // \b + try testing.expectEqual(@as(i4, -1), dw.codePointWidth(0x7f)); // DEL + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x0005)); // Cf + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x0007)); // \a BEL + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x000A)); // \n LF + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x000B)); // \v VT + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x000C)); // \f FF + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x000D)); // \r CR + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x000E)); // SQ + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x000F)); // SI + + try testing.expectEqual(@as(i4, 0), dw.codePointWidth(0x070F)); // Cf + try testing.expectEqual(@as(i4, 1), dw.codePointWidth(0x0603)); // Cf Arabic + + try testing.expectEqual(@as(i4, 1), dw.codePointWidth(0x00AD)); // soft-hyphen + try testing.expectEqual(@as(i4, 2), dw.codePointWidth(0x2E3A)); // two-em dash + try testing.expectEqual(@as(i4, 3), dw.codePointWidth(0x2E3B)); // three-em dash + + try testing.expectEqual(@as(i4, 1), dw.codePointWidth(0x00BD)); // ambiguous halfwidth + + try testing.expectEqual(@as(i4, 1), dw.codePointWidth('é')); + try testing.expectEqual(@as(i4, 2), dw.codePointWidth('😊')); + try testing.expectEqual(@as(i4, 2), dw.codePointWidth('统')); +} /// strWidth returns the total display width of `str` as the number of cells /// required in a fixed-pitch font (i.e. a terminal screen). 
-pub fn strWidth(self: Self, str: []const u8) usize { +pub fn strWidth(dw: DisplayWidth, str: []const u8) usize { var total: isize = 0; // ASCII fast path if (ascii.isAsciiOnly(str)) { - for (str) |b| total += self.data.codePointWidth(b); + for (str) |b| total += dw.codePointWidth(b); return @intCast(@max(0, total)); } - var giter = GraphemeIterator.init(str, &self.data.g_data); + var giter = dw.g_data.iterator(str); while (giter.next()) |gc| { var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; var gc_total: isize = 0; while (cp_iter.next()) |cp| { - var w = self.data.codePointWidth(cp.code); + var w = dw.codePointWidth(cp.code); if (w != 0) { // Handle text emoji sequence. @@ -58,41 +151,40 @@ pub fn strWidth(self: Self, str: []const u8) usize { } test "strWidth" { - const data = try DisplayWidthData.init(testing.allocator); - defer data.deinit(testing.allocator); - const self = Self{ .data = &data }; + const dw = try DisplayWidth.init(testing.allocator); + defer dw.deinit(testing.allocator); const c0 = options.c0_width orelse 0; - try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n")); - try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}")); - try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); - try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊")); - try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊")); - try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)")); - try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸")); - try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji - try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence - try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence - try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}")); // Default text presentation - try 
testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}\u{FE0E}")); // Default text presentation with VS15 selector - try testing.expectEqual(@as(usize, 2), self.strWidth("\u{2764}\u{FE0F}")); // Default text presentation with VS16 selector + try testing.expectEqual(@as(usize, 5), dw.strWidth("Hello\r\n")); + try testing.expectEqual(@as(usize, 1), dw.strWidth("\u{0065}\u{0301}")); + try testing.expectEqual(@as(usize, 2), dw.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); + try testing.expectEqual(@as(usize, 8), dw.strWidth("Hello 😊")); + try testing.expectEqual(@as(usize, 8), dw.strWidth("Héllo 😊")); + try testing.expectEqual(@as(usize, 8), dw.strWidth("Héllo :)")); + try testing.expectEqual(@as(usize, 8), dw.strWidth("Héllo 🇪🇸")); + try testing.expectEqual(@as(usize, 2), dw.strWidth("\u{26A1}")); // Lone emoji + try testing.expectEqual(@as(usize, 1), dw.strWidth("\u{26A1}\u{FE0E}")); // Text sequence + try testing.expectEqual(@as(usize, 2), dw.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence + try testing.expectEqual(@as(usize, 1), dw.strWidth("\u{2764}")); // Default text presentation + try testing.expectEqual(@as(usize, 1), dw.strWidth("\u{2764}\u{FE0E}")); // Default text presentation with VS15 selector + try testing.expectEqual(@as(usize, 2), dw.strWidth("\u{2764}\u{FE0F}")); // Default text presentation with VS16 selector const expect_bs: usize = if (c0 == 0) 0 else 1 + c0; - try testing.expectEqual(expect_bs, self.strWidth("A\x08")); // Backspace - try testing.expectEqual(expect_bs, self.strWidth("\x7FA")); // DEL + try testing.expectEqual(expect_bs, dw.strWidth("A\x08")); // Backspace + try testing.expectEqual(expect_bs, dw.strWidth("\x7FA")); // DEL const expect_long_del: usize = if (c0 == 0) 0 else 1 + (c0 * 3); - try testing.expectEqual(expect_long_del, self.strWidth("\x7FA\x08\x08")); // never less than 0 + try testing.expectEqual(expect_long_del, dw.strWidth("\x7FA\x08\x08")); // never less than 0 // wcwidth Python lib 
tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py const empty = ""; - try testing.expectEqual(@as(usize, 0), self.strWidth(empty)); + try testing.expectEqual(@as(usize, 0), dw.strWidth(empty)); const with_null = "hello\x00world"; - try testing.expectEqual(@as(usize, 10 + c0), self.strWidth(with_null)); + try testing.expectEqual(@as(usize, 10 + c0), dw.strWidth(with_null)); const hello_jp = "コンニチハ, セカイ!"; - try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp)); + try testing.expectEqual(@as(usize, 19), dw.strWidth(hello_jp)); const control = "\x1b[0m"; - try testing.expectEqual(@as(usize, 3 + c0), self.strWidth(control)); + try testing.expectEqual(@as(usize, 3 + c0), dw.strWidth(control)); const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; - try testing.expectEqual(@as(usize, 3), self.strWidth(balinese)); + try testing.expectEqual(@as(usize, 3), dw.strWidth(balinese)); // These commented out tests require a new specification for complex scripts. // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf @@ -106,17 +198,17 @@ test "strWidth" { // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); // The following passes but as a mere coincidence. 
const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; - try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2)); + try testing.expectEqual(@as(usize, 2), dw.strWidth(kannada_2)); // From Rust https://github.com/jameslanska/unicode-display-width - try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); - try testing.expectEqual(@as(usize, 2), self.strWidth("🦀")); - try testing.expectEqual(@as(usize, 2), self.strWidth("👨‍👩‍👧‍👧")); - try testing.expectEqual(@as(usize, 2), self.strWidth("👩‍🔬")); - try testing.expectEqual(@as(usize, 9), self.strWidth("sane text")); - try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); - try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나")); - try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}")); + try testing.expectEqual(@as(usize, 15), dw.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); + try testing.expectEqual(@as(usize, 2), dw.strWidth("🦀")); + try testing.expectEqual(@as(usize, 2), dw.strWidth("👨‍👩‍👧‍👧")); + try testing.expectEqual(@as(usize, 2), dw.strWidth("👩‍🔬")); + try testing.expectEqual(@as(usize, 9), dw.strWidth("sane text")); + try testing.expectEqual(@as(usize, 9), dw.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); + try testing.expectEqual(@as(usize, 17), dw.strWidth("슬라바 우크라이나")); + try testing.expectEqual(@as(usize, 1), dw.strWidth("\u{378}")); } /// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. @@ -124,17 +216,17 @@ test "strWidth" { /// receive one additional pad. This makes sure the returned string fills the requested width. /// Caller must free returned bytes with `allocator`. 
pub fn center( - self: Self, + dw: DisplayWidth, allocator: mem.Allocator, str: []const u8, total_width: usize, pad: []const u8, ) ![]u8 { - const str_width = self.strWidth(str); + const str_width = dw.strWidth(str); if (str_width > total_width) return error.StrTooLong; if (str_width == total_width) return try allocator.dupe(u8, str); - const pad_width = self.strWidth(pad); + const pad_width = dw.strWidth(pad); if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; const margin_width = @divFloor((total_width - str_width), 2); @@ -165,62 +257,61 @@ pub fn center( test "center" { const allocator = testing.allocator; - const data = try DisplayWidthData.init(allocator); - defer data.deinit(allocator); - const self = Self{ .data = &data }; + const dw = try DisplayWidth.init(allocator); + defer dw.deinit(allocator); // Input and width both have odd length - var centered = try self.center(allocator, "abc", 9, "*"); + var centered = try dw.center(allocator, "abc", 9, "*"); try testing.expectEqualSlices(u8, "***abc***", centered); // Input and width both have even length testing.allocator.free(centered); - centered = try self.center(allocator, "w😊w", 10, "-"); + centered = try dw.center(allocator, "w😊w", 10, "-"); try testing.expectEqualSlices(u8, "---w😊w---", centered); // Input has even length, width has odd length testing.allocator.free(centered); - centered = try self.center(allocator, "1234", 9, "-"); + centered = try dw.center(allocator, "1234", 9, "-"); try testing.expectEqualSlices(u8, "--1234---", centered); // Input has odd length, width has even length testing.allocator.free(centered); - centered = try self.center(allocator, "123", 8, "-"); + centered = try dw.center(allocator, "123", 8, "-"); try testing.expectEqualSlices(u8, "--123---", centered); // Input is the same length as the width testing.allocator.free(centered); - centered = try self.center(allocator, "123", 3, "-"); + centered = try dw.center(allocator, "123", 3, 
"-"); try testing.expectEqualSlices(u8, "123", centered); // Input is empty testing.allocator.free(centered); - centered = try self.center(allocator, "", 3, "-"); + centered = try dw.center(allocator, "", 3, "-"); try testing.expectEqualSlices(u8, "---", centered); // Input is empty and width is zero testing.allocator.free(centered); - centered = try self.center(allocator, "", 0, "-"); + centered = try dw.center(allocator, "", 0, "-"); try testing.expectEqualSlices(u8, "", centered); // Input is longer than the width, which is an error testing.allocator.free(centered); - try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-")); + try testing.expectError(error.StrTooLong, dw.center(allocator, "123", 2, "-")); } /// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding /// on the left side. Caller must free returned bytes with `allocator`. pub fn padLeft( - self: Self, + dw: DisplayWidth, allocator: mem.Allocator, str: []const u8, total_width: usize, pad: []const u8, ) ![]u8 { - const str_width = self.strWidth(str); + const str_width = dw.strWidth(str); if (str_width > total_width) return error.StrTooLong; - const pad_width = self.strWidth(pad); + const pad_width = dw.strWidth(pad); if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; const margin_width = total_width - str_width; @@ -244,32 +335,31 @@ pub fn padLeft( test "padLeft" { const allocator = testing.allocator; - const data = try DisplayWidthData.init(allocator); - defer data.deinit(allocator); - const self = Self{ .data = &data }; + const dw = try DisplayWidth.init(allocator); + defer dw.deinit(allocator); - var right_aligned = try self.padLeft(allocator, "abc", 9, "*"); + var right_aligned = try dw.padLeft(allocator, "abc", 9, "*"); defer testing.allocator.free(right_aligned); try testing.expectEqualSlices(u8, "******abc", right_aligned); testing.allocator.free(right_aligned); - right_aligned = try 
self.padLeft(allocator, "w😊w", 10, "-"); + right_aligned = try dw.padLeft(allocator, "w😊w", 10, "-"); try testing.expectEqualSlices(u8, "------w😊w", right_aligned); } /// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding /// on the right side. Caller must free returned bytes with `allocator`. pub fn padRight( - self: Self, + dw: DisplayWidth, allocator: mem.Allocator, str: []const u8, total_width: usize, pad: []const u8, ) ![]u8 { - const str_width = self.strWidth(str); + const str_width = dw.strWidth(str); if (str_width > total_width) return error.StrTooLong; - const pad_width = self.strWidth(pad); + const pad_width = dw.strWidth(pad); if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; const margin_width = total_width - str_width; @@ -294,16 +384,15 @@ pub fn padRight( test "padRight" { const allocator = testing.allocator; - const data = try DisplayWidthData.init(allocator); - defer data.deinit(allocator); - const self = Self{ .data = &data }; + const dw = try DisplayWidth.init(allocator); + defer dw.deinit(allocator); - var left_aligned = try self.padRight(allocator, "abc", 9, "*"); + var left_aligned = try dw.padRight(allocator, "abc", 9, "*"); defer testing.allocator.free(left_aligned); try testing.expectEqualSlices(u8, "abc******", left_aligned); testing.allocator.free(left_aligned); - left_aligned = try self.padRight(allocator, "w😊w", 10, "-"); + left_aligned = try dw.padRight(allocator, "w😊w", 10, "-"); try testing.expectEqualSlices(u8, "w😊w------", left_aligned); } @@ -311,7 +400,7 @@ test "padRight" { /// `threshold` defines how far the last column of the last word can be /// from the edge. Caller must free returned bytes with `allocator`. 
pub fn wrap( - self: Self, + dw: DisplayWidth, allocator: mem.Allocator, str: []const u8, columns: usize, @@ -329,7 +418,7 @@ pub fn wrap( while (word_iter.next()) |word| { try result.appendSlice(word); try result.append(' '); - line_width += self.strWidth(word) + 1; + line_width += dw.strWidth(word) + 1; if (line_width > columns or columns - line_width <= threshold) { try result.append('\n'); @@ -347,12 +436,11 @@ pub fn wrap( test "wrap" { const allocator = testing.allocator; - const data = try DisplayWidthData.init(allocator); - defer data.deinit(allocator); - const self = Self{ .data = &data }; + const dw = try DisplayWidth.init(allocator); + defer dw.deinit(allocator); const input = "The quick brown fox\r\njumped over the lazy dog!"; - const got = try self.wrap(allocator, input, 10, 3); + const got = try dw.wrap(allocator, input, 10, 3); defer testing.allocator.free(got); const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; try testing.expectEqualStrings(want, got); diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig index 6d3174d..df025cb 100644 --- a/src/GraphemeData.zig +++ b/src/GraphemeData.zig @@ -36,7 +36,7 @@ s3: []u8 = undefined, const Self = @This(); -pub fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { +pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("gbp"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -65,23 +65,23 @@ pub fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { return self; } -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { +pub inline fn deinit(self: *const Self, allocator: mem.Allocator) void { allocator.free(self.s1); allocator.free(self.s2); allocator.free(self.s3); } /// Lookup the grapheme break property for a code point. 
-pub fn gbp(self: Self, cp: u21) Gbp { +pub inline fn gbp(self: Self, cp: u21) Gbp { return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4); } /// Lookup the indic syllable type for a code point. -pub fn indic(self: Self, cp: u21) Indic { +pub inline fn indic(self: Self, cp: u21) Indic { return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); } -/// Lookup the indic syllable type for a code point. -pub fn isEmoji(self: Self, cp: u21) bool { +/// Lookup the emoji property for a code point. +pub inline fn isEmoji(self: Self, cp: u21) bool { return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; } diff --git a/src/Normalize.zig b/src/Normalize.zig index a28b708..b738b27 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -177,7 +177,7 @@ test "decompose" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); var n = Self{ .norm_data = &data }; var buf: [18]u21 = undefined; @@ -307,11 +307,11 @@ test "nfd ASCII / no-alloc" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = Self{ .norm_data = &data }; const result = try n.nfd(allocator, "Hello World!"); - defer result.deinit(); + defer result.deinit(allocator); try testing.expectEqualStrings("Hello World!", result.slice); } @@ -320,11 +320,11 @@ test "nfd !ASCII / alloc" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = Self{ .norm_data = &data }; const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); - defer result.deinit(); + defer result.deinit(allocator); try testing.expectEqualStrings("He\u{301}llo World! 
\u{3d2}\u{301}", result.slice); } @@ -333,11 +333,11 @@ test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = Self{ .norm_data = &data }; const result = try n.nfkd(allocator, "Hello World!"); - defer result.deinit(); + defer result.deinit(allocator); try testing.expectEqualStrings("Hello World!", result.slice); } @@ -346,11 +346,11 @@ test "nfkd !ASCII / alloc" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = Self{ .norm_data = &data }; const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); - defer result.deinit(); + defer result.deinit(allocator); try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); } @@ -546,11 +546,11 @@ test "nfc" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = Self{ .norm_data = &data }; const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); - defer result.deinit(); + defer result.deinit(allocator); try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); } @@ -559,11 +559,11 @@ test "nfkc" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = Self{ .norm_data = &data }; const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); - defer result.deinit(); + defer result.deinit(allocator); try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); } @@ -582,7 +582,7 @@ test "eql" { const allocator = testing.allocator; var data: NormData = undefined; try NormData.init(&data, allocator); - defer data.deinit(); + defer data.deinit(allocator); const n = 
Self{ .norm_data = &data }; try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); @@ -628,5 +628,4 @@ test "isLatin1Only" { try testing.expect(isLatin1Only(latin1_only)); const not_latin1_only = "Héllo, World! \u{3d3}"; try testing.expect(!isLatin1Only(not_latin1_only)); - try testing.expect(false); } diff --git a/src/WidthData.zig b/src/WidthData.zig index b07a679..ca7eaf0 100644 --- a/src/WidthData.zig +++ b/src/WidthData.zig @@ -4,15 +4,36 @@ const compress = std.compress; const mem = std.mem; const testing = std.testing; -const GraphemeData = @import("GraphemeData"); +const Graphemes = @import("Graphemes"); -g_data: GraphemeData, +g_data: Graphemes, s1: []u16 = undefined, s2: []i4 = undefined, +owns_gdata: bool, const Self = @This(); pub fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { + var self: Self = try Self.setup(allocator); + errdefer { + allocator.free(self.s1); + allocator.free(self.s2); + } + self.owns_gdata = true; + self.g_data = try Graphemes.init(allocator); + errdefer self.g_data.deinit(allocator); + return self; +} + +pub fn initWithGraphemeData(allocator: mem.Allocator, g_data: Graphemes) mem.Allocator.Error!Self { + var self = try Self.setup(allocator); + self.g_data = g_data; + self.owns_gdata = false; + return self; +} + +// Sets up the DisplayWidthData, leaving the GraphemeData undefined. 
+fn setup(allocator: mem.Allocator) mem.Allocator.Error!Self { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("dwp"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -21,10 +42,7 @@ pub fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { const endian = builtin.cpu.arch.endian(); - var self = Self{ - .g_data = try GraphemeData.init(allocator), - }; - errdefer self.g_data.deinit(allocator); + var self: Self = undefined; const stage_1_len: u16 = reader.readInt(u16, endian) catch unreachable; self.s1 = try allocator.alloc(u16, stage_1_len); @@ -42,7 +60,7 @@ pub fn init(allocator: mem.Allocator) mem.Allocator.Error!Self { pub fn deinit(self: *const Self, allocator: mem.Allocator) void { allocator.free(self.s1); allocator.free(self.s2); - self.g_data.deinit(allocator); + if (self.owns_gdata) self.g_data.deinit(allocator); } /// codePointWidth returns the number of cells `cp` requires when rendered diff --git a/src/grapheme.zig b/src/grapheme.zig index 25fd71d..79cd2c6 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig @@ -1,10 +1,99 @@ const std = @import("std"); +const builtin = @import("builtin"); const mem = std.mem; +const Allocator = mem.Allocator; +const compress = std.compress; const unicode = std.unicode; const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; -pub const GraphemeData = @import("GraphemeData"); + +s1: []u16 = undefined, +s2: []u16 = undefined, +s3: []u8 = undefined, + +const Graphemes = @This(); + +pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Graphemes { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("gbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Graphemes{}; + + const s1_len: u16 = reader.readInt(u16, endian) catch 
unreachable; + self.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(self.s1); + for (0..s1_len) |i| self.s1[i] = reader.readInt(u16, endian) catch unreachable; + + const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; + self.s2 = try allocator.alloc(u16, s2_len); + errdefer allocator.free(self.s2); + for (0..s2_len) |i| self.s2[i] = reader.readInt(u16, endian) catch unreachable; + + const s3_len: u16 = reader.readInt(u16, endian) catch unreachable; + self.s3 = try allocator.alloc(u8, s3_len); + errdefer allocator.free(self.s3); + _ = reader.readAll(self.s3) catch unreachable; + + return self; +} + +pub fn deinit(graphemes: *const Graphemes, allocator: mem.Allocator) void { + allocator.free(graphemes.s1); + allocator.free(graphemes.s2); + allocator.free(graphemes.s3); +} + +/// Lookup the grapheme break property for a code point. +pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { + return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); +} + +/// Lookup the indic syllable type for a code point. +pub fn indic(graphemes: Graphemes, cp: u21) Indic { + return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); +} + +/// Lookup the emoji property for a code point. +pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { + return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; +} + +pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { + return Iterator.init(string, graphemes); +} + +/// Indic syllable type. +pub const Indic = enum { + none, + + Consonant, + Extend, + Linker, +}; + +/// Grapheme break property. +pub const Gbp = enum { + none, + Control, + CR, + Extend, + L, + LF, + LV, + LVT, + Prepend, + Regional_Indicator, + SpacingMark, + T, + V, + ZWJ, +}; /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 
pub const Grapheme = struct { @@ -22,12 +111,12 @@ pub const Grapheme = struct { pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointIterator, - data: *const GraphemeData, + data: *const Graphemes, const Self = @This(); /// Assumes `src` is valid UTF-8. - pub fn init(str: []const u8, data: *const GraphemeData) Self { + pub fn init(str: []const u8, data: *const Graphemes) Self { var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; self.advance(); return self; @@ -149,7 +238,7 @@ pub const Iterator = struct { }; // Predicates -fn isBreaker(cp: u21, data: *const GraphemeData) bool { +fn isBreaker(cp: u21, data: *const Graphemes) bool { // Extract relevant properties. const cp_gbp_prop = data.gbp(cp); return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; @@ -202,7 +291,7 @@ pub const State = struct { pub fn graphemeBreak( cp1: u21, cp2: u21, - data: *const GraphemeData, + data: *const Graphemes, state: *State, ) bool { // Extract relevant properties. 
@@ -306,25 +395,25 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; const no_joiner = seq_1 ++ seq_2; - const data = try GraphemeData.init(std.testing.allocator); - defer data.deinit(std.testing.allocator); + const graphemes = try Graphemes.init(std.testing.allocator); + defer graphemes.deinit(std.testing.allocator); { - var iter = Iterator.init(with_zwj, &data); + var iter = graphemes.iterator(with_zwj); var i: usize = 0; while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 1), i); } { - var iter = Iterator.init(with_zwsp, &data); + var iter = graphemes.iterator(with_zwsp); var i: usize = 0; while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 3), i); } { - var iter = Iterator.init(no_joiner, &data); + var iter = graphemes.iterator(no_joiner); var i: usize = 0; while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 2), i); diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 7236ff6..de1b9ec 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -9,7 +9,7 @@ const unicode = std.unicode; const grapheme = @import("grapheme"); const Grapheme = @import("grapheme").Grapheme; -const GraphemeData = @import("grapheme").GraphemeData; +const Graphemes = @import("grapheme"); const GraphemeIterator = @import("grapheme").Iterator; const Normalize = @import("Normalize"); @@ -18,10 +18,10 @@ comptime { } test "Iterator.peek" { const peek_seq = "aΔ👨🏻‍🌾→"; - const data = try GraphemeData.init(std.testing.allocator); + const data = try Graphemes.init(std.testing.allocator); defer data.deinit(std.testing.allocator); - var iter = grapheme.Iterator.init(peek_seq, &data); + var iter = data.iterator(peek_seq); const peek_a = iter.peek().?; const next_a = iter.next().?; try std.testing.expectEqual(peek_a, next_a); @@ -162,7 +162,7 @@ test "Segmentation GraphemeIterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = 
buf_reader.reader(); - const data = try GraphemeData.init(allocator); + const data = try Graphemes.init(allocator); defer data.deinit(allocator); var buf: [4096]u8 = undefined; @@ -207,7 +207,7 @@ test "Segmentation GraphemeIterator" { } // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = GraphemeIterator.init(all_bytes.items, &data); + var iter = data.iterator(all_bytes.items); // Check. for (want.items) |want_gc| { -- cgit v1.2.3