From 836a4b6e63ac4bd7beb406cb20edf23f0bd342a9 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Mon, 26 Feb 2024 12:24:42 -0400 Subject: Using separate data struct model. --- src/CombiningClassData.zig | 48 ++++++ src/DisplayWidth.zig | 351 +++++++++++++++++++++++++++++++++++++++++++ src/DisplayWidthData.zig | 82 +++++++++++ src/GraphemeData.zig | 86 +++++++++++ src/Normalizer.zig | 97 ++++++------ src/display_width.zig | 360 --------------------------------------------- src/grapheme.zig | 73 ++++----- src/main.zig | 32 +++- 8 files changed, 680 insertions(+), 449 deletions(-) create mode 100644 src/CombiningClassData.zig create mode 100644 src/DisplayWidth.zig create mode 100644 src/DisplayWidthData.zig create mode 100644 src/GraphemeData.zig delete mode 100644 src/display_width.zig (limited to 'src') diff --git a/src/CombiningClassData.zig b/src/CombiningClassData.zig new file mode 100644 index 0000000..95c947d --- /dev/null +++ b/src/CombiningClassData.zig @@ -0,0 +1,48 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; + +allocator: mem.Allocator, +s1: []u16 = undefined, +s2: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.deflate.decompressor; + const in_bytes = @embedFile("ccc"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = try decompressor(allocator, in_fbs.reader(), null); + defer in_decomp.deinit(); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{ .allocator = allocator }; + + const stage_1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, stage_1_len); + for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(u8, stage_2_len); + _ = try reader.readAll(self.s2); + + return self; +} + +pub 
fn deinit(self: *Self) void { + self.allocator.free(self.s1); + self.allocator.free(self.s2); +} + +/// Returns the canonical combining class for a code point. +pub inline fn ccc(self: Self, cp: u21) u8 { + return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; +} + +/// True if `cp` is a starter code point, not a combining character. +pub inline fn isStarter(self: Self, cp: u21) bool { + return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0; +} diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig new file mode 100644 index 0000000..85d04a0 --- /dev/null +++ b/src/DisplayWidth.zig @@ -0,0 +1,351 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const ArrayList = std.ArrayList; +const mem = std.mem; +const simd = std.simd; +const testing = std.testing; + +const ascii = @import("ascii"); +const CodePointIterator = @import("code_point").Iterator; +const GraphemeIterator = @import("grapheme").Iterator; +pub const Data = @import("DisplayWidthData"); + +data: *Data, + +const Self = @This(); + +/// strWidth returns the total display width of `str` as the number of cells +/// required in a fixed-pitch font (i.e. a terminal screen). +pub fn strWidth(self: Self, str: []const u8) usize { + var total: isize = 0; + + // ASCII fast path + if (ascii.isAsciiOnly(str)) { + for (str) |b| total += self.data.codePointWidth(b); + return @intCast(@max(0, total)); + } + + var giter = GraphemeIterator.init(str, &self.data.g_data); + + while (giter.next()) |gc| { + var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; + var gc_total: isize = 0; + + while (cp_iter.next()) |cp| { + var w = self.data.codePointWidth(cp.code); + + if (w != 0) { + // Handle text emoji sequence. + if (cp_iter.next()) |ncp| { + // emoji text sequence. + if (ncp.code == 0xFE0E) w = 1; + } + + // Only adding width of first non-zero-width code point. 
+ if (gc_total == 0) { + gc_total = w; + break; + } + } + } + + total += gc_total; + } + + return @intCast(@max(0, total)); +} + +test "strWidth" { + var data = try Data.init(testing.allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n")); + try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}")); + try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸")); + try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji + try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence + try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence + try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace + try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL + try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than 0 + + // wcwidth Python lib tests. 
See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py + const empty = ""; + try testing.expectEqual(@as(usize, 0), self.strWidth(empty)); + const with_null = "hello\x00world"; + try testing.expectEqual(@as(usize, 10), self.strWidth(with_null)); + const hello_jp = "コンニチハ, セカイ!"; + try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp)); + const control = "\x1b[0m"; + try testing.expectEqual(@as(usize, 3), self.strWidth(control)); + const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; + try testing.expectEqual(@as(usize, 3), self.strWidth(balinese)); + + // These commented out tests require a new specification for complex scripts. + // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf + // const jamo = "\u{1100}\u{1160}"; + // try testing.expectEqual(@as(usize, 3), strWidth(jamo)); + // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}"; + // try testing.expectEqual(@as(usize, 3), strWidth(devengari)); + // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}"; + // try testing.expectEqual(@as(usize, 5), strWidth(tamal)); + // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}"; + // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); + // The following passes but as a mere coincidence. 
+ const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; + try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2)); + + // From Rust https://github.com/jameslanska/unicode-display-width + try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); + try testing.expectEqual(@as(usize, 2), self.strWidth("🦀")); + try testing.expectEqual(@as(usize, 2), self.strWidth("👨‍👩‍👧‍👧")); + try testing.expectEqual(@as(usize, 2), self.strWidth("👩‍🔬")); + try testing.expectEqual(@as(usize, 9), self.strWidth("sane text")); + try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); + try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나")); + try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}")); +} + +/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. +/// If the length of `str` and `total_width` have different parity, the right side of `str` will +/// receive one additional pad. This makes sure the returned string fills the requested width. +/// Caller must free returned bytes with `allocator`. 
+pub fn center( + self: Self, + allocator: mem.Allocator, + str: []const u8, + total_width: usize, + pad: []const u8, +) ![]u8 { + const str_width = self.strWidth(str); + if (str_width > total_width) return error.StrTooLong; + if (str_width == total_width) return try allocator.dupe(u8, str); + + const pad_width = self.strWidth(pad); + if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; + + const margin_width = @divFloor((total_width - str_width), 2); + if (pad_width > margin_width) return error.PadTooLong; + const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0; + const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad; + + var result = try allocator.alloc(u8, pads * pad.len + str.len); + var bytes_index: usize = 0; + var pads_index: usize = 0; + + while (pads_index < pads / 2) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + @memcpy(result[bytes_index..][0..str.len], str); + bytes_index += str.len; + + pads_index = 0; + while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + return result; +} + +test "center" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + // Input and width both have odd length + var centered = try self.center(allocator, "abc", 9, "*"); + try testing.expectEqualSlices(u8, "***abc***", centered); + + // Input and width both have even length + testing.allocator.free(centered); + centered = try self.center(allocator, "w😊w", 10, "-"); + try testing.expectEqualSlices(u8, "---w😊w---", centered); + + // Input has even length, width has odd length + testing.allocator.free(centered); + centered = try self.center(allocator, "1234", 9, "-"); + try testing.expectEqualSlices(u8, "--1234---", centered); + + // Input has odd length, width has even 
length + testing.allocator.free(centered); + centered = try self.center(allocator, "123", 8, "-"); + try testing.expectEqualSlices(u8, "--123---", centered); + + // Input is the same length as the width + testing.allocator.free(centered); + centered = try self.center(allocator, "123", 3, "-"); + try testing.expectEqualSlices(u8, "123", centered); + + // Input is empty + testing.allocator.free(centered); + centered = try self.center(allocator, "", 3, "-"); + try testing.expectEqualSlices(u8, "---", centered); + + // Input is empty and width is zero + testing.allocator.free(centered); + centered = try self.center(allocator, "", 0, "-"); + try testing.expectEqualSlices(u8, "", centered); + + // Input is longer than the width, which is an error + testing.allocator.free(centered); + try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-")); +} + +/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding +/// on the left side. Caller must free returned bytes with `allocator`. 
+pub fn padLeft( + self: Self, + allocator: mem.Allocator, + str: []const u8, + total_width: usize, + pad: []const u8, +) ![]u8 { + const str_width = self.strWidth(str); + if (str_width > total_width) return error.StrTooLong; + + const pad_width = self.strWidth(pad); + if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; + + const margin_width = total_width - str_width; + if (pad_width > margin_width) return error.PadTooLong; + + const pads = @divFloor(margin_width, pad_width); + + var result = try allocator.alloc(u8, pads * pad.len + str.len); + var bytes_index: usize = 0; + var pads_index: usize = 0; + + while (pads_index < pads) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + @memcpy(result[bytes_index..][0..str.len], str); + + return result; +} + +test "padLeft" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + var right_aligned = try self.padLeft(allocator, "abc", 9, "*"); + defer testing.allocator.free(right_aligned); + try testing.expectEqualSlices(u8, "******abc", right_aligned); + + testing.allocator.free(right_aligned); + right_aligned = try self.padLeft(allocator, "w😊w", 10, "-"); + try testing.expectEqualSlices(u8, "------w😊w", right_aligned); +} + +/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding +/// on the right side. Caller must free returned bytes with `allocator`. 
+pub fn padRight( + self: Self, + allocator: mem.Allocator, + str: []const u8, + total_width: usize, + pad: []const u8, +) ![]u8 { + const str_width = self.strWidth(str); + if (str_width > total_width) return error.StrTooLong; + + const pad_width = self.strWidth(pad); + if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; + + const margin_width = total_width - str_width; + if (pad_width > margin_width) return error.PadTooLong; + + const pads = @divFloor(margin_width, pad_width); + + var result = try allocator.alloc(u8, pads * pad.len + str.len); + var bytes_index: usize = 0; + var pads_index: usize = 0; + + @memcpy(result[bytes_index..][0..str.len], str); + bytes_index += str.len; + + while (pads_index < pads) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + return result; +} + +test "padRight" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + var left_aligned = try self.padRight(allocator, "abc", 9, "*"); + defer testing.allocator.free(left_aligned); + try testing.expectEqualSlices(u8, "abc******", left_aligned); + + testing.allocator.free(left_aligned); + left_aligned = try self.padRight(allocator, "w😊w", 10, "-"); + try testing.expectEqualSlices(u8, "w😊w------", left_aligned); +} + +/// Wraps a string approximately at the given number of columns per line. +/// `threshold` defines how far the last column of the last word can be +/// from the edge. Caller must free returned bytes with `allocator`. 
+pub fn wrap( + self: Self, + allocator: mem.Allocator, + str: []const u8, + columns: usize, + threshold: usize, +) ![]u8 { + var result = ArrayList(u8).init(allocator); + defer result.deinit(); + + var line_iter = mem.tokenizeAny(u8, str, "\r\n"); + var line_width: usize = 0; + + while (line_iter.next()) |line| { + var word_iter = mem.tokenizeScalar(u8, line, ' '); + + while (word_iter.next()) |word| { + try result.appendSlice(word); + try result.append(' '); + line_width += self.strWidth(word) + 1; + + if (line_width > columns or columns - line_width <= threshold) { + try result.append('\n'); + line_width = 0; + } + } + } + + // Remove trailing space and newline. + _ = result.pop(); + _ = result.pop(); + + return try result.toOwnedSlice(); +} + +test "wrap" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + const input = "The quick brown fox\r\njumped over the lazy dog!"; + const got = try self.wrap(allocator, input, 10, 3); + defer testing.allocator.free(got); + const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; + try testing.expectEqualStrings(want, got); +} diff --git a/src/DisplayWidthData.zig b/src/DisplayWidthData.zig new file mode 100644 index 0000000..32f8658 --- /dev/null +++ b/src/DisplayWidthData.zig @@ -0,0 +1,82 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; + +const GraphemeData = @import("GraphemeData"); + +allocator: mem.Allocator, +g_data: GraphemeData, +s1: []u16 = undefined, +s2: []i3 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.deflate.decompressor; + const in_bytes = @embedFile("dwp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = try decompressor(allocator, in_fbs.reader(), null); + defer in_decomp.deinit(); + var reader = 
in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{ + .allocator = allocator, + .g_data = try GraphemeData.init(allocator), + }; + + const stage_1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, stage_1_len); + for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(i3, stage_2_len); + for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian)); + + return self; +} + +pub fn deinit(self: *Self) void { + self.allocator.free(self.s1); + self.allocator.free(self.s2); + self.g_data.deinit(); +} + +/// codePointWidth returns the number of cells `cp` requires when rendered +/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to +/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 +/// control codes return 0. There is no `cjk` option here; per the test below, +/// an ambiguous-width code point such as 0x00BD returns 1. 
+pub inline fn codePointWidth(self: Self, cp: u21) i3 { + return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; +} + +test "codePointWidth" { + try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null + try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b + try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL + try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf + try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL + try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF + try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT + try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF + try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR + try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ + try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI + + try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf + try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic + + try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen + try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash + try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash + + try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth + + try testing.expectEqual(@as(i3, 1), codePointWidth('é')); + try testing.expectEqual(@as(i3, 2), codePointWidth('😊')); + try testing.expectEqual(@as(i3, 2), codePointWidth('统')); +} diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig new file mode 100644 index 0000000..e418dea --- /dev/null +++ b/src/GraphemeData.zig @@ -0,0 +1,86 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; + +/// Indic syllable type. +pub const Indic = enum { + none, + + Consonant, + Extend, + Linker, +}; + +/// Grapheme break property. 
+pub const Gbp = enum { + none, + Control, + CR, + Extend, + L, + LF, + LV, + LVT, + Prepend, + Regional_Indicator, + SpacingMark, + T, + V, + ZWJ, +}; + +allocator: mem.Allocator, +s1: []u16 = undefined, +s2: []u16 = undefined, +s3: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.deflate.decompressor; + const in_bytes = @embedFile("gbp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = try decompressor(allocator, in_fbs.reader(), null); + defer in_decomp.deinit(); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{ .allocator = allocator }; + + const s1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, s1_len); + for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const s2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(u16, s2_len); + for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian); + + const s3_len: u16 = try reader.readInt(u16, endian); + self.s3 = try allocator.alloc(u8, s3_len); + _ = try reader.readAll(self.s3); + + return self; +} + +pub fn deinit(self: *Self) void { + self.allocator.free(self.s1); + self.allocator.free(self.s2); + self.allocator.free(self.s3); +} + +/// Lookup the grapheme break property for a code point. +pub inline fn gbp(self: Self, cp: u21) Gbp { + return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4); +} + +/// Lookup the indic syllable type for a code point. +pub inline fn indic(self: Self, cp: u21) Indic { + return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); +} + +/// Lookup whether the emoji bit is set for a code point. 
+pub inline fn isEmoji(self: Self, cp: u21) bool { + return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; +} diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 1b4a2d5..6a19f47 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig @@ -8,16 +8,18 @@ const CodePointIterator = @import("code_point").Iterator; const case_fold_map = @import("ziglyph").case_folding; const hangul_map = @import("ziglyph").hangul; const norm_props = @import("ziglyph").normalization_props; -const normp = @import("normp"); - -const Self = @This(); +pub const Data = @import("CombiningClassData"); +ccc_data: *Data, nfc_map: std.AutoHashMap([2]u21, u21), nfd_map: std.AutoHashMap(u21, [2]u21), nfkd_map: std.AutoHashMap(u21, [18]u21), -pub fn init(allocator: std.mem.Allocator) !Self { +const Self = @This(); + +pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { var self = Self{ + .ccc_data = data, .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), @@ -95,7 +97,9 @@ pub fn deinit(self: *Self) void { } test "init / deinit" { - var n = try init(std.testing.allocator); + var data = try Data.init(std.testing.allocator); + defer data.deinit(); + var n = try init(std.testing.allocator, &data); defer n.deinit(); } @@ -241,7 +245,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { test "decompose" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var dc = n.decompose('é', .nfd); @@ -307,19 +313,17 @@ pub const Result = struct { }; // Compares code points by Canonical Combining Class order. 
-fn cccLess(_: void, lhs: u21, rhs: u21) bool { - const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; - const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)]; - return lcc < rcc; +fn cccLess(self: Self, lhs: u21, rhs: u21) bool { + return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); } // Applies the Canonical Sorting Algorithm. -fn canonicalSort(cps: []u21) void { +fn canonicalSort(self: Self, cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; - while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} - std.mem.sort(u21, cps[start..i], {}, cccLess); + while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} + std.mem.sort(u21, cps[start..i], self, cccLess); } } @@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! try dcp_list.appendSlice(slice); } - canonicalSort(dcp_list.items); + self.canonicalSort(dcp_list.items); var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); defer dstr_list.deinit(); @@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! test "nfd ASCII / no-alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfd(allocator, "Hello World!"); @@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfd(allocator, "Héllo World! 
\u{3d3}"); @@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfkd(allocator, "Hello World!"); @@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); @@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool { return cp >= 0x1100 and hangul_map.syllableType(cp) != null; } -fn isStarter(cp: u21) bool { - return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; -} - -fn isCombining(cp: u21) bool { - return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0; -} - -fn isNonHangulStarter(cp: u21) bool { - return !isHangul(cp) and isStarter(cp); +fn isNonHangulStarter(self: Self, cp: u21) bool { + return !isHangul(cp) and self.ccc_data.isStarter(cp); } /// Normalizes `str` to NFC. @@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! block_check: while (i < d_list.items.len) : (i += 1) { const C = d_list.items[i]; - const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; + const cc_C = self.ccc_data.ccc(C); var starter_index: ?usize = null; var j: usize = i; @@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! j -= 1; // Check for starter. - if (isStarter(d_list.items[j])) { + if (self.ccc_data.isStarter(d_list.items[j])) { if (i - j > 1) { // If there's distance between the starting point and the current position. for (d_list.items[(j + 1)..i]) |B| { + const cc_B = self.ccc_data.ccc(B); // Check for blocking conditions. 
if (isHangul(C)) { - if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; + if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; } - const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)]; if (cc_B >= cc_C) continue :block_check; } } @@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! test "nfc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); @@ -571,7 +577,9 @@ test "nfc" { test "nfkc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); @@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u test "eql" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); @@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ test "eqlCaseless" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); @@ -707,7 +719,7 @@ test "eqlCaseless" { // FCD fn getLeadCcc(self: Self, cp: u21) u8 { const dc = self.mapping(cp, .nfd); - return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; + return self.ccc_data.ccc(dc.cps[0]); } fn getTrailCcc(self: Self, cp: u21) u8 { 
@@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 { const len = for (dc.cps, 0..) |dcp, i| { if (dcp == 0) break i; } else dc.cps.len; - const tcp = dc.cps[len -| 1]; - return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)]; + return self.ccc_data.ccc(dc.cps[len - 1]); } /// Fast check to detect if a string is already in NFC or NFD form. @@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool { test "isFcd" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); const is_nfc = "José \u{3D3}"; @@ -751,7 +764,9 @@ test "Unicode normalization tests" { defer arena.deinit(); var allocator = arena.allocator(); - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); diff --git a/src/display_width.zig b/src/display_width.zig deleted file mode 100644 index a916cac..0000000 --- a/src/display_width.zig +++ /dev/null @@ -1,360 +0,0 @@ -const std = @import("std"); -const simd = std.simd; -const mem = std.mem; -const testing = std.testing; - -const ascii = @import("ascii"); -const CodePointIterator = @import("code_point").Iterator; -const dwp = @import("dwp"); -const GraphemeIterator = @import("grapheme").Iterator; - -/// codePointWidth returns the number of cells `cp` requires when rendered -/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to -/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 -/// control codes return 0. If `cjk` is true, ambiguous code points return 2, -/// otherwise they return 1. 
-pub fn codePointWidth(cp: u21) i3 { - return dwp.stage_2[dwp.stage_1[cp >> 8] + (cp & 0xff)]; -} - -test "codePointWidth" { - try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null - try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b - try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL - try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf - try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI - - try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf - try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic - - try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen - try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash - try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash - - try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth - - try testing.expectEqual(@as(i3, 1), codePointWidth('é')); - try testing.expectEqual(@as(i3, 2), codePointWidth('😊')); - try testing.expectEqual(@as(i3, 2), codePointWidth('统')); -} - -/// strWidth returns the total display width of `str` as the number of cells -/// required in a fixed-pitch font (i.e. a terminal screen). 
-pub fn strWidth(str: []const u8) usize { - var total: isize = 0; - - // ASCII fast path - if (ascii.isAsciiOnly(str)) { - for (str) |b| total += codePointWidth(b); - return @intCast(@max(0, total)); - } - - var giter = GraphemeIterator.init(str); - - while (giter.next()) |gc| { - var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; - var gc_total: isize = 0; - - while (cp_iter.next()) |cp| { - var w = codePointWidth(cp.code); - - if (w != 0) { - // Handle text emoji sequence. - if (cp_iter.next()) |ncp| { - // emoji text sequence. - if (ncp.code == 0xFE0E) w = 1; - } - - // Only adding width of first non-zero-width code point. - if (gc_total == 0) { - gc_total = w; - break; - } - } - } - - total += gc_total; - } - - return @intCast(@max(0, total)); -} - -test "strWidth" { - try testing.expectEqual(@as(usize, 5), strWidth("Hello\r\n")); - try testing.expectEqual(@as(usize, 1), strWidth("\u{0065}\u{0301}")); - try testing.expectEqual(@as(usize, 2), strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); - try testing.expectEqual(@as(usize, 8), strWidth("Hello 😊")); - try testing.expectEqual(@as(usize, 8), strWidth("Héllo 😊")); - try testing.expectEqual(@as(usize, 8), strWidth("Héllo :)")); - try testing.expectEqual(@as(usize, 8), strWidth("Héllo 🇪🇸")); - try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}")); // Lone emoji - try testing.expectEqual(@as(usize, 1), strWidth("\u{26A1}\u{FE0E}")); // Text sequence - try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence - try testing.expectEqual(@as(usize, 0), strWidth("A\x08")); // Backspace - try testing.expectEqual(@as(usize, 0), strWidth("\x7FA")); // DEL - try testing.expectEqual(@as(usize, 0), strWidth("\x7FA\x08\x08")); // never less than o - - // wcwidth Python lib tests. 
See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py - const empty = ""; - try testing.expectEqual(@as(usize, 0), strWidth(empty)); - const with_null = "hello\x00world"; - try testing.expectEqual(@as(usize, 10), strWidth(with_null)); - const hello_jp = "コンニチハ, セカイ!"; - try testing.expectEqual(@as(usize, 19), strWidth(hello_jp)); - const control = "\x1b[0m"; - try testing.expectEqual(@as(usize, 3), strWidth(control)); - const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; - try testing.expectEqual(@as(usize, 3), strWidth(balinese)); - - // These commented out tests require a new specification for complex scripts. - // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf - // const jamo = "\u{1100}\u{1160}"; - // try testing.expectEqual(@as(usize, 3), strWidth(jamo)); - // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}"; - // try testing.expectEqual(@as(usize, 3), strWidth(devengari)); - // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}"; - // try testing.expectEqual(@as(usize, 5), strWidth(tamal)); - // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}"; - // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); - // The following passes but as a mere coincidence. 
- const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; - try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); - - // From Rust https://github.com/jameslanska/unicode-display-width - try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); - try testing.expectEqual(@as(usize, 2), strWidth("🦀")); - try testing.expectEqual(@as(usize, 2), strWidth("👨‍👩‍👧‍👧")); - try testing.expectEqual(@as(usize, 2), strWidth("👩‍🔬")); - try testing.expectEqual(@as(usize, 9), strWidth("sane text")); - try testing.expectEqual(@as(usize, 9), strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); - try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나")); - try testing.expectEqual(@as(usize, 1), strWidth("\u{378}")); -} - -/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. -/// If the length of `str` and `total_width` have different parity, the right side of `str` will -/// receive one additional pad. This makes sure the returned string fills the requested width. -/// Caller must free returned bytes with `allocator`. 
-pub fn center( - allocator: mem.Allocator, - str: []const u8, - total_width: usize, - pad: []const u8, -) ![]u8 { - const str_width = strWidth(str); - if (str_width > total_width) return error.StrTooLong; - if (str_width == total_width) return try allocator.dupe(u8, str); - - const pad_width = strWidth(pad); - if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; - - const margin_width = @divFloor((total_width - str_width), 2); - if (pad_width > margin_width) return error.PadTooLong; - const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0; - const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad; - - var result = try allocator.alloc(u8, pads * pad.len + str.len); - var bytes_index: usize = 0; - var pads_index: usize = 0; - - while (pads_index < pads / 2) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - @memcpy(result[bytes_index..][0..str.len], str); - bytes_index += str.len; - - pads_index = 0; - while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - return result; -} - -test "center" { - var allocator = std.testing.allocator; - - // Input and width both have odd length - var centered = try center(allocator, "abc", 9, "*"); - try testing.expectEqualSlices(u8, "***abc***", centered); - - // Input and width both have even length - allocator.free(centered); - centered = try center(allocator, "w😊w", 10, "-"); - try testing.expectEqualSlices(u8, "---w😊w---", centered); - - // Input has even length, width has odd length - allocator.free(centered); - centered = try center(allocator, "1234", 9, "-"); - try testing.expectEqualSlices(u8, "--1234---", centered); - - // Input has odd length, width has even length - allocator.free(centered); - centered = try center(allocator, "123", 8, "-"); - try testing.expectEqualSlices(u8, "--123---", centered); - - // Input 
is the same length as the width - allocator.free(centered); - centered = try center(allocator, "123", 3, "-"); - try testing.expectEqualSlices(u8, "123", centered); - - // Input is empty - allocator.free(centered); - centered = try center(allocator, "", 3, "-"); - try testing.expectEqualSlices(u8, "---", centered); - - // Input is empty and width is zero - allocator.free(centered); - centered = try center(allocator, "", 0, "-"); - try testing.expectEqualSlices(u8, "", centered); - - // Input is longer than the width, which is an error - allocator.free(centered); - try testing.expectError(error.StrTooLong, center(allocator, "123", 2, "-")); -} - -/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding -/// on the left side. Caller must free returned bytes with `allocator`. -pub fn padLeft( - allocator: std.mem.Allocator, - str: []const u8, - total_width: usize, - pad: []const u8, -) ![]u8 { - const str_width = strWidth(str); - if (str_width > total_width) return error.StrTooLong; - - const pad_width = strWidth(pad); - if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; - - const margin_width = total_width - str_width; - if (pad_width > margin_width) return error.PadTooLong; - - const pads = @divFloor(margin_width, pad_width); - - var result = try allocator.alloc(u8, pads * pad.len + str.len); - var bytes_index: usize = 0; - var pads_index: usize = 0; - - while (pads_index < pads) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - @memcpy(result[bytes_index..][0..str.len], str); - - return result; -} - -test "padLeft" { - var allocator = std.testing.allocator; - - var right_aligned = try padLeft(allocator, "abc", 9, "*"); - defer allocator.free(right_aligned); - try testing.expectEqualSlices(u8, "******abc", right_aligned); - - allocator.free(right_aligned); - right_aligned = try padLeft(allocator, "w😊w", 10, "-"); - try 
testing.expectEqualSlices(u8, "------w😊w", right_aligned); -} - -/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding -/// on the right side. Caller must free returned bytes with `allocator`. -pub fn padRight( - allocator: std.mem.Allocator, - str: []const u8, - total_width: usize, - pad: []const u8, -) ![]u8 { - const str_width = strWidth(str); - if (str_width > total_width) return error.StrTooLong; - - const pad_width = strWidth(pad); - if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; - - const margin_width = total_width - str_width; - if (pad_width > margin_width) return error.PadTooLong; - - const pads = @divFloor(margin_width, pad_width); - - var result = try allocator.alloc(u8, pads * pad.len + str.len); - var bytes_index: usize = 0; - var pads_index: usize = 0; - - @memcpy(result[bytes_index..][0..str.len], str); - bytes_index += str.len; - - while (pads_index < pads) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - return result; -} - -test "padRight" { - var allocator = std.testing.allocator; - - var left_aligned = try padRight(allocator, "abc", 9, "*"); - defer allocator.free(left_aligned); - try testing.expectEqualSlices(u8, "abc******", left_aligned); - - allocator.free(left_aligned); - left_aligned = try padRight(allocator, "w😊w", 10, "-"); - try testing.expectEqualSlices(u8, "w😊w------", left_aligned); -} - -/// Wraps a string approximately at the given number of colums per line. -/// `threshold` defines how far the last column of the last word can be -/// from the edge. Caller must free returned bytes with `allocator`. 
-pub fn wrap( - allocator: std.mem.Allocator, - str: []const u8, - columns: usize, - threshold: usize, -) ![]u8 { - var result = std.ArrayList(u8).init(allocator); - defer result.deinit(); - - var line_iter = mem.tokenizeAny(u8, str, "\r\n"); - var line_width: usize = 0; - - while (line_iter.next()) |line| { - var word_iter = mem.tokenizeScalar(u8, line, ' '); - - while (word_iter.next()) |word| { - try result.appendSlice(word); - try result.append(' '); - line_width += strWidth(word) + 1; - - if (line_width > columns or columns - line_width <= threshold) { - try result.append('\n'); - line_width = 0; - } - } - } - - // Remove trailing space and newline. - _ = result.pop(); - _ = result.pop(); - - return try result.toOwnedSlice(); -} - -test "wrap" { - var allocator = std.testing.allocator; - const input = "The quick brown fox\r\njumped over the lazy dog!"; - const got = try wrap(allocator, input, 10, 3); - defer allocator.free(got); - const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; - try testing.expectEqualStrings(want, got); -} diff --git a/src/grapheme.zig b/src/grapheme.zig index 3fdf10b..7125b5b 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig @@ -1,9 +1,10 @@ const std = @import("std"); +const mem = std.mem; const unicode = std.unicode; const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; -const gbp = @import("gbp"); +pub const Data = @import("GraphemeData"); /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { @@ -21,12 +22,13 @@ pub const Grapheme = struct { pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointIterator, + data: *Data, const Self = @This(); /// Assumes `src` is valid UTF-8. 
- pub fn init(str: []const u8) Self { - var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; + pub fn init(str: []const u8, data: *Data) Self { + var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; self.advance(); return self; } @@ -55,6 +57,7 @@ pub const Iterator = struct { if (graphemeBreak( self.buf[0].?.code, self.buf[1].?.code, + self.data, &state, )) return Grapheme{ .len = gc_len, .offset = gc_start }; @@ -67,6 +70,7 @@ pub const Iterator = struct { if (graphemeBreak( self.buf[0].?.code, if (self.buf[1]) |ncp| ncp.code else 0, + self.data, &state, )) break; } @@ -76,18 +80,12 @@ pub const Iterator = struct { }; // Predicates -fn isBreaker(cp: u21) bool { +fn isBreaker(cp: u21, data: *Data) bool { // Extract relevant properties. - const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; - const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4); + const cp_gbp_prop = data.gbp(cp); return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } -fn isIgnorable(cp: u21) bool { - const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; - return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; -} - // Grapheme break state. const State = struct { bits: u3 = 0, @@ -135,18 +133,17 @@ const State = struct { pub fn graphemeBreak( cp1: u21, cp2: u21, + data: *Data, state: *State, ) bool { // Extract relevant properties. 
- const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; - const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); - const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); - const cp1_is_emoji = cp1_props_byte & 1 == 1; + const cp1_gbp_prop = data.gbp(cp1); + const cp1_indic_prop = data.indic(cp1); + const cp1_is_emoji = data.isEmoji(cp1); - const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; - const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); - const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); - const cp2_is_emoji = cp2_props_byte & 1 == 1; + const cp2_gbp_prop = data.gbp(cp2); + const cp2_indic_prop = data.indic(cp2); + const cp2_is_emoji = data.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); @@ -157,7 +154,7 @@ pub fn graphemeBreak( if (cp1 == '\r' and cp2 == '\n') return false; // GB4: Control - if (isBreaker(cp1)) return true; + if (isBreaker(cp1, data)) return true; // GB11: Emoji Extend* ZWJ x Emoji if (state.hasXpic() and @@ -175,7 +172,7 @@ pub fn graphemeBreak( if (cp2_gbp_prop == .SpacingMark) return false; // GB9b: Prepend x - if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; + if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { @@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = buf_reader.reader(); + var data = try Data.init(allocator); + defer data.deinit(); + var buf: [4096]u8 = undefined; var line_no: usize = 1; @@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" { } // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = Iterator.init(all_bytes.items); + var iter = Iterator.init(all_bytes.items, &data); // 
Chaeck. for (want.items) |want_gc| { @@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" { } } -test "Segmentation comptime GraphemeIterator" { - const want = [_][]const u8{ "H", "é", "l", "l", "o" }; - - comptime { - const src = "Héllo"; - var ct_iter = Iterator.init(src); - var i = 0; - while (ct_iter.next()) |grapheme| : (i += 1) { - try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); - } - } -} - test "Segmentation ZWJ and ZWSP emoji sequences" { const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; @@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; const no_joiner = seq_1 ++ seq_2; - var ct_iter = Iterator.init(with_zwj); + var data = try Data.init(std.testing.allocator); + defer data.deinit(); + + var iter = Iterator.init(with_zwj, &data); + var i: usize = 0; - while (ct_iter.next()) |_| : (i += 1) {} + while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 1), i); - ct_iter = Iterator.init(with_zwsp); + iter = Iterator.init(with_zwsp, &data); i = 0; - while (ct_iter.next()) |_| : (i += 1) {} + while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 3), i); - ct_iter = Iterator.init(no_joiner); + iter = Iterator.init(no_joiner, &data); i = 0; - while (ct_iter.next()) |_| : (i += 1) {} + while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 2), i); } diff --git a/src/main.zig b/src/main.zig index 946ae01..57db05b 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,29 +1,47 @@ const std = @import("std"); // const GraphemeIterator = @import("ziglyph").GraphemeIterator; -// const GraphemeIterator = @import("Grapheme").GraphemeIterator; +// const Data = @import("grapheme").Data; +// const GraphemeIterator = @import("grapheme").Iterator; + // const codePointWidth = @import("ziglyph").display_width.codePointWidth; -// const codePointWidth = 
@import("display_width").codePointWidth; // const strWidth = @import("ziglyph").display_width.strWidth; +// const Data = @import("display_width").Data; +// const codePointWidth = @import("display_width").codePointWidth; // const strWidth = @import("display_width").strWidth; -// const CodePointIterator = @import("CodePoint").CodePointIterator; + +// const CodePointIterator = @import("ziglyph").CodePointIterator; +// const CodePointIterator = @import("code_point").Iterator; + // const ascii = @import("ascii"); // const ascii = std.ascii; + // const norm = @import("ziglyph").Normalizer; +const Data = @import("Normalizer").Data; const norm = @import("Normalizer"); pub fn main() !void { + var args_iter = std.process.args(); + _ = args_iter.skip(); + const in_path = args_iter.next() orelse return error.MissingArg; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); const allocator = gpa.allocator(); - const input = try std.fs.cwd().readFileAlloc(allocator, "data/lang_mix.txt", std.math.maxInt(u32)); + const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); defer allocator.free(input); - var n = try norm.init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + + var n = try norm.init(allocator, &data); defer n.deinit(); + // var n = try norm.init(allocator); + // defer n.deinit(); - // var iter = GraphemeIterator.init(input); + // var iter = GraphemeIterator.init(input, &data); + // defer iter.deinit(); // var iter = CodePointIterator{ .bytes = input }; var iter = std.mem.splitScalar(u8, input, '\n'); @@ -33,7 +51,7 @@ pub fn main() !void { // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); // while (iter.next()) |_| result += 1; - // while (iter.next()) |line| result += strWidth(line); + // while (iter.next()) |line| result += strWidth(line, &data); while (iter.next()) |line| { var nfc = try n.nfc(allocator, line); result += nfc.slice.len; -- cgit v1.2.3