From 836a4b6e63ac4bd7beb406cb20edf23f0bd342a9 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Mon, 26 Feb 2024 12:24:42 -0400 Subject: Using separate data struct model. --- build.zig | 71 ++++++--- codegen/ccc.zig | 125 ++++++++++++++++ codegen/dwp.zig | 23 ++- codegen/gbp.zig | 68 +++------ codegen/normp.zig | 128 ---------------- src/CombiningClassData.zig | 48 ++++++ src/DisplayWidth.zig | 351 +++++++++++++++++++++++++++++++++++++++++++ src/DisplayWidthData.zig | 82 +++++++++++ src/GraphemeData.zig | 86 +++++++++++ src/Normalizer.zig | 97 ++++++------ src/display_width.zig | 360 --------------------------------------------- src/grapheme.zig | 73 ++++----- src/main.zig | 32 +++- 13 files changed, 881 insertions(+), 663 deletions(-) create mode 100644 codegen/ccc.zig delete mode 100644 codegen/normp.zig create mode 100644 src/CombiningClassData.zig create mode 100644 src/DisplayWidth.zig create mode 100644 src/DisplayWidthData.zig create mode 100644 src/GraphemeData.zig delete mode 100644 src/display_width.zig diff --git a/build.zig b/build.zig index def8b24..7cfb979 100644 --- a/build.zig +++ b/build.zig @@ -16,7 +16,7 @@ pub fn build(b: *std.Build) void { .optimize = .Debug, }); const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); - const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig"); + const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); // Display width const cjk = b.option(bool, "cjk", "Ambiguouse code points are wide (display width: 2).") orelse false; @@ -31,17 +31,17 @@ pub fn build(b: *std.Build) void { }); dwp_gen_exe.root_module.addOptions("options", options); const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); - const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig"); + const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); // Normalization properties - const normp_gen_exe = b.addExecutable(.{ - .name = "normp", - .root_source_file = .{ .path = "codegen/normp.zig" }, + const ccc_gen_exe = 
b.addExecutable(.{ + .name = "ccc", + .root_source_file = .{ .path = "codegen/ccc.zig" }, .target = b.host, .optimize = .Debug, }); - const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); - const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.zig"); + const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); + const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); // Modules we provide // Code points @@ -52,13 +52,20 @@ pub fn build(b: *std.Build) void { }); // Grapheme clusters + const grapheme_data = b.createModule(.{ + .root_source_file = .{ .path = "src/GraphemeData.zig" }, + .target = target, + .optimize = optimize, + }); + grapheme_data.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); + const grapheme = b.addModule("grapheme", .{ .root_source_file = .{ .path = "src/grapheme.zig" }, .target = target, .optimize = optimize, }); grapheme.addImport("code_point", code_point); - grapheme.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); + grapheme.addImport("GraphemeData", grapheme_data); // ASCII utilities const ascii = b.addModule("ascii", .{ @@ -68,17 +75,32 @@ pub fn build(b: *std.Build) void { }); // Fixed pitch font display width - const display_width = b.addModule("display_width", .{ - .root_source_file = .{ .path = "src/display_width.zig" }, + const dw_data = b.createModule(.{ + .root_source_file = .{ .path = "src/DisplayWidthData.zig" }, + .target = target, + .optimize = optimize, + }); + dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); + dw_data.addImport("GraphemeData", grapheme_data); + + const display_width = b.addModule("DisplayWidth", .{ + .root_source_file = .{ .path = "src/DisplayWidth.zig" }, .target = target, .optimize = optimize, }); display_width.addImport("ascii", ascii); display_width.addImport("code_point", code_point); display_width.addImport("grapheme", grapheme); - display_width.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); + 
display_width.addImport("DisplayWidthData", dw_data); // Normalization + const ccc_data = b.createModule(.{ + .root_source_file = .{ .path = "src/CombiningClassData.zig" }, + .target = target, + .optimize = optimize, + }); + ccc_data.addAnonymousImport("ccc", .{ .root_source_file = ccc_gen_out }); + const norm = b.addModule("Normalizer", .{ .root_source_file = .{ .path = "src/Normalizer.zig" }, .target = target, @@ -86,7 +108,7 @@ pub fn build(b: *std.Build) void { }); norm.addImport("code_point", code_point); norm.addImport("ziglyph", ziglyph.module("ziglyph")); - norm.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); + norm.addImport("CombiningClassData", ccc_data); // Benchmark rig const exe = b.addExecutable(.{ @@ -95,11 +117,11 @@ pub fn build(b: *std.Build) void { .target = target, .optimize = optimize, }); - exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); - exe.root_module.addImport("ascii", ascii); - exe.root_module.addImport("code_point", code_point); - exe.root_module.addImport("grapheme", grapheme); - exe.root_module.addImport("display_width", display_width); + // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); + // exe.root_module.addImport("ascii", ascii); + // exe.root_module.addImport("code_point", code_point); + // exe.root_module.addImport("grapheme", grapheme); + // exe.root_module.addImport("DisplayWidth", display_width); exe.root_module.addImport("Normalizer", norm); b.installArtifact(exe); @@ -112,17 +134,18 @@ pub fn build(b: *std.Build) void { // Tests const exe_unit_tests = b.addTest(.{ - .root_source_file = .{ .path = "src/Normalizer.zig" }, + .root_source_file = .{ .path = "src/DisplayWidth.zig" }, .target = target, .optimize = optimize, }); - // exe_unit_tests.root_module.addImport("ascii", ascii); + exe_unit_tests.root_module.addImport("ascii", ascii); exe_unit_tests.root_module.addImport("code_point", code_point); - // exe_unit_tests.root_module.addImport("grapheme", grapheme); - // 
exe_unit_tests.root_module.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); - // exe_unit_tests.root_module.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); - exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); - exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); + // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); + exe_unit_tests.root_module.addImport("grapheme", grapheme); + // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); + // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); + exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); + // exe_unit_tests.root_module.addImport("CombiningClassData", ccc_data); const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); diff --git a/codegen/ccc.zig b/codegen/ccc.zig new file mode 100644 index 0000000..93da6a0 --- /dev/null +++ b/codegen/ccc.zig @@ -0,0 +1,125 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +const block_size = 256; +const Block = [block_size]u8; + +const BlockMap = std.HashMap( + Block, + u16, + struct { + pub fn hash(_: @This(), k: Block) u64 { + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(_: @This(), a: Block, b: Block) bool { + return std.mem.eql(u8, &a, &b); + } + }, + std.hash_map.default_max_load_percentage, +); + +pub fn main() !void { + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena.deinit(); + const allocator = arena.allocator(); + + var flat_map = std.AutoHashMap(u21, u8).init(allocator); + defer flat_map.deinit(); + + var line_buf: [4096]u8 = undefined; + + // Process DerivedEastAsianWidth.txt + var cc_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedCombiningClass.txt", .{}); + defer cc_file.close(); + var cc_buf = 
std.io.bufferedReader(cc_file.reader()); + const cc_reader = cc_buf.reader(); + + while (try cc_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { + if (line.len == 0 or line[0] == '#') continue; + const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; + + var field_iter = std.mem.tokenizeAny(u8, no_comment, "; "); + var current_code: [2]u21 = undefined; + + var i: usize = 0; + while (field_iter.next()) |field| : (i += 1) { + switch (i) { + 0 => { + // Code point(s) + if (std.mem.indexOf(u8, field, "..")) |dots| { + current_code = .{ + try std.fmt.parseInt(u21, field[0..dots], 16), + try std.fmt.parseInt(u21, field[dots + 2 ..], 16), + }; + } else { + const code = try std.fmt.parseInt(u21, field, 16); + current_code = .{ code, code }; + } + }, + 1 => { + // Combining Class + if (std.mem.eql(u8, field, "0")) continue; + const cc = try std.fmt.parseInt(u8, field, 10); + for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), cc); + }, + else => {}, + } + } + } + + var blocks_map = BlockMap.init(allocator); + defer blocks_map.deinit(); + + var stage1 = std.ArrayList(u16).init(allocator); + defer stage1.deinit(); + + var stage2 = std.ArrayList(u8).init(allocator); + defer stage2.deinit(); + + var block: Block = [_]u8{0} ** block_size; + var block_len: u16 = 0; + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + const cc = flat_map.get(cp) orelse 0; + + // Process block + block[block_len] = cc; + block_len += 1; + + if (block_len < block_size and cp != 0x10ffff) continue; + + const gop = try blocks_map.getOrPut(block); + if (!gop.found_existing) { + gop.value_ptr.* = @intCast(stage2.items.len); + try stage2.appendSlice(&block); + } + + try stage1.append(gop.value_ptr.*); + block_len = 0; + } + + var args_iter = try std.process.argsWithAllocator(allocator); + defer args_iter.deinit(); + _ = args_iter.skip(); + const output_path = args_iter.next() orelse @panic("No output file arg!"); + + const 
compressor = std.compress.deflate.compressor; + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); + defer out_comp.deinit(); + const writer = out_comp.writer(); + + const endian = builtin.cpu.arch.endian(); + try writer.writeInt(u16, @intCast(stage1.items.len), endian); + for (stage1.items) |i| try writer.writeInt(u16, i, endian); + + try writer.writeInt(u16, @intCast(stage2.items.len), endian); + try writer.writeAll(stage2.items); + + try out_comp.flush(); +} diff --git a/codegen/dwp.zig b/codegen/dwp.zig index 9e387c6..76a14d3 100644 --- a/codegen/dwp.zig +++ b/codegen/dwp.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const builtin = @import("builtin"); const options = @import("options"); @@ -229,21 +230,19 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); + const compressor = std.compress.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_buf = std.io.bufferedWriter(out_file.writer()); - const writer = out_buf.writer(); + var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); + defer out_comp.deinit(); + const writer = out_comp.writer(); - try writer.writeAll("const std = @import(\"std\");\n"); + const endian = builtin.cpu.arch.endian(); + try writer.writeInt(u16, @intCast(stage1.items.len), endian); + for (stage1.items) |i| try writer.writeInt(u16, i, endian); - try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); - try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); - for (stage1.items) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); + try writer.writeInt(u16, @intCast(stage2.items.len), endian); + for (stage2.items) |i| try writer.writeInt(i8, i, endian); - try 
writer.print("pub const stage_2 = [{}]i3{{", .{stage2.items.len}); - for (stage2.items) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); - - try out_buf.flush(); + try out_comp.flush(); } diff --git a/codegen/gbp.zig b/codegen/gbp.zig index 3bd9a4d..39e0da3 100644 --- a/codegen/gbp.zig +++ b/codegen/gbp.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const builtin = @import("builtin"); const Indic = enum { none, @@ -226,56 +227,23 @@ pub fn main() !void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); + const compressor = std.compress.deflate.compressor; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var out_buf = std.io.bufferedWriter(out_file.writer()); - const writer = out_buf.writer(); - - const prop_code = - \\const std = @import("std"); - \\ - \\pub const Indic = enum { - \\ none, - \\ - \\ Consonant, - \\ Extend, - \\ Linker, - \\}; - \\ - \\pub const Gbp = enum { - \\ none, - \\ Control, - \\ CR, - \\ Extend, - \\ L, - \\ LF, - \\ LV, - \\ LVT, - \\ Prepend, - \\ Regional_Indicator, - \\ SpacingMark, - \\ T, - \\ V, - \\ ZWJ, - \\}; - \\ - ; - - try writer.writeAll(prop_code); - - try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); - try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); - for (stage1.items) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); - - try writer.print("const Stage3Int = std.math.IntFittingRange(0, {});\n", .{stage3_len}); - try writer.print("pub const stage_2 = [{}]Stage3Int{{", .{stage2.items.len}); - for (stage2.items) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); - - try writer.print("pub const stage_3 = [{}]u8{{", .{stage3_len}); - for (stage3.keys()) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); - - try out_buf.flush(); + var out_comp = try compressor(allocator, out_file.writer(), .{ .level = 
.best_compression }); + defer out_comp.deinit(); + const writer = out_comp.writer(); + + const endian = builtin.cpu.arch.endian(); + try writer.writeInt(u16, @intCast(stage1.items.len), endian); + for (stage1.items) |i| try writer.writeInt(u16, i, endian); + + try writer.writeInt(u16, @intCast(stage2.items.len), endian); + for (stage2.items) |i| try writer.writeInt(u16, i, endian); + + const props_bytes = stage3.keys(); + try writer.writeInt(u16, @intCast(props_bytes.len), endian); + try writer.writeAll(props_bytes); + + try out_comp.flush(); } diff --git a/codegen/normp.zig b/codegen/normp.zig deleted file mode 100644 index 25af65c..0000000 --- a/codegen/normp.zig +++ /dev/null @@ -1,128 +0,0 @@ -const std = @import("std"); - -const options = @import("options"); - -const block_size = 256; -const Block = [block_size]u8; - -const BlockMap = std.HashMap( - Block, - u16, - struct { - pub fn hash(_: @This(), k: Block) u64 { - var hasher = std.hash.Wyhash.init(0); - std.hash.autoHashStrat(&hasher, k, .DeepRecursive); - return hasher.final(); - } - - pub fn eql(_: @This(), a: Block, b: Block) bool { - return std.mem.eql(u8, &a, &b); - } - }, - std.hash_map.default_max_load_percentage, -); - -pub fn main() !void { - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const allocator = arena.allocator(); - - var flat_map = std.AutoHashMap(u21, u8).init(allocator); - defer flat_map.deinit(); - - var line_buf: [4096]u8 = undefined; - - // Process DerivedEastAsianWidth.txt - var cc_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedCombiningClass.txt", .{}); - defer cc_file.close(); - var cc_buf = std.io.bufferedReader(cc_file.reader()); - const cc_reader = cc_buf.reader(); - - while (try cc_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { - if (line.len == 0 or line[0] == '#') continue; - const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; - - var field_iter = 
std.mem.tokenizeAny(u8, no_comment, "; "); - var current_code: [2]u21 = undefined; - - var i: usize = 0; - while (field_iter.next()) |field| : (i += 1) { - switch (i) { - 0 => { - // Code point(s) - if (std.mem.indexOf(u8, field, "..")) |dots| { - current_code = .{ - try std.fmt.parseInt(u21, field[0..dots], 16), - try std.fmt.parseInt(u21, field[dots + 2 ..], 16), - }; - } else { - const code = try std.fmt.parseInt(u21, field, 16); - current_code = .{ code, code }; - } - }, - 1 => { - // Combining Class - if (std.mem.eql(u8, field, "0")) continue; - const cc = try std.fmt.parseInt(u8, field, 10); - for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), cc); - }, - else => {}, - } - } - } - - var blocks_map = BlockMap.init(allocator); - defer blocks_map.deinit(); - - var stage1 = std.ArrayList(u16).init(allocator); - defer stage1.deinit(); - - var stage2 = std.ArrayList(u8).init(allocator); - defer stage2.deinit(); - - var block: Block = [_]u8{0} ** block_size; - var block_len: u16 = 0; - - for (0..0x110000) |i| { - const cp: u21 = @intCast(i); - const cc = flat_map.get(cp) orelse 0; - - // Process block - block[block_len] = cc; - block_len += 1; - - if (block_len < block_size and cp != 0x10ffff) continue; - - const gop = try blocks_map.getOrPut(block); - if (!gop.found_existing) { - gop.value_ptr.* = @intCast(stage2.items.len); - try stage2.appendSlice(&block); - } - - try stage1.append(gop.value_ptr.*); - block_len = 0; - } - - var args_iter = try std.process.argsWithAllocator(allocator); - defer args_iter.deinit(); - _ = args_iter.skip(); - const output_path = args_iter.next() orelse @panic("No output file arg!"); - - var out_file = try std.fs.cwd().createFile(output_path, .{}); - defer out_file.close(); - var out_buf = std.io.bufferedWriter(out_file.writer()); - const writer = out_buf.writer(); - - try writer.writeAll("const std = @import(\"std\");\n"); - - try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", 
.{stage2.items.len}); - try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); - for (stage1.items) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); - - try writer.print("pub const stage_2 = [{}]u8{{", .{stage2.items.len}); - for (stage2.items) |v| try writer.print("{},", .{v}); - try writer.writeAll("};\n"); - - try out_buf.flush(); -} diff --git a/src/CombiningClassData.zig b/src/CombiningClassData.zig new file mode 100644 index 0000000..95c947d --- /dev/null +++ b/src/CombiningClassData.zig @@ -0,0 +1,48 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; + +allocator: mem.Allocator, +s1: []u16 = undefined, +s2: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.deflate.decompressor; + const in_bytes = @embedFile("ccc"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = try decompressor(allocator, in_fbs.reader(), null); + defer in_decomp.deinit(); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{ .allocator = allocator }; + + const stage_1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, stage_1_len); + for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(u8, stage_2_len); + _ = try reader.readAll(self.s2); + + return self; +} + +pub fn deinit(self: *Self) void { + self.allocator.free(self.s1); + self.allocator.free(self.s2); +} + +/// Returns the canonical combining class for a code point. +pub inline fn ccc(self: Self, cp: u21) u8 { + return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; +} + +/// True if `cp` is a starter code point, not a combining character. 
+pub inline fn isStarter(self: Self, cp: u21) bool { + return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0; +} diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig new file mode 100644 index 0000000..85d04a0 --- /dev/null +++ b/src/DisplayWidth.zig @@ -0,0 +1,351 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const ArrayList = std.ArrayList; +const mem = std.mem; +const simd = std.simd; +const testing = std.testing; + +const ascii = @import("ascii"); +const CodePointIterator = @import("code_point").Iterator; +const GraphemeIterator = @import("grapheme").Iterator; +pub const Data = @import("DisplayWidthData"); + +data: *Data, + +const Self = @This(); + +/// strWidth returns the total display width of `str` as the number of cells +/// required in a fixed-pitch font (i.e. a terminal screen). +pub fn strWidth(self: Self, str: []const u8) usize { + var total: isize = 0; + + // ASCII fast path + if (ascii.isAsciiOnly(str)) { + for (str) |b| total += self.data.codePointWidth(b); + return @intCast(@max(0, total)); + } + + var giter = GraphemeIterator.init(str, &self.data.g_data); + + while (giter.next()) |gc| { + var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; + var gc_total: isize = 0; + + while (cp_iter.next()) |cp| { + var w = self.data.codePointWidth(cp.code); + + if (w != 0) { + // Handle text emoji sequence. + if (cp_iter.next()) |ncp| { + // emoji text sequence. + if (ncp.code == 0xFE0E) w = 1; + } + + // Only adding width of first non-zero-width code point. 
+ if (gc_total == 0) { + gc_total = w; + break; + } + } + } + + total += gc_total; + } + + return @intCast(@max(0, total)); +} + +test "strWidth" { + var data = try Data.init(testing.allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n")); + try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}")); + try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)")); + try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸")); + try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji + try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence + try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence + try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace + try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL + try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than o + + // wcwidth Python lib tests. 
See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py + const empty = ""; + try testing.expectEqual(@as(usize, 0), self.strWidth(empty)); + const with_null = "hello\x00world"; + try testing.expectEqual(@as(usize, 10), self.strWidth(with_null)); + const hello_jp = "コンニチハ, セカイ!"; + try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp)); + const control = "\x1b[0m"; + try testing.expectEqual(@as(usize, 3), self.strWidth(control)); + const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; + try testing.expectEqual(@as(usize, 3), self.strWidth(balinese)); + + // These commented out tests require a new specification for complex scripts. + // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf + // const jamo = "\u{1100}\u{1160}"; + // try testing.expectEqual(@as(usize, 3), strWidth(jamo)); + // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}"; + // try testing.expectEqual(@as(usize, 3), strWidth(devengari)); + // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}"; + // try testing.expectEqual(@as(usize, 5), strWidth(tamal)); + // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}"; + // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); + // The following passes but as a mere coincidence. 
+ const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; + try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2)); + + // From Rust https://github.com/jameslanska/unicode-display-width + try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); + try testing.expectEqual(@as(usize, 2), self.strWidth("🦀")); + try testing.expectEqual(@as(usize, 2), self.strWidth("👨‍👩‍👧‍👧")); + try testing.expectEqual(@as(usize, 2), self.strWidth("👩‍🔬")); + try testing.expectEqual(@as(usize, 9), self.strWidth("sane text")); + try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); + try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나")); + try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}")); +} + +/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. +/// If the length of `str` and `total_width` have different parity, the right side of `str` will +/// receive one additional pad. This makes sure the returned string fills the requested width. +/// Caller must free returned bytes with `allocator`. 
+pub fn center( + self: Self, + allocator: mem.Allocator, + str: []const u8, + total_width: usize, + pad: []const u8, +) ![]u8 { + const str_width = self.strWidth(str); + if (str_width > total_width) return error.StrTooLong; + if (str_width == total_width) return try allocator.dupe(u8, str); + + const pad_width = self.strWidth(pad); + if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; + + const margin_width = @divFloor((total_width - str_width), 2); + if (pad_width > margin_width) return error.PadTooLong; + const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0; + const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad; + + var result = try allocator.alloc(u8, pads * pad.len + str.len); + var bytes_index: usize = 0; + var pads_index: usize = 0; + + while (pads_index < pads / 2) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + @memcpy(result[bytes_index..][0..str.len], str); + bytes_index += str.len; + + pads_index = 0; + while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + return result; +} + +test "center" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + // Input and width both have odd length + var centered = try self.center(allocator, "abc", 9, "*"); + try testing.expectEqualSlices(u8, "***abc***", centered); + + // Input and width both have even length + testing.allocator.free(centered); + centered = try self.center(allocator, "w😊w", 10, "-"); + try testing.expectEqualSlices(u8, "---w😊w---", centered); + + // Input has even length, width has odd length + testing.allocator.free(centered); + centered = try self.center(allocator, "1234", 9, "-"); + try testing.expectEqualSlices(u8, "--1234---", centered); + + // Input has odd length, width has even 
length + testing.allocator.free(centered); + centered = try self.center(allocator, "123", 8, "-"); + try testing.expectEqualSlices(u8, "--123---", centered); + + // Input is the same length as the width + testing.allocator.free(centered); + centered = try self.center(allocator, "123", 3, "-"); + try testing.expectEqualSlices(u8, "123", centered); + + // Input is empty + testing.allocator.free(centered); + centered = try self.center(allocator, "", 3, "-"); + try testing.expectEqualSlices(u8, "---", centered); + + // Input is empty and width is zero + testing.allocator.free(centered); + centered = try self.center(allocator, "", 0, "-"); + try testing.expectEqualSlices(u8, "", centered); + + // Input is longer than the width, which is an error + testing.allocator.free(centered); + try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-")); +} + +/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding +/// on the left side. Caller must free returned bytes with `allocator`. 
+pub fn padLeft( + self: Self, + allocator: mem.Allocator, + str: []const u8, + total_width: usize, + pad: []const u8, +) ![]u8 { + const str_width = self.strWidth(str); + if (str_width > total_width) return error.StrTooLong; + + const pad_width = self.strWidth(pad); + if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; + + const margin_width = total_width - str_width; + if (pad_width > margin_width) return error.PadTooLong; + + const pads = @divFloor(margin_width, pad_width); + + var result = try allocator.alloc(u8, pads * pad.len + str.len); + var bytes_index: usize = 0; + var pads_index: usize = 0; + + while (pads_index < pads) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + @memcpy(result[bytes_index..][0..str.len], str); + + return result; +} + +test "padLeft" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + var right_aligned = try self.padLeft(allocator, "abc", 9, "*"); + defer testing.allocator.free(right_aligned); + try testing.expectEqualSlices(u8, "******abc", right_aligned); + + testing.allocator.free(right_aligned); + right_aligned = try self.padLeft(allocator, "w😊w", 10, "-"); + try testing.expectEqualSlices(u8, "------w😊w", right_aligned); +} + +/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding +/// on the right side. Caller must free returned bytes with `allocator`. 
+pub fn padRight( + self: Self, + allocator: mem.Allocator, + str: []const u8, + total_width: usize, + pad: []const u8, +) ![]u8 { + const str_width = self.strWidth(str); + if (str_width > total_width) return error.StrTooLong; + + const pad_width = self.strWidth(pad); + if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; + + const margin_width = total_width - str_width; + if (pad_width > margin_width) return error.PadTooLong; + + const pads = @divFloor(margin_width, pad_width); + + var result = try allocator.alloc(u8, pads * pad.len + str.len); + var bytes_index: usize = 0; + var pads_index: usize = 0; + + @memcpy(result[bytes_index..][0..str.len], str); + bytes_index += str.len; + + while (pads_index < pads) : (pads_index += 1) { + @memcpy(result[bytes_index..][0..pad.len], pad); + bytes_index += pad.len; + } + + return result; +} + +test "padRight" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + var left_aligned = try self.padRight(allocator, "abc", 9, "*"); + defer testing.allocator.free(left_aligned); + try testing.expectEqualSlices(u8, "abc******", left_aligned); + + testing.allocator.free(left_aligned); + left_aligned = try self.padRight(allocator, "w😊w", 10, "-"); + try testing.expectEqualSlices(u8, "w😊w------", left_aligned); +} + +/// Wraps a string approximately at the given number of colums per line. +/// `threshold` defines how far the last column of the last word can be +/// from the edge. Caller must free returned bytes with `allocator`. 
+pub fn wrap( + self: Self, + allocator: mem.Allocator, + str: []const u8, + columns: usize, + threshold: usize, +) ![]u8 { + var result = ArrayList(u8).init(allocator); + defer result.deinit(); + + var line_iter = mem.tokenizeAny(u8, str, "\r\n"); + var line_width: usize = 0; + + while (line_iter.next()) |line| { + var word_iter = mem.tokenizeScalar(u8, line, ' '); + + while (word_iter.next()) |word| { + try result.appendSlice(word); + try result.append(' '); + line_width += self.strWidth(word) + 1; + + if (line_width > columns or columns - line_width <= threshold) { + try result.append('\n'); + line_width = 0; + } + } + } + + // Remove trailing space and newline. + _ = result.pop(); + _ = result.pop(); + + return try result.toOwnedSlice(); +} + +test "wrap" { + const allocator = testing.allocator; + var data = try Data.init(allocator); + defer data.deinit(); + const self = Self{ .data = &data }; + + const input = "The quick brown fox\r\njumped over the lazy dog!"; + const got = try self.wrap(allocator, input, 10, 3); + defer testing.allocator.free(got); + const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; + try testing.expectEqualStrings(want, got); +} diff --git a/src/DisplayWidthData.zig b/src/DisplayWidthData.zig new file mode 100644 index 0000000..32f8658 --- /dev/null +++ b/src/DisplayWidthData.zig @@ -0,0 +1,82 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; + +const GraphemeData = @import("GraphemeData"); + +allocator: mem.Allocator, +g_data: GraphemeData, +s1: []u16 = undefined, +s2: []i3 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.deflate.decompressor; + const in_bytes = @embedFile("dwp"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = try decompressor(allocator, in_fbs.reader(), null); + defer in_decomp.deinit(); + var reader = 
in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{ + .allocator = allocator, + .g_data = try GraphemeData.init(allocator), + }; + + const stage_1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, stage_1_len); + for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const stage_2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(i3, stage_2_len); + for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian)); + + return self; +} + +pub fn deinit(self: *Self) void { + self.allocator.free(self.s1); + self.allocator.free(self.s2); + self.g_data.deinit(); +} + +/// codePointWidth returns the number of cells `cp` requires when rendered +/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to +/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 +/// control codes return 0. If `cjk` is true, ambiguous code points return 2, +/// otherwise they return 1. 
pub inline fn codePointWidth(self: Self, cp: u21) i3 {
    // Two-stage table lookup: the high bits of `cp` select an entry in `s1`,
    // which is added to the low 8 bits of `cp` to index into `s2`.
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
}

test "codePointWidth" {
    // The lookup is a method on the loaded data, so the tables must be
    // initialized before any width can be queried.
    var self = try init(testing.allocator);
    defer self.deinit();

    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0000)); // null
    try testing.expectEqual(@as(i3, -1), self.codePointWidth(0x8)); // \b
    try testing.expectEqual(@as(i3, -1), self.codePointWidth(0x7f)); // DEL
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0005)); // Cf
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0007)); // \a BEL
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000A)); // \n LF
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000B)); // \v VT
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000C)); // \f FF
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000D)); // \r CR
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000E)); // SQ
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000F)); // SI

    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x070F)); // Cf
    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x0603)); // Cf Arabic

    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x00AD)); // soft-hyphen
    try testing.expectEqual(@as(i3, 2), self.codePointWidth(0x2E3A)); // two-em dash
    try testing.expectEqual(@as(i3, 3), self.codePointWidth(0x2E3B)); // three-em dash

    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x00BD)); // ambiguous halfwidth

    try testing.expectEqual(@as(i3, 1), self.codePointWidth('é'));
    try testing.expectEqual(@as(i3, 2), self.codePointWidth('😊'));
    try testing.expectEqual(@as(i3, 2), self.codePointWidth('统'));
}
pub const Gbp = enum {
    none,
    Control,
    CR,
    Extend,
    L,
    LF,
    LV,
    LVT,
    Prepend,
    Regional_Indicator,
    SpacingMark,
    T,
    V,
    ZWJ,
};

allocator: mem.Allocator,
s1: []u16 = undefined,
s2: []u16 = undefined,
s3: []u8 = undefined,

const Self = @This();

/// Decompresses the embedded "gbp" data and loads the three lookup stages.
/// Each stage is stored as a `u16` length followed by its elements.
/// On failure every stage allocated so far is released before the error is
/// returned. Call `deinit` to free the tables.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("gbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (self.s1) |*slot| slot.* = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u16, s2_len);
    errdefer allocator.free(self.s2);
    for (self.s2) |*slot| slot.* = try reader.readInt(u16, endian);

    const s3_len: u16 = try reader.readInt(u16, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    // Fail on a truncated stream instead of leaving part of `s3` uninitialized.
    try reader.readNoEof(self.s3);

    return self;
}

/// Frees the lookup tables allocated by `init`.
pub fn deinit(self: *Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
    self.allocator.free(self.s3);
}

/// Lookup the grapheme break property for a code point.
/// The property is stored in the high 4 bits of the stage-3 byte.
pub inline fn gbp(self: Self, cp: u21) Gbp {
    return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4);
}

/// Lookup the indic syllable type for a code point.
/// The type is stored in bits 1-3 of the stage-3 byte.
pub inline fn indic(self: Self, cp: u21) Indic {
    return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7);
}

/// Returns true if the emoji flag (bit 0 of the stage-3 byte) is set for
/// the code point.
pub inline fn isEmoji(self: Self, cp: u21) bool {
    return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
}
-fn cccLess(_: void, lhs: u21, rhs: u21) bool { - const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; - const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)]; - return lcc < rcc; +fn cccLess(self: Self, lhs: u21, rhs: u21) bool { + return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); } // Applies the Canonical Sorting Algorithm. -fn canonicalSort(cps: []u21) void { +fn canonicalSort(self: Self, cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; - while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} - std.mem.sort(u21, cps[start..i], {}, cccLess); + while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} + std.mem.sort(u21, cps[start..i], self, cccLess); } } @@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! try dcp_list.appendSlice(slice); } - canonicalSort(dcp_list.items); + self.canonicalSort(dcp_list.items); var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); defer dstr_list.deinit(); @@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! test "nfd ASCII / no-alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfd(allocator, "Hello World!"); @@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfd(allocator, "Héllo World! 
\u{3d3}"); @@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfkd(allocator, "Hello World!"); @@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); @@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool { return cp >= 0x1100 and hangul_map.syllableType(cp) != null; } -fn isStarter(cp: u21) bool { - return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; -} - -fn isCombining(cp: u21) bool { - return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0; -} - -fn isNonHangulStarter(cp: u21) bool { - return !isHangul(cp) and isStarter(cp); +fn isNonHangulStarter(self: Self, cp: u21) bool { + return !isHangul(cp) and self.ccc_data.isStarter(cp); } /// Normalizes `str` to NFC. @@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! block_check: while (i < d_list.items.len) : (i += 1) { const C = d_list.items[i]; - const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; + const cc_C = self.ccc_data.ccc(C); var starter_index: ?usize = null; var j: usize = i; @@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! j -= 1; // Check for starter. - if (isStarter(d_list.items[j])) { + if (self.ccc_data.isStarter(d_list.items[j])) { if (i - j > 1) { // If there's distance between the starting point and the current position. for (d_list.items[(j + 1)..i]) |B| { + const cc_B = self.ccc_data.ccc(B); // Check for blocking conditions. 
if (isHangul(C)) { - if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; + if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; } - const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)]; if (cc_B >= cc_C) continue :block_check; } } @@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! test "nfc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); @@ -571,7 +577,9 @@ test "nfc" { test "nfkc" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); @@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u test "eql" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); @@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ test "eqlCaseless" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); @@ -707,7 +719,7 @@ test "eqlCaseless" { // FCD fn getLeadCcc(self: Self, cp: u21) u8 { const dc = self.mapping(cp, .nfd); - return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; + return self.ccc_data.ccc(dc.cps[0]); } fn getTrailCcc(self: Self, cp: u21) u8 { 
@@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 { const len = for (dc.cps, 0..) |dcp, i| { if (dcp == 0) break i; } else dc.cps.len; - const tcp = dc.cps[len -| 1]; - return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)]; + return self.ccc_data.ccc(dc.cps[len - 1]); } /// Fast check to detect if a string is already in NFC or NFD form. @@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool { test "isFcd" { const allocator = std.testing.allocator; - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); const is_nfc = "José \u{3D3}"; @@ -751,7 +764,9 @@ test "Unicode normalization tests" { defer arena.deinit(); var allocator = arena.allocator(); - var n = try init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + var n = try init(allocator, &data); defer n.deinit(); var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); diff --git a/src/display_width.zig b/src/display_width.zig deleted file mode 100644 index a916cac..0000000 --- a/src/display_width.zig +++ /dev/null @@ -1,360 +0,0 @@ -const std = @import("std"); -const simd = std.simd; -const mem = std.mem; -const testing = std.testing; - -const ascii = @import("ascii"); -const CodePointIterator = @import("code_point").Iterator; -const dwp = @import("dwp"); -const GraphemeIterator = @import("grapheme").Iterator; - -/// codePointWidth returns the number of cells `cp` requires when rendered -/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to -/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 -/// control codes return 0. If `cjk` is true, ambiguous code points return 2, -/// otherwise they return 1. 
-pub fn codePointWidth(cp: u21) i3 { - return dwp.stage_2[dwp.stage_1[cp >> 8] + (cp & 0xff)]; -} - -test "codePointWidth" { - try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null - try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b - try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL - try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf - try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ - try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI - - try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf - try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic - - try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen - try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash - try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash - - try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth - - try testing.expectEqual(@as(i3, 1), codePointWidth('é')); - try testing.expectEqual(@as(i3, 2), codePointWidth('😊')); - try testing.expectEqual(@as(i3, 2), codePointWidth('统')); -} - -/// strWidth returns the total display width of `str` as the number of cells -/// required in a fixed-pitch font (i.e. a terminal screen). 
-pub fn strWidth(str: []const u8) usize { - var total: isize = 0; - - // ASCII fast path - if (ascii.isAsciiOnly(str)) { - for (str) |b| total += codePointWidth(b); - return @intCast(@max(0, total)); - } - - var giter = GraphemeIterator.init(str); - - while (giter.next()) |gc| { - var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; - var gc_total: isize = 0; - - while (cp_iter.next()) |cp| { - var w = codePointWidth(cp.code); - - if (w != 0) { - // Handle text emoji sequence. - if (cp_iter.next()) |ncp| { - // emoji text sequence. - if (ncp.code == 0xFE0E) w = 1; - } - - // Only adding width of first non-zero-width code point. - if (gc_total == 0) { - gc_total = w; - break; - } - } - } - - total += gc_total; - } - - return @intCast(@max(0, total)); -} - -test "strWidth" { - try testing.expectEqual(@as(usize, 5), strWidth("Hello\r\n")); - try testing.expectEqual(@as(usize, 1), strWidth("\u{0065}\u{0301}")); - try testing.expectEqual(@as(usize, 2), strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); - try testing.expectEqual(@as(usize, 8), strWidth("Hello 😊")); - try testing.expectEqual(@as(usize, 8), strWidth("Héllo 😊")); - try testing.expectEqual(@as(usize, 8), strWidth("Héllo :)")); - try testing.expectEqual(@as(usize, 8), strWidth("Héllo 🇪🇸")); - try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}")); // Lone emoji - try testing.expectEqual(@as(usize, 1), strWidth("\u{26A1}\u{FE0E}")); // Text sequence - try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence - try testing.expectEqual(@as(usize, 0), strWidth("A\x08")); // Backspace - try testing.expectEqual(@as(usize, 0), strWidth("\x7FA")); // DEL - try testing.expectEqual(@as(usize, 0), strWidth("\x7FA\x08\x08")); // never less than o - - // wcwidth Python lib tests. 
See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py - const empty = ""; - try testing.expectEqual(@as(usize, 0), strWidth(empty)); - const with_null = "hello\x00world"; - try testing.expectEqual(@as(usize, 10), strWidth(with_null)); - const hello_jp = "コンニチハ, セカイ!"; - try testing.expectEqual(@as(usize, 19), strWidth(hello_jp)); - const control = "\x1b[0m"; - try testing.expectEqual(@as(usize, 3), strWidth(control)); - const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; - try testing.expectEqual(@as(usize, 3), strWidth(balinese)); - - // These commented out tests require a new specification for complex scripts. - // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf - // const jamo = "\u{1100}\u{1160}"; - // try testing.expectEqual(@as(usize, 3), strWidth(jamo)); - // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}"; - // try testing.expectEqual(@as(usize, 3), strWidth(devengari)); - // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}"; - // try testing.expectEqual(@as(usize, 5), strWidth(tamal)); - // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}"; - // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); - // The following passes but as a mere coincidence. 
- const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; - try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); - - // From Rust https://github.com/jameslanska/unicode-display-width - try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); - try testing.expectEqual(@as(usize, 2), strWidth("🦀")); - try testing.expectEqual(@as(usize, 2), strWidth("👨‍👩‍👧‍👧")); - try testing.expectEqual(@as(usize, 2), strWidth("👩‍🔬")); - try testing.expectEqual(@as(usize, 9), strWidth("sane text")); - try testing.expectEqual(@as(usize, 9), strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); - try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나")); - try testing.expectEqual(@as(usize, 1), strWidth("\u{378}")); -} - -/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. -/// If the length of `str` and `total_width` have different parity, the right side of `str` will -/// receive one additional pad. This makes sure the returned string fills the requested width. -/// Caller must free returned bytes with `allocator`. 
-pub fn center( - allocator: mem.Allocator, - str: []const u8, - total_width: usize, - pad: []const u8, -) ![]u8 { - const str_width = strWidth(str); - if (str_width > total_width) return error.StrTooLong; - if (str_width == total_width) return try allocator.dupe(u8, str); - - const pad_width = strWidth(pad); - if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; - - const margin_width = @divFloor((total_width - str_width), 2); - if (pad_width > margin_width) return error.PadTooLong; - const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0; - const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad; - - var result = try allocator.alloc(u8, pads * pad.len + str.len); - var bytes_index: usize = 0; - var pads_index: usize = 0; - - while (pads_index < pads / 2) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - @memcpy(result[bytes_index..][0..str.len], str); - bytes_index += str.len; - - pads_index = 0; - while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - return result; -} - -test "center" { - var allocator = std.testing.allocator; - - // Input and width both have odd length - var centered = try center(allocator, "abc", 9, "*"); - try testing.expectEqualSlices(u8, "***abc***", centered); - - // Input and width both have even length - allocator.free(centered); - centered = try center(allocator, "w😊w", 10, "-"); - try testing.expectEqualSlices(u8, "---w😊w---", centered); - - // Input has even length, width has odd length - allocator.free(centered); - centered = try center(allocator, "1234", 9, "-"); - try testing.expectEqualSlices(u8, "--1234---", centered); - - // Input has odd length, width has even length - allocator.free(centered); - centered = try center(allocator, "123", 8, "-"); - try testing.expectEqualSlices(u8, "--123---", centered); - - // Input 
is the same length as the width - allocator.free(centered); - centered = try center(allocator, "123", 3, "-"); - try testing.expectEqualSlices(u8, "123", centered); - - // Input is empty - allocator.free(centered); - centered = try center(allocator, "", 3, "-"); - try testing.expectEqualSlices(u8, "---", centered); - - // Input is empty and width is zero - allocator.free(centered); - centered = try center(allocator, "", 0, "-"); - try testing.expectEqualSlices(u8, "", centered); - - // Input is longer than the width, which is an error - allocator.free(centered); - try testing.expectError(error.StrTooLong, center(allocator, "123", 2, "-")); -} - -/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding -/// on the left side. Caller must free returned bytes with `allocator`. -pub fn padLeft( - allocator: std.mem.Allocator, - str: []const u8, - total_width: usize, - pad: []const u8, -) ![]u8 { - const str_width = strWidth(str); - if (str_width > total_width) return error.StrTooLong; - - const pad_width = strWidth(pad); - if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; - - const margin_width = total_width - str_width; - if (pad_width > margin_width) return error.PadTooLong; - - const pads = @divFloor(margin_width, pad_width); - - var result = try allocator.alloc(u8, pads * pad.len + str.len); - var bytes_index: usize = 0; - var pads_index: usize = 0; - - while (pads_index < pads) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - @memcpy(result[bytes_index..][0..str.len], str); - - return result; -} - -test "padLeft" { - var allocator = std.testing.allocator; - - var right_aligned = try padLeft(allocator, "abc", 9, "*"); - defer allocator.free(right_aligned); - try testing.expectEqualSlices(u8, "******abc", right_aligned); - - allocator.free(right_aligned); - right_aligned = try padLeft(allocator, "w😊w", 10, "-"); - try 
testing.expectEqualSlices(u8, "------w😊w", right_aligned); -} - -/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding -/// on the right side. Caller must free returned bytes with `allocator`. -pub fn padRight( - allocator: std.mem.Allocator, - str: []const u8, - total_width: usize, - pad: []const u8, -) ![]u8 { - const str_width = strWidth(str); - if (str_width > total_width) return error.StrTooLong; - - const pad_width = strWidth(pad); - if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; - - const margin_width = total_width - str_width; - if (pad_width > margin_width) return error.PadTooLong; - - const pads = @divFloor(margin_width, pad_width); - - var result = try allocator.alloc(u8, pads * pad.len + str.len); - var bytes_index: usize = 0; - var pads_index: usize = 0; - - @memcpy(result[bytes_index..][0..str.len], str); - bytes_index += str.len; - - while (pads_index < pads) : (pads_index += 1) { - @memcpy(result[bytes_index..][0..pad.len], pad); - bytes_index += pad.len; - } - - return result; -} - -test "padRight" { - var allocator = std.testing.allocator; - - var left_aligned = try padRight(allocator, "abc", 9, "*"); - defer allocator.free(left_aligned); - try testing.expectEqualSlices(u8, "abc******", left_aligned); - - allocator.free(left_aligned); - left_aligned = try padRight(allocator, "w😊w", 10, "-"); - try testing.expectEqualSlices(u8, "w😊w------", left_aligned); -} - -/// Wraps a string approximately at the given number of colums per line. -/// `threshold` defines how far the last column of the last word can be -/// from the edge. Caller must free returned bytes with `allocator`. 
-pub fn wrap( - allocator: std.mem.Allocator, - str: []const u8, - columns: usize, - threshold: usize, -) ![]u8 { - var result = std.ArrayList(u8).init(allocator); - defer result.deinit(); - - var line_iter = mem.tokenizeAny(u8, str, "\r\n"); - var line_width: usize = 0; - - while (line_iter.next()) |line| { - var word_iter = mem.tokenizeScalar(u8, line, ' '); - - while (word_iter.next()) |word| { - try result.appendSlice(word); - try result.append(' '); - line_width += strWidth(word) + 1; - - if (line_width > columns or columns - line_width <= threshold) { - try result.append('\n'); - line_width = 0; - } - } - } - - // Remove trailing space and newline. - _ = result.pop(); - _ = result.pop(); - - return try result.toOwnedSlice(); -} - -test "wrap" { - var allocator = std.testing.allocator; - const input = "The quick brown fox\r\njumped over the lazy dog!"; - const got = try wrap(allocator, input, 10, 3); - defer allocator.free(got); - const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; - try testing.expectEqualStrings(want, got); -} diff --git a/src/grapheme.zig b/src/grapheme.zig index 3fdf10b..7125b5b 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig @@ -1,9 +1,10 @@ const std = @import("std"); +const mem = std.mem; const unicode = std.unicode; const CodePoint = @import("code_point").CodePoint; const CodePointIterator = @import("code_point").Iterator; -const gbp = @import("gbp"); +pub const Data = @import("GraphemeData"); /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { @@ -21,12 +22,13 @@ pub const Grapheme = struct { pub const Iterator = struct { buf: [2]?CodePoint = .{ null, null }, cp_iter: CodePointIterator, + data: *Data, const Self = @This(); /// Assumes `src` is valid UTF-8. 
- pub fn init(str: []const u8) Self { - var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; + pub fn init(str: []const u8, data: *Data) Self { + var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; self.advance(); return self; } @@ -55,6 +57,7 @@ pub const Iterator = struct { if (graphemeBreak( self.buf[0].?.code, self.buf[1].?.code, + self.data, &state, )) return Grapheme{ .len = gc_len, .offset = gc_start }; @@ -67,6 +70,7 @@ pub const Iterator = struct { if (graphemeBreak( self.buf[0].?.code, if (self.buf[1]) |ncp| ncp.code else 0, + self.data, &state, )) break; } @@ -76,18 +80,12 @@ pub const Iterator = struct { }; // Predicates -fn isBreaker(cp: u21) bool { +fn isBreaker(cp: u21, data: *Data) bool { // Extract relevant properties. - const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; - const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4); + const cp_gbp_prop = data.gbp(cp); return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; } -fn isIgnorable(cp: u21) bool { - const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; - return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; -} - // Grapheme break state. const State = struct { bits: u3 = 0, @@ -135,18 +133,17 @@ const State = struct { pub fn graphemeBreak( cp1: u21, cp2: u21, + data: *Data, state: *State, ) bool { // Extract relevant properties. 
- const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; - const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); - const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); - const cp1_is_emoji = cp1_props_byte & 1 == 1; + const cp1_gbp_prop = data.gbp(cp1); + const cp1_indic_prop = data.indic(cp1); + const cp1_is_emoji = data.isEmoji(cp1); - const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; - const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); - const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); - const cp2_is_emoji = cp2_props_byte & 1 == 1; + const cp2_gbp_prop = data.gbp(cp2); + const cp2_indic_prop = data.indic(cp2); + const cp2_is_emoji = data.isEmoji(cp2); // GB11: Emoji Extend* ZWJ x Emoji if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); @@ -157,7 +154,7 @@ pub fn graphemeBreak( if (cp1 == '\r' and cp2 == '\n') return false; // GB4: Control - if (isBreaker(cp1)) return true; + if (isBreaker(cp1, data)) return true; // GB11: Emoji Extend* ZWJ x Emoji if (state.hasXpic() and @@ -175,7 +172,7 @@ pub fn graphemeBreak( if (cp2_gbp_prop == .SpacingMark) return false; // GB9b: Prepend x - if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; + if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; // GB12, GB13: RI x RI if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { @@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" { var buf_reader = std.io.bufferedReader(file.reader()); var input_stream = buf_reader.reader(); + var data = try Data.init(allocator); + defer data.deinit(); + var buf: [4096]u8 = undefined; var line_no: usize = 1; @@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" { } // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); - var iter = Iterator.init(all_bytes.items); + var iter = Iterator.init(all_bytes.items, &data); // 
Chaeck. for (want.items) |want_gc| { @@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" { } } -test "Segmentation comptime GraphemeIterator" { - const want = [_][]const u8{ "H", "é", "l", "l", "o" }; - - comptime { - const src = "Héllo"; - var ct_iter = Iterator.init(src); - var i = 0; - while (ct_iter.next()) |grapheme| : (i += 1) { - try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); - } - } -} - test "Segmentation ZWJ and ZWSP emoji sequences" { const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; @@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; const no_joiner = seq_1 ++ seq_2; - var ct_iter = Iterator.init(with_zwj); + var data = try Data.init(std.testing.allocator); + defer data.deinit(); + + var iter = Iterator.init(with_zwj, &data); + var i: usize = 0; - while (ct_iter.next()) |_| : (i += 1) {} + while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 1), i); - ct_iter = Iterator.init(with_zwsp); + iter = Iterator.init(with_zwsp, &data); i = 0; - while (ct_iter.next()) |_| : (i += 1) {} + while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 3), i); - ct_iter = Iterator.init(no_joiner); + iter = Iterator.init(no_joiner, &data); i = 0; - while (ct_iter.next()) |_| : (i += 1) {} + while (iter.next()) |_| : (i += 1) {} try std.testing.expectEqual(@as(usize, 2), i); } diff --git a/src/main.zig b/src/main.zig index 946ae01..57db05b 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,29 +1,47 @@ const std = @import("std"); // const GraphemeIterator = @import("ziglyph").GraphemeIterator; -// const GraphemeIterator = @import("Grapheme").GraphemeIterator; +// const Data = @import("grapheme").Data; +// const GraphemeIterator = @import("grapheme").Iterator; + // const codePointWidth = @import("ziglyph").display_width.codePointWidth; -// const codePointWidth = 
@import("display_width").codePointWidth; // const strWidth = @import("ziglyph").display_width.strWidth; +// const Data = @import("display_width").Data; +// const codePointWidth = @import("display_width").codePointWidth; // const strWidth = @import("display_width").strWidth; -// const CodePointIterator = @import("CodePoint").CodePointIterator; + +// const CodePointIterator = @import("ziglyph").CodePointIterator; +// const CodePointIterator = @import("code_point").Iterator; + // const ascii = @import("ascii"); // const ascii = std.ascii; + // const norm = @import("ziglyph").Normalizer; +const Data = @import("Normalizer").Data; const norm = @import("Normalizer"); pub fn main() !void { + var args_iter = std.process.args(); + _ = args_iter.skip(); + const in_path = args_iter.next() orelse return error.MissingArg; + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); const allocator = gpa.allocator(); - const input = try std.fs.cwd().readFileAlloc(allocator, "data/lang_mix.txt", std.math.maxInt(u32)); + const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); defer allocator.free(input); - var n = try norm.init(allocator); + var data = try Data.init(allocator); + defer data.deinit(); + + var n = try norm.init(allocator, &data); defer n.deinit(); + // var n = try norm.init(allocator); + // defer n.deinit(); - // var iter = GraphemeIterator.init(input); + // var iter = GraphemeIterator.init(input, &data); + // defer iter.deinit(); // var iter = CodePointIterator{ .bytes = input }; var iter = std.mem.splitScalar(u8, input, '\n'); @@ -33,7 +51,7 @@ pub fn main() !void { // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); // while (iter.next()) |_| result += 1; - // while (iter.next()) |line| result += strWidth(line); + // while (iter.next()) |line| result += strWidth(line, &data); while (iter.next()) |line| { var nfc = try n.nfc(allocator, line); result += nfc.slice.len; -- cgit v1.2.3