diff options
| -rw-r--r-- | build.zig | 71 | ||||
| -rw-r--r-- | codegen/ccc.zig (renamed from codegen/normp.zig) | 25 | ||||
| -rw-r--r-- | codegen/dwp.zig | 23 | ||||
| -rw-r--r-- | codegen/gbp.zig | 68 | ||||
| -rw-r--r-- | src/CombiningClassData.zig | 48 | ||||
| -rw-r--r-- | src/DisplayWidth.zig (renamed from src/display_width.zig) | 205 | ||||
| -rw-r--r-- | src/DisplayWidthData.zig | 82 | ||||
| -rw-r--r-- | src/GraphemeData.zig | 86 | ||||
| -rw-r--r-- | src/Normalizer.zig | 97 | ||||
| -rw-r--r-- | src/grapheme.zig | 73 | ||||
| -rw-r--r-- | src/main.zig | 32 |
11 files changed, 514 insertions, 296 deletions
| @@ -16,7 +16,7 @@ pub fn build(b: *std.Build) void { | |||
| 16 | .optimize = .Debug, | 16 | .optimize = .Debug, |
| 17 | }); | 17 | }); |
| 18 | const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); | 18 | const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); |
| 19 | const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig"); | 19 | const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z"); |
| 20 | 20 | ||
| 21 | // Display width | 21 | // Display width |
| 22 | const cjk = b.option(bool, "cjk", "Ambiguouse code points are wide (display width: 2).") orelse false; | 22 | const cjk = b.option(bool, "cjk", "Ambiguouse code points are wide (display width: 2).") orelse false; |
| @@ -31,17 +31,17 @@ pub fn build(b: *std.Build) void { | |||
| 31 | }); | 31 | }); |
| 32 | dwp_gen_exe.root_module.addOptions("options", options); | 32 | dwp_gen_exe.root_module.addOptions("options", options); |
| 33 | const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); | 33 | const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); |
| 34 | const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig"); | 34 | const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); |
| 35 | 35 | ||
| 36 | // Normalization properties | 36 | // Normalization properties |
| 37 | const normp_gen_exe = b.addExecutable(.{ | 37 | const ccc_gen_exe = b.addExecutable(.{ |
| 38 | .name = "normp", | 38 | .name = "ccc", |
| 39 | .root_source_file = .{ .path = "codegen/normp.zig" }, | 39 | .root_source_file = .{ .path = "codegen/ccc.zig" }, |
| 40 | .target = b.host, | 40 | .target = b.host, |
| 41 | .optimize = .Debug, | 41 | .optimize = .Debug, |
| 42 | }); | 42 | }); |
| 43 | const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); | 43 | const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); |
| 44 | const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.zig"); | 44 | const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); |
| 45 | 45 | ||
| 46 | // Modules we provide | 46 | // Modules we provide |
| 47 | // Code points | 47 | // Code points |
| @@ -52,13 +52,20 @@ pub fn build(b: *std.Build) void { | |||
| 52 | }); | 52 | }); |
| 53 | 53 | ||
| 54 | // Grapheme clusters | 54 | // Grapheme clusters |
| 55 | const grapheme_data = b.createModule(.{ | ||
| 56 | .root_source_file = .{ .path = "src/GraphemeData.zig" }, | ||
| 57 | .target = target, | ||
| 58 | .optimize = optimize, | ||
| 59 | }); | ||
| 60 | grapheme_data.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); | ||
| 61 | |||
| 55 | const grapheme = b.addModule("grapheme", .{ | 62 | const grapheme = b.addModule("grapheme", .{ |
| 56 | .root_source_file = .{ .path = "src/grapheme.zig" }, | 63 | .root_source_file = .{ .path = "src/grapheme.zig" }, |
| 57 | .target = target, | 64 | .target = target, |
| 58 | .optimize = optimize, | 65 | .optimize = optimize, |
| 59 | }); | 66 | }); |
| 60 | grapheme.addImport("code_point", code_point); | 67 | grapheme.addImport("code_point", code_point); |
| 61 | grapheme.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); | 68 | grapheme.addImport("GraphemeData", grapheme_data); |
| 62 | 69 | ||
| 63 | // ASCII utilities | 70 | // ASCII utilities |
| 64 | const ascii = b.addModule("ascii", .{ | 71 | const ascii = b.addModule("ascii", .{ |
| @@ -68,17 +75,32 @@ pub fn build(b: *std.Build) void { | |||
| 68 | }); | 75 | }); |
| 69 | 76 | ||
| 70 | // Fixed pitch font display width | 77 | // Fixed pitch font display width |
| 71 | const display_width = b.addModule("display_width", .{ | 78 | const dw_data = b.createModule(.{ |
| 72 | .root_source_file = .{ .path = "src/display_width.zig" }, | 79 | .root_source_file = .{ .path = "src/DisplayWidthData.zig" }, |
| 80 | .target = target, | ||
| 81 | .optimize = optimize, | ||
| 82 | }); | ||
| 83 | dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); | ||
| 84 | dw_data.addImport("GraphemeData", grapheme_data); | ||
| 85 | |||
| 86 | const display_width = b.addModule("DisplayWidth", .{ | ||
| 87 | .root_source_file = .{ .path = "src/DisplayWidth.zig" }, | ||
| 73 | .target = target, | 88 | .target = target, |
| 74 | .optimize = optimize, | 89 | .optimize = optimize, |
| 75 | }); | 90 | }); |
| 76 | display_width.addImport("ascii", ascii); | 91 | display_width.addImport("ascii", ascii); |
| 77 | display_width.addImport("code_point", code_point); | 92 | display_width.addImport("code_point", code_point); |
| 78 | display_width.addImport("grapheme", grapheme); | 93 | display_width.addImport("grapheme", grapheme); |
| 79 | display_width.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); | 94 | display_width.addImport("DisplayWidthData", dw_data); |
| 80 | 95 | ||
| 81 | // Normalization | 96 | // Normalization |
| 97 | const ccc_data = b.createModule(.{ | ||
| 98 | .root_source_file = .{ .path = "src/CombiningClassData.zig" }, | ||
| 99 | .target = target, | ||
| 100 | .optimize = optimize, | ||
| 101 | }); | ||
| 102 | ccc_data.addAnonymousImport("ccc", .{ .root_source_file = ccc_gen_out }); | ||
| 103 | |||
| 82 | const norm = b.addModule("Normalizer", .{ | 104 | const norm = b.addModule("Normalizer", .{ |
| 83 | .root_source_file = .{ .path = "src/Normalizer.zig" }, | 105 | .root_source_file = .{ .path = "src/Normalizer.zig" }, |
| 84 | .target = target, | 106 | .target = target, |
| @@ -86,7 +108,7 @@ pub fn build(b: *std.Build) void { | |||
| 86 | }); | 108 | }); |
| 87 | norm.addImport("code_point", code_point); | 109 | norm.addImport("code_point", code_point); |
| 88 | norm.addImport("ziglyph", ziglyph.module("ziglyph")); | 110 | norm.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 89 | norm.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); | 111 | norm.addImport("CombiningClassData", ccc_data); |
| 90 | 112 | ||
| 91 | // Benchmark rig | 113 | // Benchmark rig |
| 92 | const exe = b.addExecutable(.{ | 114 | const exe = b.addExecutable(.{ |
| @@ -95,11 +117,11 @@ pub fn build(b: *std.Build) void { | |||
| 95 | .target = target, | 117 | .target = target, |
| 96 | .optimize = optimize, | 118 | .optimize = optimize, |
| 97 | }); | 119 | }); |
| 98 | exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); | 120 | // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 99 | exe.root_module.addImport("ascii", ascii); | 121 | // exe.root_module.addImport("ascii", ascii); |
| 100 | exe.root_module.addImport("code_point", code_point); | 122 | // exe.root_module.addImport("code_point", code_point); |
| 101 | exe.root_module.addImport("grapheme", grapheme); | 123 | // exe.root_module.addImport("grapheme", grapheme); |
| 102 | exe.root_module.addImport("display_width", display_width); | 124 | // exe.root_module.addImport("DisplayWidth", display_width); |
| 103 | exe.root_module.addImport("Normalizer", norm); | 125 | exe.root_module.addImport("Normalizer", norm); |
| 104 | b.installArtifact(exe); | 126 | b.installArtifact(exe); |
| 105 | 127 | ||
| @@ -112,17 +134,18 @@ pub fn build(b: *std.Build) void { | |||
| 112 | 134 | ||
| 113 | // Tests | 135 | // Tests |
| 114 | const exe_unit_tests = b.addTest(.{ | 136 | const exe_unit_tests = b.addTest(.{ |
| 115 | .root_source_file = .{ .path = "src/Normalizer.zig" }, | 137 | .root_source_file = .{ .path = "src/DisplayWidth.zig" }, |
| 116 | .target = target, | 138 | .target = target, |
| 117 | .optimize = optimize, | 139 | .optimize = optimize, |
| 118 | }); | 140 | }); |
| 119 | // exe_unit_tests.root_module.addImport("ascii", ascii); | 141 | exe_unit_tests.root_module.addImport("ascii", ascii); |
| 120 | exe_unit_tests.root_module.addImport("code_point", code_point); | 142 | exe_unit_tests.root_module.addImport("code_point", code_point); |
| 121 | // exe_unit_tests.root_module.addImport("grapheme", grapheme); | 143 | // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); |
| 122 | // exe_unit_tests.root_module.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); | 144 | exe_unit_tests.root_module.addImport("grapheme", grapheme); |
| 123 | // exe_unit_tests.root_module.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); | 145 | // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); |
| 124 | exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); | 146 | // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); |
| 125 | exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); | 147 | exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data); |
| 148 | // exe_unit_tests.root_module.addImport("CombiningClassData", ccc_data); | ||
| 126 | 149 | ||
| 127 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); | 150 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); |
| 128 | 151 | ||
diff --git a/codegen/normp.zig b/codegen/ccc.zig index 25af65c..93da6a0 100644 --- a/codegen/normp.zig +++ b/codegen/ccc.zig | |||
| @@ -1,6 +1,5 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | 2 | const builtin = @import("builtin"); | |
| 3 | const options = @import("options"); | ||
| 4 | 3 | ||
| 5 | const block_size = 256; | 4 | const block_size = 256; |
| 6 | const Block = [block_size]u8; | 5 | const Block = [block_size]u8; |
| @@ -108,21 +107,19 @@ pub fn main() !void { | |||
| 108 | _ = args_iter.skip(); | 107 | _ = args_iter.skip(); |
| 109 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | 108 | const output_path = args_iter.next() orelse @panic("No output file arg!"); |
| 110 | 109 | ||
| 110 | const compressor = std.compress.deflate.compressor; | ||
| 111 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | 111 | var out_file = try std.fs.cwd().createFile(output_path, .{}); |
| 112 | defer out_file.close(); | 112 | defer out_file.close(); |
| 113 | var out_buf = std.io.bufferedWriter(out_file.writer()); | 113 | var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); |
| 114 | const writer = out_buf.writer(); | 114 | defer out_comp.deinit(); |
| 115 | 115 | const writer = out_comp.writer(); | |
| 116 | try writer.writeAll("const std = @import(\"std\");\n"); | ||
| 117 | 116 | ||
| 118 | try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); | 117 | const endian = builtin.cpu.arch.endian(); |
| 119 | try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); | 118 | try writer.writeInt(u16, @intCast(stage1.items.len), endian); |
| 120 | for (stage1.items) |v| try writer.print("{},", .{v}); | 119 | for (stage1.items) |i| try writer.writeInt(u16, i, endian); |
| 121 | try writer.writeAll("};\n"); | ||
| 122 | 120 | ||
| 123 | try writer.print("pub const stage_2 = [{}]u8{{", .{stage2.items.len}); | 121 | try writer.writeInt(u16, @intCast(stage2.items.len), endian); |
| 124 | for (stage2.items) |v| try writer.print("{},", .{v}); | 122 | try writer.writeAll(stage2.items); |
| 125 | try writer.writeAll("};\n"); | ||
| 126 | 123 | ||
| 127 | try out_buf.flush(); | 124 | try out_comp.flush(); |
| 128 | } | 125 | } |
diff --git a/codegen/dwp.zig b/codegen/dwp.zig index 9e387c6..76a14d3 100644 --- a/codegen/dwp.zig +++ b/codegen/dwp.zig | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | ||
| 2 | 3 | ||
| 3 | const options = @import("options"); | 4 | const options = @import("options"); |
| 4 | 5 | ||
| @@ -229,21 +230,19 @@ pub fn main() !void { | |||
| 229 | _ = args_iter.skip(); | 230 | _ = args_iter.skip(); |
| 230 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | 231 | const output_path = args_iter.next() orelse @panic("No output file arg!"); |
| 231 | 232 | ||
| 233 | const compressor = std.compress.deflate.compressor; | ||
| 232 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | 234 | var out_file = try std.fs.cwd().createFile(output_path, .{}); |
| 233 | defer out_file.close(); | 235 | defer out_file.close(); |
| 234 | var out_buf = std.io.bufferedWriter(out_file.writer()); | 236 | var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); |
| 235 | const writer = out_buf.writer(); | 237 | defer out_comp.deinit(); |
| 238 | const writer = out_comp.writer(); | ||
| 236 | 239 | ||
| 237 | try writer.writeAll("const std = @import(\"std\");\n"); | 240 | const endian = builtin.cpu.arch.endian(); |
| 241 | try writer.writeInt(u16, @intCast(stage1.items.len), endian); | ||
| 242 | for (stage1.items) |i| try writer.writeInt(u16, i, endian); | ||
| 238 | 243 | ||
| 239 | try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); | 244 | try writer.writeInt(u16, @intCast(stage2.items.len), endian); |
| 240 | try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); | 245 | for (stage2.items) |i| try writer.writeInt(i8, i, endian); |
| 241 | for (stage1.items) |v| try writer.print("{},", .{v}); | ||
| 242 | try writer.writeAll("};\n"); | ||
| 243 | 246 | ||
| 244 | try writer.print("pub const stage_2 = [{}]i3{{", .{stage2.items.len}); | 247 | try out_comp.flush(); |
| 245 | for (stage2.items) |v| try writer.print("{},", .{v}); | ||
| 246 | try writer.writeAll("};\n"); | ||
| 247 | |||
| 248 | try out_buf.flush(); | ||
| 249 | } | 248 | } |
diff --git a/codegen/gbp.zig b/codegen/gbp.zig index 3bd9a4d..39e0da3 100644 --- a/codegen/gbp.zig +++ b/codegen/gbp.zig | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | ||
| 2 | 3 | ||
| 3 | const Indic = enum { | 4 | const Indic = enum { |
| 4 | none, | 5 | none, |
| @@ -226,56 +227,23 @@ pub fn main() !void { | |||
| 226 | _ = args_iter.skip(); | 227 | _ = args_iter.skip(); |
| 227 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | 228 | const output_path = args_iter.next() orelse @panic("No output file arg!"); |
| 228 | 229 | ||
| 230 | const compressor = std.compress.deflate.compressor; | ||
| 229 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | 231 | var out_file = try std.fs.cwd().createFile(output_path, .{}); |
| 230 | defer out_file.close(); | 232 | defer out_file.close(); |
| 231 | var out_buf = std.io.bufferedWriter(out_file.writer()); | 233 | var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); |
| 232 | const writer = out_buf.writer(); | 234 | defer out_comp.deinit(); |
| 233 | 235 | const writer = out_comp.writer(); | |
| 234 | const prop_code = | 236 | |
| 235 | \\const std = @import("std"); | 237 | const endian = builtin.cpu.arch.endian(); |
| 236 | \\ | 238 | try writer.writeInt(u16, @intCast(stage1.items.len), endian); |
| 237 | \\pub const Indic = enum { | 239 | for (stage1.items) |i| try writer.writeInt(u16, i, endian); |
| 238 | \\ none, | 240 | |
| 239 | \\ | 241 | try writer.writeInt(u16, @intCast(stage2.items.len), endian); |
| 240 | \\ Consonant, | 242 | for (stage2.items) |i| try writer.writeInt(u16, i, endian); |
| 241 | \\ Extend, | 243 | |
| 242 | \\ Linker, | 244 | const props_bytes = stage3.keys(); |
| 243 | \\}; | 245 | try writer.writeInt(u16, @intCast(props_bytes.len), endian); |
| 244 | \\ | 246 | try writer.writeAll(props_bytes); |
| 245 | \\pub const Gbp = enum { | 247 | |
| 246 | \\ none, | 248 | try out_comp.flush(); |
| 247 | \\ Control, | ||
| 248 | \\ CR, | ||
| 249 | \\ Extend, | ||
| 250 | \\ L, | ||
| 251 | \\ LF, | ||
| 252 | \\ LV, | ||
| 253 | \\ LVT, | ||
| 254 | \\ Prepend, | ||
| 255 | \\ Regional_Indicator, | ||
| 256 | \\ SpacingMark, | ||
| 257 | \\ T, | ||
| 258 | \\ V, | ||
| 259 | \\ ZWJ, | ||
| 260 | \\}; | ||
| 261 | \\ | ||
| 262 | ; | ||
| 263 | |||
| 264 | try writer.writeAll(prop_code); | ||
| 265 | |||
| 266 | try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); | ||
| 267 | try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); | ||
| 268 | for (stage1.items) |v| try writer.print("{},", .{v}); | ||
| 269 | try writer.writeAll("};\n"); | ||
| 270 | |||
| 271 | try writer.print("const Stage3Int = std.math.IntFittingRange(0, {});\n", .{stage3_len}); | ||
| 272 | try writer.print("pub const stage_2 = [{}]Stage3Int{{", .{stage2.items.len}); | ||
| 273 | for (stage2.items) |v| try writer.print("{},", .{v}); | ||
| 274 | try writer.writeAll("};\n"); | ||
| 275 | |||
| 276 | try writer.print("pub const stage_3 = [{}]u8{{", .{stage3_len}); | ||
| 277 | for (stage3.keys()) |v| try writer.print("{},", .{v}); | ||
| 278 | try writer.writeAll("};\n"); | ||
| 279 | |||
| 280 | try out_buf.flush(); | ||
| 281 | } | 249 | } |
diff --git a/src/CombiningClassData.zig b/src/CombiningClassData.zig new file mode 100644 index 0000000..95c947d --- /dev/null +++ b/src/CombiningClassData.zig | |||
| @@ -0,0 +1,48 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
| 6 | allocator: mem.Allocator, | ||
| 7 | s1: []u16 = undefined, | ||
| 8 | s2: []u8 = undefined, | ||
| 9 | |||
| 10 | const Self = @This(); | ||
| 11 | |||
| 12 | pub fn init(allocator: mem.Allocator) !Self { | ||
| 13 | const decompressor = compress.deflate.decompressor; | ||
| 14 | const in_bytes = @embedFile("ccc"); | ||
| 15 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 16 | var in_decomp = try decompressor(allocator, in_fbs.reader(), null); | ||
| 17 | defer in_decomp.deinit(); | ||
| 18 | var reader = in_decomp.reader(); | ||
| 19 | |||
| 20 | const endian = builtin.cpu.arch.endian(); | ||
| 21 | |||
| 22 | var self = Self{ .allocator = allocator }; | ||
| 23 | |||
| 24 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 25 | self.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 26 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | ||
| 27 | |||
| 28 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 29 | self.s2 = try allocator.alloc(u8, stage_2_len); | ||
| 30 | _ = try reader.readAll(self.s2); | ||
| 31 | |||
| 32 | return self; | ||
| 33 | } | ||
| 34 | |||
| 35 | pub fn deinit(self: *Self) void { | ||
| 36 | self.allocator.free(self.s1); | ||
| 37 | self.allocator.free(self.s2); | ||
| 38 | } | ||
| 39 | |||
| 40 | /// Returns the canonical combining class for a code point. | ||
| 41 | pub inline fn ccc(self: Self, cp: u21) u8 { | ||
| 42 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; | ||
| 43 | } | ||
| 44 | |||
| 45 | /// True if `cp` is a starter code point, not a combining character. | ||
| 46 | pub inline fn isStarter(self: Self, cp: u21) bool { | ||
| 47 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0; | ||
| 48 | } | ||
diff --git a/src/display_width.zig b/src/DisplayWidth.zig index a916cac..85d04a0 100644 --- a/src/display_width.zig +++ b/src/DisplayWidth.zig | |||
| @@ -1,68 +1,38 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const simd = std.simd; | 2 | const builtin = @import("builtin"); |
| 3 | const ArrayList = std.ArrayList; | ||
| 3 | const mem = std.mem; | 4 | const mem = std.mem; |
| 5 | const simd = std.simd; | ||
| 4 | const testing = std.testing; | 6 | const testing = std.testing; |
| 5 | 7 | ||
| 6 | const ascii = @import("ascii"); | 8 | const ascii = @import("ascii"); |
| 7 | const CodePointIterator = @import("code_point").Iterator; | 9 | const CodePointIterator = @import("code_point").Iterator; |
| 8 | const dwp = @import("dwp"); | ||
| 9 | const GraphemeIterator = @import("grapheme").Iterator; | 10 | const GraphemeIterator = @import("grapheme").Iterator; |
| 11 | pub const Data = @import("DisplayWidthData"); | ||
| 10 | 12 | ||
| 11 | /// codePointWidth returns the number of cells `cp` requires when rendered | 13 | data: *Data, |
| 12 | /// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to | ||
| 13 | /// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 | ||
| 14 | /// control codes return 0. If `cjk` is true, ambiguous code points return 2, | ||
| 15 | /// otherwise they return 1. | ||
| 16 | pub fn codePointWidth(cp: u21) i3 { | ||
| 17 | return dwp.stage_2[dwp.stage_1[cp >> 8] + (cp & 0xff)]; | ||
| 18 | } | ||
| 19 | 14 | ||
| 20 | test "codePointWidth" { | 15 | const Self = @This(); |
| 21 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null | ||
| 22 | try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b | ||
| 23 | try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL | ||
| 24 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf | ||
| 25 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL | ||
| 26 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF | ||
| 27 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT | ||
| 28 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF | ||
| 29 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR | ||
| 30 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ | ||
| 31 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI | ||
| 32 | |||
| 33 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf | ||
| 34 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic | ||
| 35 | |||
| 36 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen | ||
| 37 | try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash | ||
| 38 | try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash | ||
| 39 | |||
| 40 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth | ||
| 41 | |||
| 42 | try testing.expectEqual(@as(i3, 1), codePointWidth('é')); | ||
| 43 | try testing.expectEqual(@as(i3, 2), codePointWidth('😊')); | ||
| 44 | try testing.expectEqual(@as(i3, 2), codePointWidth('统')); | ||
| 45 | } | ||
| 46 | 16 | ||
| 47 | /// strWidth returns the total display width of `str` as the number of cells | 17 | /// strWidth returns the total display width of `str` as the number of cells |
| 48 | /// required in a fixed-pitch font (i.e. a terminal screen). | 18 | /// required in a fixed-pitch font (i.e. a terminal screen). |
| 49 | pub fn strWidth(str: []const u8) usize { | 19 | pub fn strWidth(self: Self, str: []const u8) usize { |
| 50 | var total: isize = 0; | 20 | var total: isize = 0; |
| 51 | 21 | ||
| 52 | // ASCII fast path | 22 | // ASCII fast path |
| 53 | if (ascii.isAsciiOnly(str)) { | 23 | if (ascii.isAsciiOnly(str)) { |
| 54 | for (str) |b| total += codePointWidth(b); | 24 | for (str) |b| total += self.data.codePointWidth(b); |
| 55 | return @intCast(@max(0, total)); | 25 | return @intCast(@max(0, total)); |
| 56 | } | 26 | } |
| 57 | 27 | ||
| 58 | var giter = GraphemeIterator.init(str); | 28 | var giter = GraphemeIterator.init(str, &self.data.g_data); |
| 59 | 29 | ||
| 60 | while (giter.next()) |gc| { | 30 | while (giter.next()) |gc| { |
| 61 | var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; | 31 | var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; |
| 62 | var gc_total: isize = 0; | 32 | var gc_total: isize = 0; |
| 63 | 33 | ||
| 64 | while (cp_iter.next()) |cp| { | 34 | while (cp_iter.next()) |cp| { |
| 65 | var w = codePointWidth(cp.code); | 35 | var w = self.data.codePointWidth(cp.code); |
| 66 | 36 | ||
| 67 | if (w != 0) { | 37 | if (w != 0) { |
| 68 | // Handle text emoji sequence. | 38 | // Handle text emoji sequence. |
| @@ -86,31 +56,35 @@ pub fn strWidth(str: []const u8) usize { | |||
| 86 | } | 56 | } |
| 87 | 57 | ||
| 88 | test "strWidth" { | 58 | test "strWidth" { |
| 89 | try testing.expectEqual(@as(usize, 5), strWidth("Hello\r\n")); | 59 | var data = try Data.init(testing.allocator); |
| 90 | try testing.expectEqual(@as(usize, 1), strWidth("\u{0065}\u{0301}")); | 60 | defer data.deinit(); |
| 91 | try testing.expectEqual(@as(usize, 2), strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); | 61 | const self = Self{ .data = &data }; |
| 92 | try testing.expectEqual(@as(usize, 8), strWidth("Hello 😊")); | 62 | |
| 93 | try testing.expectEqual(@as(usize, 8), strWidth("Héllo 😊")); | 63 | try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n")); |
| 94 | try testing.expectEqual(@as(usize, 8), strWidth("Héllo :)")); | 64 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}")); |
| 95 | try testing.expectEqual(@as(usize, 8), strWidth("Héllo 🇪🇸")); | 65 | try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); |
| 96 | try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}")); // Lone emoji | 66 | try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊")); |
| 97 | try testing.expectEqual(@as(usize, 1), strWidth("\u{26A1}\u{FE0E}")); // Text sequence | 67 | try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊")); |
| 98 | try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence | 68 | try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)")); |
| 99 | try testing.expectEqual(@as(usize, 0), strWidth("A\x08")); // Backspace | 69 | try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸")); |
| 100 | try testing.expectEqual(@as(usize, 0), strWidth("\x7FA")); // DEL | 70 | try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji |
| 101 | try testing.expectEqual(@as(usize, 0), strWidth("\x7FA\x08\x08")); // never less than o | 71 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence |
| 72 | try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence | ||
| 73 | try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace | ||
| 74 | try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL | ||
| 75 | try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than o | ||
| 102 | 76 | ||
| 103 | // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py | 77 | // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py |
| 104 | const empty = ""; | 78 | const empty = ""; |
| 105 | try testing.expectEqual(@as(usize, 0), strWidth(empty)); | 79 | try testing.expectEqual(@as(usize, 0), self.strWidth(empty)); |
| 106 | const with_null = "hello\x00world"; | 80 | const with_null = "hello\x00world"; |
| 107 | try testing.expectEqual(@as(usize, 10), strWidth(with_null)); | 81 | try testing.expectEqual(@as(usize, 10), self.strWidth(with_null)); |
| 108 | const hello_jp = "コンニチハ, セカイ!"; | 82 | const hello_jp = "コンニチハ, セカイ!"; |
| 109 | try testing.expectEqual(@as(usize, 19), strWidth(hello_jp)); | 83 | try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp)); |
| 110 | const control = "\x1b[0m"; | 84 | const control = "\x1b[0m"; |
| 111 | try testing.expectEqual(@as(usize, 3), strWidth(control)); | 85 | try testing.expectEqual(@as(usize, 3), self.strWidth(control)); |
| 112 | const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; | 86 | const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; |
| 113 | try testing.expectEqual(@as(usize, 3), strWidth(balinese)); | 87 | try testing.expectEqual(@as(usize, 3), self.strWidth(balinese)); |
| 114 | 88 | ||
| 115 | // These commented out tests require a new specification for complex scripts. | 89 | // These commented out tests require a new specification for complex scripts. |
| 116 | // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf | 90 | // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf |
| @@ -124,17 +98,17 @@ test "strWidth" { | |||
| 124 | // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); | 98 | // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); |
| 125 | // The following passes but as a mere coincidence. | 99 | // The following passes but as a mere coincidence. |
| 126 | const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; | 100 | const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; |
| 127 | try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); | 101 | try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2)); |
| 128 | 102 | ||
| 129 | // From Rust https://github.com/jameslanska/unicode-display-width | 103 | // From Rust https://github.com/jameslanska/unicode-display-width |
| 130 | try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻🚀⏰💃🏼🔦👍🏻")); | 104 | try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻🚀⏰💃🏼🔦👍🏻")); |
| 131 | try testing.expectEqual(@as(usize, 2), strWidth("🦀")); | 105 | try testing.expectEqual(@as(usize, 2), self.strWidth("🦀")); |
| 132 | try testing.expectEqual(@as(usize, 2), strWidth("👨👩👧👧")); | 106 | try testing.expectEqual(@as(usize, 2), self.strWidth("👨👩👧👧")); |
| 133 | try testing.expectEqual(@as(usize, 2), strWidth("👩🔬")); | 107 | try testing.expectEqual(@as(usize, 2), self.strWidth("👩🔬")); |
| 134 | try testing.expectEqual(@as(usize, 9), strWidth("sane text")); | 108 | try testing.expectEqual(@as(usize, 9), self.strWidth("sane text")); |
| 135 | try testing.expectEqual(@as(usize, 9), strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); | 109 | try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); |
| 136 | try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나")); | 110 | try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나")); |
| 137 | try testing.expectEqual(@as(usize, 1), strWidth("\u{378}")); | 111 | try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}")); |
| 138 | } | 112 | } |
| 139 | 113 | ||
| 140 | /// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. | 114 | /// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. |
| @@ -142,16 +116,17 @@ test "strWidth" { | |||
| 142 | /// receive one additional pad. This makes sure the returned string fills the requested width. | 116 | /// receive one additional pad. This makes sure the returned string fills the requested width. |
| 143 | /// Caller must free returned bytes with `allocator`. | 117 | /// Caller must free returned bytes with `allocator`. |
| 144 | pub fn center( | 118 | pub fn center( |
| 119 | self: Self, | ||
| 145 | allocator: mem.Allocator, | 120 | allocator: mem.Allocator, |
| 146 | str: []const u8, | 121 | str: []const u8, |
| 147 | total_width: usize, | 122 | total_width: usize, |
| 148 | pad: []const u8, | 123 | pad: []const u8, |
| 149 | ) ![]u8 { | 124 | ) ![]u8 { |
| 150 | const str_width = strWidth(str); | 125 | const str_width = self.strWidth(str); |
| 151 | if (str_width > total_width) return error.StrTooLong; | 126 | if (str_width > total_width) return error.StrTooLong; |
| 152 | if (str_width == total_width) return try allocator.dupe(u8, str); | 127 | if (str_width == total_width) return try allocator.dupe(u8, str); |
| 153 | 128 | ||
| 154 | const pad_width = strWidth(pad); | 129 | const pad_width = self.strWidth(pad); |
| 155 | if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; | 130 | if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; |
| 156 | 131 | ||
| 157 | const margin_width = @divFloor((total_width - str_width), 2); | 132 | const margin_width = @divFloor((total_width - str_width), 2); |
| @@ -181,59 +156,63 @@ pub fn center( | |||
| 181 | } | 156 | } |
| 182 | 157 | ||
| 183 | test "center" { | 158 | test "center" { |
| 184 | var allocator = std.testing.allocator; | 159 | const allocator = testing.allocator; |
| 160 | var data = try Data.init(allocator); | ||
| 161 | defer data.deinit(); | ||
| 162 | const self = Self{ .data = &data }; | ||
| 185 | 163 | ||
| 186 | // Input and width both have odd length | 164 | // Input and width both have odd length |
| 187 | var centered = try center(allocator, "abc", 9, "*"); | 165 | var centered = try self.center(allocator, "abc", 9, "*"); |
| 188 | try testing.expectEqualSlices(u8, "***abc***", centered); | 166 | try testing.expectEqualSlices(u8, "***abc***", centered); |
| 189 | 167 | ||
| 190 | // Input and width both have even length | 168 | // Input and width both have even length |
| 191 | allocator.free(centered); | 169 | testing.allocator.free(centered); |
| 192 | centered = try center(allocator, "w😊w", 10, "-"); | 170 | centered = try self.center(allocator, "w😊w", 10, "-"); |
| 193 | try testing.expectEqualSlices(u8, "---w😊w---", centered); | 171 | try testing.expectEqualSlices(u8, "---w😊w---", centered); |
| 194 | 172 | ||
| 195 | // Input has even length, width has odd length | 173 | // Input has even length, width has odd length |
| 196 | allocator.free(centered); | 174 | testing.allocator.free(centered); |
| 197 | centered = try center(allocator, "1234", 9, "-"); | 175 | centered = try self.center(allocator, "1234", 9, "-"); |
| 198 | try testing.expectEqualSlices(u8, "--1234---", centered); | 176 | try testing.expectEqualSlices(u8, "--1234---", centered); |
| 199 | 177 | ||
| 200 | // Input has odd length, width has even length | 178 | // Input has odd length, width has even length |
| 201 | allocator.free(centered); | 179 | testing.allocator.free(centered); |
| 202 | centered = try center(allocator, "123", 8, "-"); | 180 | centered = try self.center(allocator, "123", 8, "-"); |
| 203 | try testing.expectEqualSlices(u8, "--123---", centered); | 181 | try testing.expectEqualSlices(u8, "--123---", centered); |
| 204 | 182 | ||
| 205 | // Input is the same length as the width | 183 | // Input is the same length as the width |
| 206 | allocator.free(centered); | 184 | testing.allocator.free(centered); |
| 207 | centered = try center(allocator, "123", 3, "-"); | 185 | centered = try self.center(allocator, "123", 3, "-"); |
| 208 | try testing.expectEqualSlices(u8, "123", centered); | 186 | try testing.expectEqualSlices(u8, "123", centered); |
| 209 | 187 | ||
| 210 | // Input is empty | 188 | // Input is empty |
| 211 | allocator.free(centered); | 189 | testing.allocator.free(centered); |
| 212 | centered = try center(allocator, "", 3, "-"); | 190 | centered = try self.center(allocator, "", 3, "-"); |
| 213 | try testing.expectEqualSlices(u8, "---", centered); | 191 | try testing.expectEqualSlices(u8, "---", centered); |
| 214 | 192 | ||
| 215 | // Input is empty and width is zero | 193 | // Input is empty and width is zero |
| 216 | allocator.free(centered); | 194 | testing.allocator.free(centered); |
| 217 | centered = try center(allocator, "", 0, "-"); | 195 | centered = try self.center(allocator, "", 0, "-"); |
| 218 | try testing.expectEqualSlices(u8, "", centered); | 196 | try testing.expectEqualSlices(u8, "", centered); |
| 219 | 197 | ||
| 220 | // Input is longer than the width, which is an error | 198 | // Input is longer than the width, which is an error |
| 221 | allocator.free(centered); | 199 | testing.allocator.free(centered); |
| 222 | try testing.expectError(error.StrTooLong, center(allocator, "123", 2, "-")); | 200 | try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-")); |
| 223 | } | 201 | } |
| 224 | 202 | ||
| 225 | /// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding | 203 | /// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding |
| 226 | /// on the left side. Caller must free returned bytes with `allocator`. | 204 | /// on the left side. Caller must free returned bytes with `allocator`. |
| 227 | pub fn padLeft( | 205 | pub fn padLeft( |
| 228 | allocator: std.mem.Allocator, | 206 | self: Self, |
| 207 | allocator: mem.Allocator, | ||
| 229 | str: []const u8, | 208 | str: []const u8, |
| 230 | total_width: usize, | 209 | total_width: usize, |
| 231 | pad: []const u8, | 210 | pad: []const u8, |
| 232 | ) ![]u8 { | 211 | ) ![]u8 { |
| 233 | const str_width = strWidth(str); | 212 | const str_width = self.strWidth(str); |
| 234 | if (str_width > total_width) return error.StrTooLong; | 213 | if (str_width > total_width) return error.StrTooLong; |
| 235 | 214 | ||
| 236 | const pad_width = strWidth(pad); | 215 | const pad_width = self.strWidth(pad); |
| 237 | if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; | 216 | if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; |
| 238 | 217 | ||
| 239 | const margin_width = total_width - str_width; | 218 | const margin_width = total_width - str_width; |
| @@ -256,29 +235,33 @@ pub fn padLeft( | |||
| 256 | } | 235 | } |
| 257 | 236 | ||
| 258 | test "padLeft" { | 237 | test "padLeft" { |
| 259 | var allocator = std.testing.allocator; | 238 | const allocator = testing.allocator; |
| 239 | var data = try Data.init(allocator); | ||
| 240 | defer data.deinit(); | ||
| 241 | const self = Self{ .data = &data }; | ||
| 260 | 242 | ||
| 261 | var right_aligned = try padLeft(allocator, "abc", 9, "*"); | 243 | var right_aligned = try self.padLeft(allocator, "abc", 9, "*"); |
| 262 | defer allocator.free(right_aligned); | 244 | defer testing.allocator.free(right_aligned); |
| 263 | try testing.expectEqualSlices(u8, "******abc", right_aligned); | 245 | try testing.expectEqualSlices(u8, "******abc", right_aligned); |
| 264 | 246 | ||
| 265 | allocator.free(right_aligned); | 247 | testing.allocator.free(right_aligned); |
| 266 | right_aligned = try padLeft(allocator, "w😊w", 10, "-"); | 248 | right_aligned = try self.padLeft(allocator, "w😊w", 10, "-"); |
| 267 | try testing.expectEqualSlices(u8, "------w😊w", right_aligned); | 249 | try testing.expectEqualSlices(u8, "------w😊w", right_aligned); |
| 268 | } | 250 | } |
| 269 | 251 | ||
| 270 | /// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding | 252 | /// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding |
| 271 | /// on the right side. Caller must free returned bytes with `allocator`. | 253 | /// on the right side. Caller must free returned bytes with `allocator`. |
| 272 | pub fn padRight( | 254 | pub fn padRight( |
| 273 | allocator: std.mem.Allocator, | 255 | self: Self, |
| 256 | allocator: mem.Allocator, | ||
| 274 | str: []const u8, | 257 | str: []const u8, |
| 275 | total_width: usize, | 258 | total_width: usize, |
| 276 | pad: []const u8, | 259 | pad: []const u8, |
| 277 | ) ![]u8 { | 260 | ) ![]u8 { |
| 278 | const str_width = strWidth(str); | 261 | const str_width = self.strWidth(str); |
| 279 | if (str_width > total_width) return error.StrTooLong; | 262 | if (str_width > total_width) return error.StrTooLong; |
| 280 | 263 | ||
| 281 | const pad_width = strWidth(pad); | 264 | const pad_width = self.strWidth(pad); |
| 282 | if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; | 265 | if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; |
| 283 | 266 | ||
| 284 | const margin_width = total_width - str_width; | 267 | const margin_width = total_width - str_width; |
| @@ -302,14 +285,17 @@ pub fn padRight( | |||
| 302 | } | 285 | } |
| 303 | 286 | ||
| 304 | test "padRight" { | 287 | test "padRight" { |
| 305 | var allocator = std.testing.allocator; | 288 | const allocator = testing.allocator; |
| 289 | var data = try Data.init(allocator); | ||
| 290 | defer data.deinit(); | ||
| 291 | const self = Self{ .data = &data }; | ||
| 306 | 292 | ||
| 307 | var left_aligned = try padRight(allocator, "abc", 9, "*"); | 293 | var left_aligned = try self.padRight(allocator, "abc", 9, "*"); |
| 308 | defer allocator.free(left_aligned); | 294 | defer testing.allocator.free(left_aligned); |
| 309 | try testing.expectEqualSlices(u8, "abc******", left_aligned); | 295 | try testing.expectEqualSlices(u8, "abc******", left_aligned); |
| 310 | 296 | ||
| 311 | allocator.free(left_aligned); | 297 | testing.allocator.free(left_aligned); |
| 312 | left_aligned = try padRight(allocator, "w😊w", 10, "-"); | 298 | left_aligned = try self.padRight(allocator, "w😊w", 10, "-"); |
| 313 | try testing.expectEqualSlices(u8, "w😊w------", left_aligned); | 299 | try testing.expectEqualSlices(u8, "w😊w------", left_aligned); |
| 314 | } | 300 | } |
| 315 | 301 | ||
| @@ -317,12 +303,13 @@ test "padRight" { | |||
| 317 | /// `threshold` defines how far the last column of the last word can be | 303 | /// `threshold` defines how far the last column of the last word can be |
| 318 | /// from the edge. Caller must free returned bytes with `allocator`. | 304 | /// from the edge. Caller must free returned bytes with `allocator`. |
| 319 | pub fn wrap( | 305 | pub fn wrap( |
| 320 | allocator: std.mem.Allocator, | 306 | self: Self, |
| 307 | allocator: mem.Allocator, | ||
| 321 | str: []const u8, | 308 | str: []const u8, |
| 322 | columns: usize, | 309 | columns: usize, |
| 323 | threshold: usize, | 310 | threshold: usize, |
| 324 | ) ![]u8 { | 311 | ) ![]u8 { |
| 325 | var result = std.ArrayList(u8).init(allocator); | 312 | var result = ArrayList(u8).init(allocator); |
| 326 | defer result.deinit(); | 313 | defer result.deinit(); |
| 327 | 314 | ||
| 328 | var line_iter = mem.tokenizeAny(u8, str, "\r\n"); | 315 | var line_iter = mem.tokenizeAny(u8, str, "\r\n"); |
| @@ -334,7 +321,7 @@ pub fn wrap( | |||
| 334 | while (word_iter.next()) |word| { | 321 | while (word_iter.next()) |word| { |
| 335 | try result.appendSlice(word); | 322 | try result.appendSlice(word); |
| 336 | try result.append(' '); | 323 | try result.append(' '); |
| 337 | line_width += strWidth(word) + 1; | 324 | line_width += self.strWidth(word) + 1; |
| 338 | 325 | ||
| 339 | if (line_width > columns or columns - line_width <= threshold) { | 326 | if (line_width > columns or columns - line_width <= threshold) { |
| 340 | try result.append('\n'); | 327 | try result.append('\n'); |
| @@ -351,10 +338,14 @@ pub fn wrap( | |||
| 351 | } | 338 | } |
| 352 | 339 | ||
| 353 | test "wrap" { | 340 | test "wrap" { |
| 354 | var allocator = std.testing.allocator; | 341 | const allocator = testing.allocator; |
| 342 | var data = try Data.init(allocator); | ||
| 343 | defer data.deinit(); | ||
| 344 | const self = Self{ .data = &data }; | ||
| 345 | |||
| 355 | const input = "The quick brown fox\r\njumped over the lazy dog!"; | 346 | const input = "The quick brown fox\r\njumped over the lazy dog!"; |
| 356 | const got = try wrap(allocator, input, 10, 3); | 347 | const got = try self.wrap(allocator, input, 10, 3); |
| 357 | defer allocator.free(got); | 348 | defer testing.allocator.free(got); |
| 358 | const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; | 349 | const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; |
| 359 | try testing.expectEqualStrings(want, got); | 350 | try testing.expectEqualStrings(want, got); |
| 360 | } | 351 | } |
diff --git a/src/DisplayWidthData.zig b/src/DisplayWidthData.zig new file mode 100644 index 0000000..32f8658 --- /dev/null +++ b/src/DisplayWidthData.zig | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | const testing = std.testing; | ||
| 6 | |||
| 7 | const GraphemeData = @import("GraphemeData"); | ||
| 8 | |||
| 9 | allocator: mem.Allocator, | ||
| 10 | g_data: GraphemeData, | ||
| 11 | s1: []u16 = undefined, | ||
| 12 | s2: []i3 = undefined, | ||
| 13 | |||
| 14 | const Self = @This(); | ||
| 15 | |||
| 16 | pub fn init(allocator: mem.Allocator) !Self { | ||
| 17 | const decompressor = compress.deflate.decompressor; | ||
| 18 | const in_bytes = @embedFile("dwp"); | ||
| 19 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 20 | var in_decomp = try decompressor(allocator, in_fbs.reader(), null); | ||
| 21 | defer in_decomp.deinit(); | ||
| 22 | var reader = in_decomp.reader(); | ||
| 23 | |||
| 24 | const endian = builtin.cpu.arch.endian(); | ||
| 25 | |||
| 26 | var self = Self{ | ||
| 27 | .allocator = allocator, | ||
| 28 | .g_data = try GraphemeData.init(allocator), | ||
| 29 | }; | ||
| 30 | |||
| 31 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 32 | self.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 33 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | ||
| 34 | |||
| 35 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 36 | self.s2 = try allocator.alloc(i3, stage_2_len); | ||
| 37 | for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian)); | ||
| 38 | |||
| 39 | return self; | ||
| 40 | } | ||
| 41 | |||
| 42 | pub fn deinit(self: *Self) void { | ||
| 43 | self.allocator.free(self.s1); | ||
| 44 | self.allocator.free(self.s2); | ||
| 45 | self.g_data.deinit(); | ||
| 46 | } | ||
| 47 | |||
| 48 | /// codePointWidth returns the number of cells `cp` requires when rendered | ||
| 49 | /// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to | ||
| 50 | /// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1 | ||
| 51 | /// control codes return 0. If `cjk` is true, ambiguous code points return 2, | ||
| 52 | /// otherwise they return 1. | ||
| 53 | pub inline fn codePointWidth(self: Self, cp: u21) i3 { | ||
| 54 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; | ||
| 55 | } | ||
| 56 | |||
| 57 | test "codePointWidth" { | ||
| 58 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null | ||
| 59 | try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b | ||
| 60 | try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL | ||
| 61 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf | ||
| 62 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL | ||
| 63 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF | ||
| 64 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT | ||
| 65 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF | ||
| 66 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR | ||
| 67 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ | ||
| 68 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI | ||
| 69 | |||
| 70 | try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf | ||
| 71 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic | ||
| 72 | |||
| 73 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen | ||
| 74 | try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash | ||
| 75 | try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash | ||
| 76 | |||
| 77 | try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth | ||
| 78 | |||
| 79 | try testing.expectEqual(@as(i3, 1), codePointWidth('é')); | ||
| 80 | try testing.expectEqual(@as(i3, 2), codePointWidth('😊')); | ||
| 81 | try testing.expectEqual(@as(i3, 2), codePointWidth('统')); | ||
| 82 | } | ||
diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig new file mode 100644 index 0000000..e418dea --- /dev/null +++ b/src/GraphemeData.zig | |||
| @@ -0,0 +1,86 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | |||
| 6 | /// Indic syllable type. | ||
| 7 | pub const Indic = enum { | ||
| 8 | none, | ||
| 9 | |||
| 10 | Consonant, | ||
| 11 | Extend, | ||
| 12 | Linker, | ||
| 13 | }; | ||
| 14 | |||
| 15 | /// Grapheme break property. | ||
| 16 | pub const Gbp = enum { | ||
| 17 | none, | ||
| 18 | Control, | ||
| 19 | CR, | ||
| 20 | Extend, | ||
| 21 | L, | ||
| 22 | LF, | ||
| 23 | LV, | ||
| 24 | LVT, | ||
| 25 | Prepend, | ||
| 26 | Regional_Indicator, | ||
| 27 | SpacingMark, | ||
| 28 | T, | ||
| 29 | V, | ||
| 30 | ZWJ, | ||
| 31 | }; | ||
| 32 | |||
| 33 | allocator: mem.Allocator, | ||
| 34 | s1: []u16 = undefined, | ||
| 35 | s2: []u16 = undefined, | ||
| 36 | s3: []u8 = undefined, | ||
| 37 | |||
| 38 | const Self = @This(); | ||
| 39 | |||
| 40 | pub fn init(allocator: mem.Allocator) !Self { | ||
| 41 | const decompressor = compress.deflate.decompressor; | ||
| 42 | const in_bytes = @embedFile("gbp"); | ||
| 43 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 44 | var in_decomp = try decompressor(allocator, in_fbs.reader(), null); | ||
| 45 | defer in_decomp.deinit(); | ||
| 46 | var reader = in_decomp.reader(); | ||
| 47 | |||
| 48 | const endian = builtin.cpu.arch.endian(); | ||
| 49 | |||
| 50 | var self = Self{ .allocator = allocator }; | ||
| 51 | |||
| 52 | const s1_len: u16 = try reader.readInt(u16, endian); | ||
| 53 | self.s1 = try allocator.alloc(u16, s1_len); | ||
| 54 | for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | ||
| 55 | |||
| 56 | const s2_len: u16 = try reader.readInt(u16, endian); | ||
| 57 | self.s2 = try allocator.alloc(u16, s2_len); | ||
| 58 | for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian); | ||
| 59 | |||
| 60 | const s3_len: u16 = try reader.readInt(u16, endian); | ||
| 61 | self.s3 = try allocator.alloc(u8, s3_len); | ||
| 62 | _ = try reader.readAll(self.s3); | ||
| 63 | |||
| 64 | return self; | ||
| 65 | } | ||
| 66 | |||
| 67 | pub fn deinit(self: *Self) void { | ||
| 68 | self.allocator.free(self.s1); | ||
| 69 | self.allocator.free(self.s2); | ||
| 70 | self.allocator.free(self.s3); | ||
| 71 | } | ||
| 72 | |||
| 73 | /// Lookup the grapheme break property for a code point. | ||
| 74 | pub inline fn gbp(self: Self, cp: u21) Gbp { | ||
| 75 | return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4); | ||
| 76 | } | ||
| 77 | |||
| 78 | /// Lookup the indic syllable type for a code point. | ||
| 79 | pub inline fn indic(self: Self, cp: u21) Indic { | ||
| 80 | return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); | ||
| 81 | } | ||
| 82 | |||
| 83 | /// Lookup whether a code point is an emoji. | ||
| 84 | pub inline fn isEmoji(self: Self, cp: u21) bool { | ||
| 85 | return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | ||
| 86 | } | ||
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 1b4a2d5..6a19f47 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -8,16 +8,18 @@ const CodePointIterator = @import("code_point").Iterator; | |||
| 8 | const case_fold_map = @import("ziglyph").case_folding; | 8 | const case_fold_map = @import("ziglyph").case_folding; |
| 9 | const hangul_map = @import("ziglyph").hangul; | 9 | const hangul_map = @import("ziglyph").hangul; |
| 10 | const norm_props = @import("ziglyph").normalization_props; | 10 | const norm_props = @import("ziglyph").normalization_props; |
| 11 | const normp = @import("normp"); | 11 | pub const Data = @import("CombiningClassData"); |
| 12 | |||
| 13 | const Self = @This(); | ||
| 14 | 12 | ||
| 13 | ccc_data: *Data, | ||
| 15 | nfc_map: std.AutoHashMap([2]u21, u21), | 14 | nfc_map: std.AutoHashMap([2]u21, u21), |
| 16 | nfd_map: std.AutoHashMap(u21, [2]u21), | 15 | nfd_map: std.AutoHashMap(u21, [2]u21), |
| 17 | nfkd_map: std.AutoHashMap(u21, [18]u21), | 16 | nfkd_map: std.AutoHashMap(u21, [18]u21), |
| 18 | 17 | ||
| 19 | pub fn init(allocator: std.mem.Allocator) !Self { | 18 | const Self = @This(); |
| 19 | |||
| 20 | pub fn init(allocator: std.mem.Allocator, data: *Data) !Self { | ||
| 20 | var self = Self{ | 21 | var self = Self{ |
| 22 | .ccc_data = data, | ||
| 21 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), | 23 | .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), |
| 22 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), | 24 | .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), |
| 23 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), | 25 | .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), |
| @@ -95,7 +97,9 @@ pub fn deinit(self: *Self) void { | |||
| 95 | } | 97 | } |
| 96 | 98 | ||
| 97 | test "init / deinit" { | 99 | test "init / deinit" { |
| 98 | var n = try init(std.testing.allocator); | 100 | var data = try Data.init(std.testing.allocator); |
| 101 | defer data.deinit(); | ||
| 102 | var n = try init(std.testing.allocator, &data); | ||
| 99 | defer n.deinit(); | 103 | defer n.deinit(); |
| 100 | } | 104 | } |
| 101 | 105 | ||
| @@ -241,7 +245,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 241 | 245 | ||
| 242 | test "decompose" { | 246 | test "decompose" { |
| 243 | const allocator = std.testing.allocator; | 247 | const allocator = std.testing.allocator; |
| 244 | var n = try init(allocator); | 248 | var data = try Data.init(allocator); |
| 249 | defer data.deinit(); | ||
| 250 | var n = try init(allocator, &data); | ||
| 245 | defer n.deinit(); | 251 | defer n.deinit(); |
| 246 | 252 | ||
| 247 | var dc = n.decompose('é', .nfd); | 253 | var dc = n.decompose('é', .nfd); |
| @@ -307,19 +313,17 @@ pub const Result = struct { | |||
| 307 | }; | 313 | }; |
| 308 | 314 | ||
| 309 | // Compares code points by Canonical Combining Class order. | 315 | // Compares code points by Canonical Combining Class order. |
| 310 | fn cccLess(_: void, lhs: u21, rhs: u21) bool { | 316 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { |
| 311 | const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; | 317 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); |
| 312 | const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)]; | ||
| 313 | return lcc < rcc; | ||
| 314 | } | 318 | } |
| 315 | 319 | ||
| 316 | // Applies the Canonical Sorting Algorithm. | 320 | // Applies the Canonical Sorting Algorithm. |
| 317 | fn canonicalSort(cps: []u21) void { | 321 | fn canonicalSort(self: Self, cps: []u21) void { |
| 318 | var i: usize = 0; | 322 | var i: usize = 0; |
| 319 | while (i < cps.len) : (i += 1) { | 323 | while (i < cps.len) : (i += 1) { |
| 320 | const start: usize = i; | 324 | const start: usize = i; |
| 321 | while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} | 325 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 322 | std.mem.sort(u21, cps[start..i], {}, cccLess); | 326 | std.mem.sort(u21, cps[start..i], self, cccLess); |
| 323 | } | 327 | } |
| 324 | } | 328 | } |
| 325 | 329 | ||
| @@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 349 | try dcp_list.appendSlice(slice); | 353 | try dcp_list.appendSlice(slice); |
| 350 | } | 354 | } |
| 351 | 355 | ||
| 352 | canonicalSort(dcp_list.items); | 356 | self.canonicalSort(dcp_list.items); |
| 353 | 357 | ||
| 354 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); | 358 | var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); |
| 355 | defer dstr_list.deinit(); | 359 | defer dstr_list.deinit(); |
| @@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 365 | 369 | ||
| 366 | test "nfd ASCII / no-alloc" { | 370 | test "nfd ASCII / no-alloc" { |
| 367 | const allocator = std.testing.allocator; | 371 | const allocator = std.testing.allocator; |
| 368 | var n = try init(allocator); | 372 | var data = try Data.init(allocator); |
| 373 | defer data.deinit(); | ||
| 374 | var n = try init(allocator, &data); | ||
| 369 | defer n.deinit(); | 375 | defer n.deinit(); |
| 370 | 376 | ||
| 371 | var result = try n.nfd(allocator, "Hello World!"); | 377 | var result = try n.nfd(allocator, "Hello World!"); |
| @@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" { | |||
| 376 | 382 | ||
| 377 | test "nfd !ASCII / alloc" { | 383 | test "nfd !ASCII / alloc" { |
| 378 | const allocator = std.testing.allocator; | 384 | const allocator = std.testing.allocator; |
| 379 | var n = try init(allocator); | 385 | var data = try Data.init(allocator); |
| 386 | defer data.deinit(); | ||
| 387 | var n = try init(allocator, &data); | ||
| 380 | defer n.deinit(); | 388 | defer n.deinit(); |
| 381 | 389 | ||
| 382 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 390 | var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| @@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" { | |||
| 387 | 395 | ||
| 388 | test "nfkd ASCII / no-alloc" { | 396 | test "nfkd ASCII / no-alloc" { |
| 389 | const allocator = std.testing.allocator; | 397 | const allocator = std.testing.allocator; |
| 390 | var n = try init(allocator); | 398 | var data = try Data.init(allocator); |
| 399 | defer data.deinit(); | ||
| 400 | var n = try init(allocator, &data); | ||
| 391 | defer n.deinit(); | 401 | defer n.deinit(); |
| 392 | 402 | ||
| 393 | var result = try n.nfkd(allocator, "Hello World!"); | 403 | var result = try n.nfkd(allocator, "Hello World!"); |
| @@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" { | |||
| 398 | 408 | ||
| 399 | test "nfkd !ASCII / alloc" { | 409 | test "nfkd !ASCII / alloc" { |
| 400 | const allocator = std.testing.allocator; | 410 | const allocator = std.testing.allocator; |
| 401 | var n = try init(allocator); | 411 | var data = try Data.init(allocator); |
| 412 | defer data.deinit(); | ||
| 413 | var n = try init(allocator, &data); | ||
| 402 | defer n.deinit(); | 414 | defer n.deinit(); |
| 403 | 415 | ||
| 404 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 416 | var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| @@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool { | |||
| 413 | return cp >= 0x1100 and hangul_map.syllableType(cp) != null; | 425 | return cp >= 0x1100 and hangul_map.syllableType(cp) != null; |
| 414 | } | 426 | } |
| 415 | 427 | ||
| 416 | fn isStarter(cp: u21) bool { | 428 | fn isNonHangulStarter(self: Self, cp: u21) bool { |
| 417 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; | 429 | return !isHangul(cp) and self.ccc_data.isStarter(cp); |
| 418 | } | ||
| 419 | |||
| 420 | fn isCombining(cp: u21) bool { | ||
| 421 | return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0; | ||
| 422 | } | ||
| 423 | |||
| 424 | fn isNonHangulStarter(cp: u21) bool { | ||
| 425 | return !isHangul(cp) and isStarter(cp); | ||
| 426 | } | 430 | } |
| 427 | 431 | ||
| 428 | /// Normalizes `str` to NFC. | 432 | /// Normalizes `str` to NFC. |
| @@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 464 | 468 | ||
| 465 | block_check: while (i < d_list.items.len) : (i += 1) { | 469 | block_check: while (i < d_list.items.len) : (i += 1) { |
| 466 | const C = d_list.items[i]; | 470 | const C = d_list.items[i]; |
| 467 | const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; | 471 | const cc_C = self.ccc_data.ccc(C); |
| 468 | var starter_index: ?usize = null; | 472 | var starter_index: ?usize = null; |
| 469 | var j: usize = i; | 473 | var j: usize = i; |
| 470 | 474 | ||
| @@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 472 | j -= 1; | 476 | j -= 1; |
| 473 | 477 | ||
| 474 | // Check for starter. | 478 | // Check for starter. |
| 475 | if (isStarter(d_list.items[j])) { | 479 | if (self.ccc_data.isStarter(d_list.items[j])) { |
| 476 | if (i - j > 1) { // If there's distance between the starting point and the current position. | 480 | if (i - j > 1) { // If there's distance between the starting point and the current position. |
| 477 | for (d_list.items[(j + 1)..i]) |B| { | 481 | for (d_list.items[(j + 1)..i]) |B| { |
| 482 | const cc_B = self.ccc_data.ccc(B); | ||
| 478 | // Check for blocking conditions. | 483 | // Check for blocking conditions. |
| 479 | if (isHangul(C)) { | 484 | if (isHangul(C)) { |
| 480 | if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; | 485 | if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; |
| 481 | } | 486 | } |
| 482 | const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)]; | ||
| 483 | if (cc_B >= cc_C) continue :block_check; | 487 | if (cc_B >= cc_C) continue :block_check; |
| 484 | } | 488 | } |
| 485 | } | 489 | } |
| @@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) ! | |||
| 560 | 564 | ||
| 561 | test "nfc" { | 565 | test "nfc" { |
| 562 | const allocator = std.testing.allocator; | 566 | const allocator = std.testing.allocator; |
| 563 | var n = try init(allocator); | 567 | var data = try Data.init(allocator); |
| 568 | defer data.deinit(); | ||
| 569 | var n = try init(allocator, &data); | ||
| 564 | defer n.deinit(); | 570 | defer n.deinit(); |
| 565 | 571 | ||
| 566 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 572 | var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| @@ -571,7 +577,9 @@ test "nfc" { | |||
| 571 | 577 | ||
| 572 | test "nfkc" { | 578 | test "nfkc" { |
| 573 | const allocator = std.testing.allocator; | 579 | const allocator = std.testing.allocator; |
| 574 | var n = try init(allocator); | 580 | var data = try Data.init(allocator); |
| 581 | defer data.deinit(); | ||
| 582 | var n = try init(allocator, &data); | ||
| 575 | defer n.deinit(); | 583 | defer n.deinit(); |
| 576 | 584 | ||
| 577 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 585 | var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| @@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u | |||
| 630 | 638 | ||
| 631 | test "eql" { | 639 | test "eql" { |
| 632 | const allocator = std.testing.allocator; | 640 | const allocator = std.testing.allocator; |
| 633 | var n = try init(allocator); | 641 | var data = try Data.init(allocator); |
| 642 | defer data.deinit(); | ||
| 643 | var n = try init(allocator, &data); | ||
| 634 | defer n.deinit(); | 644 | defer n.deinit(); |
| 635 | 645 | ||
| 636 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 646 | try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| @@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [ | |||
| 697 | 707 | ||
| 698 | test "eqlCaseless" { | 708 | test "eqlCaseless" { |
| 699 | const allocator = std.testing.allocator; | 709 | const allocator = std.testing.allocator; |
| 700 | var n = try init(allocator); | 710 | var data = try Data.init(allocator); |
| 711 | defer data.deinit(); | ||
| 712 | var n = try init(allocator, &data); | ||
| 701 | defer n.deinit(); | 713 | defer n.deinit(); |
| 702 | 714 | ||
| 703 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); | 715 | try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -707,7 +719,7 @@ test "eqlCaseless" { | |||
| 707 | // FCD | 719 | // FCD |
| 708 | fn getLeadCcc(self: Self, cp: u21) u8 { | 720 | fn getLeadCcc(self: Self, cp: u21) u8 { |
| 709 | const dc = self.mapping(cp, .nfd); | 721 | const dc = self.mapping(cp, .nfd); |
| 710 | return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; | 722 | return self.ccc_data.ccc(dc.cps[0]); |
| 711 | } | 723 | } |
| 712 | 724 | ||
| 713 | fn getTrailCcc(self: Self, cp: u21) u8 { | 725 | fn getTrailCcc(self: Self, cp: u21) u8 { |
| @@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 { | |||
| 715 | const len = for (dc.cps, 0..) |dcp, i| { | 727 | const len = for (dc.cps, 0..) |dcp, i| { |
| 716 | if (dcp == 0) break i; | 728 | if (dcp == 0) break i; |
| 717 | } else dc.cps.len; | 729 | } else dc.cps.len; |
| 718 | const tcp = dc.cps[len -| 1]; | 730 | return self.ccc_data.ccc(dc.cps[len - 1]); |
| 719 | return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)]; | ||
| 720 | } | 731 | } |
| 721 | 732 | ||
| 722 | /// Fast check to detect if a string is already in NFC or NFD form. | 733 | /// Fast check to detect if a string is already in NFC or NFD form. |
| @@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool { | |||
| 733 | 744 | ||
| 734 | test "isFcd" { | 745 | test "isFcd" { |
| 735 | const allocator = std.testing.allocator; | 746 | const allocator = std.testing.allocator; |
| 736 | var n = try init(allocator); | 747 | var data = try Data.init(allocator); |
| 748 | defer data.deinit(); | ||
| 749 | var n = try init(allocator, &data); | ||
| 737 | defer n.deinit(); | 750 | defer n.deinit(); |
| 738 | 751 | ||
| 739 | const is_nfc = "José \u{3D3}"; | 752 | const is_nfc = "José \u{3D3}"; |
| @@ -751,7 +764,9 @@ test "Unicode normalization tests" { | |||
| 751 | defer arena.deinit(); | 764 | defer arena.deinit(); |
| 752 | var allocator = arena.allocator(); | 765 | var allocator = arena.allocator(); |
| 753 | 766 | ||
| 754 | var n = try init(allocator); | 767 | var data = try Data.init(allocator); |
| 768 | defer data.deinit(); | ||
| 769 | var n = try init(allocator, &data); | ||
| 755 | defer n.deinit(); | 770 | defer n.deinit(); |
| 756 | 771 | ||
| 757 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 772 | var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
diff --git a/src/grapheme.zig b/src/grapheme.zig index 3fdf10b..7125b5b 100644 --- a/src/grapheme.zig +++ b/src/grapheme.zig | |||
| @@ -1,9 +1,10 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const mem = std.mem; | ||
| 2 | const unicode = std.unicode; | 3 | const unicode = std.unicode; |
| 3 | 4 | ||
| 4 | const CodePoint = @import("code_point").CodePoint; | 5 | const CodePoint = @import("code_point").CodePoint; |
| 5 | const CodePointIterator = @import("code_point").Iterator; | 6 | const CodePointIterator = @import("code_point").Iterator; |
| 6 | const gbp = @import("gbp"); | 7 | pub const Data = @import("GraphemeData"); |
| 7 | 8 | ||
| 8 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | 9 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. |
| 9 | pub const Grapheme = struct { | 10 | pub const Grapheme = struct { |
| @@ -21,12 +22,13 @@ pub const Grapheme = struct { | |||
| 21 | pub const Iterator = struct { | 22 | pub const Iterator = struct { |
| 22 | buf: [2]?CodePoint = .{ null, null }, | 23 | buf: [2]?CodePoint = .{ null, null }, |
| 23 | cp_iter: CodePointIterator, | 24 | cp_iter: CodePointIterator, |
| 25 | data: *Data, | ||
| 24 | 26 | ||
| 25 | const Self = @This(); | 27 | const Self = @This(); |
| 26 | 28 | ||
| 27 | /// Assumes `src` is valid UTF-8. | 29 | /// Assumes `src` is valid UTF-8. |
| 28 | pub fn init(str: []const u8) Self { | 30 | pub fn init(str: []const u8, data: *Data) Self { |
| 29 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; | 31 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; |
| 30 | self.advance(); | 32 | self.advance(); |
| 31 | return self; | 33 | return self; |
| 32 | } | 34 | } |
| @@ -55,6 +57,7 @@ pub const Iterator = struct { | |||
| 55 | if (graphemeBreak( | 57 | if (graphemeBreak( |
| 56 | self.buf[0].?.code, | 58 | self.buf[0].?.code, |
| 57 | self.buf[1].?.code, | 59 | self.buf[1].?.code, |
| 60 | self.data, | ||
| 58 | &state, | 61 | &state, |
| 59 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; | 62 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; |
| 60 | 63 | ||
| @@ -67,6 +70,7 @@ pub const Iterator = struct { | |||
| 67 | if (graphemeBreak( | 70 | if (graphemeBreak( |
| 68 | self.buf[0].?.code, | 71 | self.buf[0].?.code, |
| 69 | if (self.buf[1]) |ncp| ncp.code else 0, | 72 | if (self.buf[1]) |ncp| ncp.code else 0, |
| 73 | self.data, | ||
| 70 | &state, | 74 | &state, |
| 71 | )) break; | 75 | )) break; |
| 72 | } | 76 | } |
| @@ -76,18 +80,12 @@ pub const Iterator = struct { | |||
| 76 | }; | 80 | }; |
| 77 | 81 | ||
| 78 | // Predicates | 82 | // Predicates |
| 79 | fn isBreaker(cp: u21) bool { | 83 | fn isBreaker(cp: u21, data: *Data) bool { |
| 80 | // Extract relevant properties. | 84 | // Extract relevant properties. |
| 81 | const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | 85 | const cp_gbp_prop = data.gbp(cp); |
| 82 | const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4); | ||
| 83 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | 86 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; |
| 84 | } | 87 | } |
| 85 | 88 | ||
| 86 | fn isIgnorable(cp: u21) bool { | ||
| 87 | const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | ||
| 88 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | ||
| 89 | } | ||
| 90 | |||
| 91 | // Grapheme break state. | 89 | // Grapheme break state. |
| 92 | const State = struct { | 90 | const State = struct { |
| 93 | bits: u3 = 0, | 91 | bits: u3 = 0, |
| @@ -135,18 +133,17 @@ const State = struct { | |||
| 135 | pub fn graphemeBreak( | 133 | pub fn graphemeBreak( |
| 136 | cp1: u21, | 134 | cp1: u21, |
| 137 | cp2: u21, | 135 | cp2: u21, |
| 136 | data: *Data, | ||
| 138 | state: *State, | 137 | state: *State, |
| 139 | ) bool { | 138 | ) bool { |
| 140 | // Extract relevant properties. | 139 | // Extract relevant properties. |
| 141 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | 140 | const cp1_gbp_prop = data.gbp(cp1); |
| 142 | const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); | 141 | const cp1_indic_prop = data.indic(cp1); |
| 143 | const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); | 142 | const cp1_is_emoji = data.isEmoji(cp1); |
| 144 | const cp1_is_emoji = cp1_props_byte & 1 == 1; | ||
| 145 | 143 | ||
| 146 | const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | 144 | const cp2_gbp_prop = data.gbp(cp2); |
| 147 | const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); | 145 | const cp2_indic_prop = data.indic(cp2); |
| 148 | const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); | 146 | const cp2_is_emoji = data.isEmoji(cp2); |
| 149 | const cp2_is_emoji = cp2_props_byte & 1 == 1; | ||
| 150 | 147 | ||
| 151 | // GB11: Emoji Extend* ZWJ x Emoji | 148 | // GB11: Emoji Extend* ZWJ x Emoji |
| 152 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | 149 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); |
| @@ -157,7 +154,7 @@ pub fn graphemeBreak( | |||
| 157 | if (cp1 == '\r' and cp2 == '\n') return false; | 154 | if (cp1 == '\r' and cp2 == '\n') return false; |
| 158 | 155 | ||
| 159 | // GB4: Control | 156 | // GB4: Control |
| 160 | if (isBreaker(cp1)) return true; | 157 | if (isBreaker(cp1, data)) return true; |
| 161 | 158 | ||
| 162 | // GB11: Emoji Extend* ZWJ x Emoji | 159 | // GB11: Emoji Extend* ZWJ x Emoji |
| 163 | if (state.hasXpic() and | 160 | if (state.hasXpic() and |
| @@ -175,7 +172,7 @@ pub fn graphemeBreak( | |||
| 175 | if (cp2_gbp_prop == .SpacingMark) return false; | 172 | if (cp2_gbp_prop == .SpacingMark) return false; |
| 176 | 173 | ||
| 177 | // GB9b: Prepend x | 174 | // GB9b: Prepend x |
| 178 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; | 175 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; |
| 179 | 176 | ||
| 180 | // GB12, GB13: RI x RI | 177 | // GB12, GB13: RI x RI |
| 181 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 178 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| @@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" { | |||
| 240 | var buf_reader = std.io.bufferedReader(file.reader()); | 237 | var buf_reader = std.io.bufferedReader(file.reader()); |
| 241 | var input_stream = buf_reader.reader(); | 238 | var input_stream = buf_reader.reader(); |
| 242 | 239 | ||
| 240 | var data = try Data.init(allocator); | ||
| 241 | defer data.deinit(); | ||
| 242 | |||
| 243 | var buf: [4096]u8 = undefined; | 243 | var buf: [4096]u8 = undefined; |
| 244 | var line_no: usize = 1; | 244 | var line_no: usize = 1; |
| 245 | 245 | ||
| @@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" { | |||
| 282 | } | 282 | } |
| 283 | 283 | ||
| 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); |
| 285 | var iter = Iterator.init(all_bytes.items); | 285 | var iter = Iterator.init(all_bytes.items, &data); |
| 286 | 286 | ||
| 287 | // Check. | 287 | // Check. |
| 288 | for (want.items) |want_gc| { | 288 | for (want.items) |want_gc| { |
| @@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" { | |||
| 295 | } | 295 | } |
| 296 | } | 296 | } |
| 297 | 297 | ||
| 298 | test "Segmentation comptime GraphemeIterator" { | ||
| 299 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 300 | |||
| 301 | comptime { | ||
| 302 | const src = "Héllo"; | ||
| 303 | var ct_iter = Iterator.init(src); | ||
| 304 | var i = 0; | ||
| 305 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 306 | try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); | ||
| 307 | } | ||
| 308 | } | ||
| 309 | } | ||
| 310 | |||
| 311 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 298 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 312 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 299 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| 313 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 300 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| @@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" { | |||
| 315 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | 302 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; |
| 316 | const no_joiner = seq_1 ++ seq_2; | 303 | const no_joiner = seq_1 ++ seq_2; |
| 317 | 304 | ||
| 318 | var ct_iter = Iterator.init(with_zwj); | 305 | var data = try Data.init(std.testing.allocator); |
| 306 | defer data.deinit(); | ||
| 307 | |||
| 308 | var iter = Iterator.init(with_zwj, &data); | ||
| 309 | |||
| 319 | var i: usize = 0; | 310 | var i: usize = 0; |
| 320 | while (ct_iter.next()) |_| : (i += 1) {} | 311 | while (iter.next()) |_| : (i += 1) {} |
| 321 | try std.testing.expectEqual(@as(usize, 1), i); | 312 | try std.testing.expectEqual(@as(usize, 1), i); |
| 322 | 313 | ||
| 323 | ct_iter = Iterator.init(with_zwsp); | 314 | iter = Iterator.init(with_zwsp, &data); |
| 324 | i = 0; | 315 | i = 0; |
| 325 | while (ct_iter.next()) |_| : (i += 1) {} | 316 | while (iter.next()) |_| : (i += 1) {} |
| 326 | try std.testing.expectEqual(@as(usize, 3), i); | 317 | try std.testing.expectEqual(@as(usize, 3), i); |
| 327 | 318 | ||
| 328 | ct_iter = Iterator.init(no_joiner); | 319 | iter = Iterator.init(no_joiner, &data); |
| 329 | i = 0; | 320 | i = 0; |
| 330 | while (ct_iter.next()) |_| : (i += 1) {} | 321 | while (iter.next()) |_| : (i += 1) {} |
| 331 | try std.testing.expectEqual(@as(usize, 2), i); | 322 | try std.testing.expectEqual(@as(usize, 2), i); |
| 332 | } | 323 | } |
diff --git a/src/main.zig b/src/main.zig index 946ae01..57db05b 100644 --- a/src/main.zig +++ b/src/main.zig | |||
| @@ -1,29 +1,47 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | 2 | ||
| 3 | // const GraphemeIterator = @import("ziglyph").GraphemeIterator; | 3 | // const GraphemeIterator = @import("ziglyph").GraphemeIterator; |
| 4 | // const GraphemeIterator = @import("Grapheme").GraphemeIterator; | 4 | // const Data = @import("grapheme").Data; |
| 5 | // const GraphemeIterator = @import("grapheme").Iterator; | ||
| 6 | |||
| 5 | // const codePointWidth = @import("ziglyph").display_width.codePointWidth; | 7 | // const codePointWidth = @import("ziglyph").display_width.codePointWidth; |
| 6 | // const codePointWidth = @import("display_width").codePointWidth; | ||
| 7 | // const strWidth = @import("ziglyph").display_width.strWidth; | 8 | // const strWidth = @import("ziglyph").display_width.strWidth; |
| 9 | // const Data = @import("display_width").Data; | ||
| 10 | // const codePointWidth = @import("display_width").codePointWidth; | ||
| 8 | // const strWidth = @import("display_width").strWidth; | 11 | // const strWidth = @import("display_width").strWidth; |
| 9 | // const CodePointIterator = @import("CodePoint").CodePointIterator; | 12 | |
| 13 | // const CodePointIterator = @import("ziglyph").CodePointIterator; | ||
| 14 | // const CodePointIterator = @import("code_point").Iterator; | ||
| 15 | |||
| 10 | // const ascii = @import("ascii"); | 16 | // const ascii = @import("ascii"); |
| 11 | // const ascii = std.ascii; | 17 | // const ascii = std.ascii; |
| 18 | |||
| 12 | // const norm = @import("ziglyph").Normalizer; | 19 | // const norm = @import("ziglyph").Normalizer; |
| 20 | const Data = @import("Normalizer").Data; | ||
| 13 | const norm = @import("Normalizer"); | 21 | const norm = @import("Normalizer"); |
| 14 | 22 | ||
| 15 | pub fn main() !void { | 23 | pub fn main() !void { |
| 24 | var args_iter = std.process.args(); | ||
| 25 | _ = args_iter.skip(); | ||
| 26 | const in_path = args_iter.next() orelse return error.MissingArg; | ||
| 27 | |||
| 16 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; | 28 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; |
| 17 | defer _ = gpa.deinit(); | 29 | defer _ = gpa.deinit(); |
| 18 | const allocator = gpa.allocator(); | 30 | const allocator = gpa.allocator(); |
| 19 | 31 | ||
| 20 | const input = try std.fs.cwd().readFileAlloc(allocator, "data/lang_mix.txt", std.math.maxInt(u32)); | 32 | const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32)); |
| 21 | defer allocator.free(input); | 33 | defer allocator.free(input); |
| 22 | 34 | ||
| 23 | var n = try norm.init(allocator); | 35 | var data = try Data.init(allocator); |
| 36 | defer data.deinit(); | ||
| 37 | |||
| 38 | var n = try norm.init(allocator, &data); | ||
| 24 | defer n.deinit(); | 39 | defer n.deinit(); |
| 40 | // var n = try norm.init(allocator); | ||
| 41 | // defer n.deinit(); | ||
| 25 | 42 | ||
| 26 | // var iter = GraphemeIterator.init(input); | 43 | // var iter = GraphemeIterator.init(input, &data); |
| 44 | // defer iter.deinit(); | ||
| 27 | // var iter = CodePointIterator{ .bytes = input }; | 45 | // var iter = CodePointIterator{ .bytes = input }; |
| 28 | var iter = std.mem.splitScalar(u8, input, '\n'); | 46 | var iter = std.mem.splitScalar(u8, input, '\n'); |
| 29 | 47 | ||
| @@ -33,7 +51,7 @@ pub fn main() !void { | |||
| 33 | 51 | ||
| 34 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); | 52 | // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); |
| 35 | // while (iter.next()) |_| result += 1; | 53 | // while (iter.next()) |_| result += 1; |
| 36 | // while (iter.next()) |line| result += strWidth(line); | 54 | // while (iter.next()) |line| result += strWidth(line, &data); |
| 37 | while (iter.next()) |line| { | 55 | while (iter.next()) |line| { |
| 38 | var nfc = try n.nfc(allocator, line); | 56 | var nfc = try n.nfc(allocator, line); |
| 39 | result += nfc.slice.len; | 57 | result += nfc.slice.len; |