summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build.zig71
-rw-r--r--codegen/ccc.zig (renamed from codegen/normp.zig)25
-rw-r--r--codegen/dwp.zig23
-rw-r--r--codegen/gbp.zig68
-rw-r--r--src/CombiningClassData.zig48
-rw-r--r--src/DisplayWidth.zig (renamed from src/display_width.zig)205
-rw-r--r--src/DisplayWidthData.zig82
-rw-r--r--src/GraphemeData.zig86
-rw-r--r--src/Normalizer.zig97
-rw-r--r--src/grapheme.zig73
-rw-r--r--src/main.zig32
11 files changed, 514 insertions, 296 deletions
diff --git a/build.zig b/build.zig
index def8b24..7cfb979 100644
--- a/build.zig
+++ b/build.zig
@@ -16,7 +16,7 @@ pub fn build(b: *std.Build) void {
16 .optimize = .Debug, 16 .optimize = .Debug,
17 }); 17 });
18 const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe); 18 const run_gbp_gen_exe = b.addRunArtifact(gbp_gen_exe);
19 const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.zig"); 19 const gbp_gen_out = run_gbp_gen_exe.addOutputFileArg("gbp.bin.z");
20 20
21 // Display width 21 // Display width
22 const cjk = b.option(bool, "cjk", "Ambiguouse code points are wide (display width: 2).") orelse false; 22 const cjk = b.option(bool, "cjk", "Ambiguouse code points are wide (display width: 2).") orelse false;
@@ -31,17 +31,17 @@ pub fn build(b: *std.Build) void {
31 }); 31 });
32 dwp_gen_exe.root_module.addOptions("options", options); 32 dwp_gen_exe.root_module.addOptions("options", options);
33 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); 33 const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe);
34 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.zig"); 34 const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z");
35 35
36 // Normalization properties 36 // Normalization properties
37 const normp_gen_exe = b.addExecutable(.{ 37 const ccc_gen_exe = b.addExecutable(.{
38 .name = "normp", 38 .name = "ccc",
39 .root_source_file = .{ .path = "codegen/normp.zig" }, 39 .root_source_file = .{ .path = "codegen/ccc.zig" },
40 .target = b.host, 40 .target = b.host,
41 .optimize = .Debug, 41 .optimize = .Debug,
42 }); 42 });
43 const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); 43 const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe);
44 const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.zig"); 44 const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z");
45 45
46 // Modules we provide 46 // Modules we provide
47 // Code points 47 // Code points
@@ -52,13 +52,20 @@ pub fn build(b: *std.Build) void {
52 }); 52 });
53 53
54 // Grapheme clusters 54 // Grapheme clusters
55 const grapheme_data = b.createModule(.{
56 .root_source_file = .{ .path = "src/GraphemeData.zig" },
57 .target = target,
58 .optimize = optimize,
59 });
60 grapheme_data.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out });
61
55 const grapheme = b.addModule("grapheme", .{ 62 const grapheme = b.addModule("grapheme", .{
56 .root_source_file = .{ .path = "src/grapheme.zig" }, 63 .root_source_file = .{ .path = "src/grapheme.zig" },
57 .target = target, 64 .target = target,
58 .optimize = optimize, 65 .optimize = optimize,
59 }); 66 });
60 grapheme.addImport("code_point", code_point); 67 grapheme.addImport("code_point", code_point);
61 grapheme.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); 68 grapheme.addImport("GraphemeData", grapheme_data);
62 69
63 // ASCII utilities 70 // ASCII utilities
64 const ascii = b.addModule("ascii", .{ 71 const ascii = b.addModule("ascii", .{
@@ -68,17 +75,32 @@ pub fn build(b: *std.Build) void {
68 }); 75 });
69 76
70 // Fixed pitch font display width 77 // Fixed pitch font display width
71 const display_width = b.addModule("display_width", .{ 78 const dw_data = b.createModule(.{
72 .root_source_file = .{ .path = "src/display_width.zig" }, 79 .root_source_file = .{ .path = "src/DisplayWidthData.zig" },
80 .target = target,
81 .optimize = optimize,
82 });
83 dw_data.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out });
84 dw_data.addImport("GraphemeData", grapheme_data);
85
86 const display_width = b.addModule("DisplayWidth", .{
87 .root_source_file = .{ .path = "src/DisplayWidth.zig" },
73 .target = target, 88 .target = target,
74 .optimize = optimize, 89 .optimize = optimize,
75 }); 90 });
76 display_width.addImport("ascii", ascii); 91 display_width.addImport("ascii", ascii);
77 display_width.addImport("code_point", code_point); 92 display_width.addImport("code_point", code_point);
78 display_width.addImport("grapheme", grapheme); 93 display_width.addImport("grapheme", grapheme);
79 display_width.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); 94 display_width.addImport("DisplayWidthData", dw_data);
80 95
81 // Normalization 96 // Normalization
97 const ccc_data = b.createModule(.{
98 .root_source_file = .{ .path = "src/CombiningClassData.zig" },
99 .target = target,
100 .optimize = optimize,
101 });
102 ccc_data.addAnonymousImport("ccc", .{ .root_source_file = ccc_gen_out });
103
82 const norm = b.addModule("Normalizer", .{ 104 const norm = b.addModule("Normalizer", .{
83 .root_source_file = .{ .path = "src/Normalizer.zig" }, 105 .root_source_file = .{ .path = "src/Normalizer.zig" },
84 .target = target, 106 .target = target,
@@ -86,7 +108,7 @@ pub fn build(b: *std.Build) void {
86 }); 108 });
87 norm.addImport("code_point", code_point); 109 norm.addImport("code_point", code_point);
88 norm.addImport("ziglyph", ziglyph.module("ziglyph")); 110 norm.addImport("ziglyph", ziglyph.module("ziglyph"));
89 norm.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); 111 norm.addImport("CombiningClassData", ccc_data);
90 112
91 // Benchmark rig 113 // Benchmark rig
92 const exe = b.addExecutable(.{ 114 const exe = b.addExecutable(.{
@@ -95,11 +117,11 @@ pub fn build(b: *std.Build) void {
95 .target = target, 117 .target = target,
96 .optimize = optimize, 118 .optimize = optimize,
97 }); 119 });
98 exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 120 // exe.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));
99 exe.root_module.addImport("ascii", ascii); 121 // exe.root_module.addImport("ascii", ascii);
100 exe.root_module.addImport("code_point", code_point); 122 // exe.root_module.addImport("code_point", code_point);
101 exe.root_module.addImport("grapheme", grapheme); 123 // exe.root_module.addImport("grapheme", grapheme);
102 exe.root_module.addImport("display_width", display_width); 124 // exe.root_module.addImport("DisplayWidth", display_width);
103 exe.root_module.addImport("Normalizer", norm); 125 exe.root_module.addImport("Normalizer", norm);
104 b.installArtifact(exe); 126 b.installArtifact(exe);
105 127
@@ -112,17 +134,18 @@ pub fn build(b: *std.Build) void {
112 134
113 // Tests 135 // Tests
114 const exe_unit_tests = b.addTest(.{ 136 const exe_unit_tests = b.addTest(.{
115 .root_source_file = .{ .path = "src/Normalizer.zig" }, 137 .root_source_file = .{ .path = "src/DisplayWidth.zig" },
116 .target = target, 138 .target = target,
117 .optimize = optimize, 139 .optimize = optimize,
118 }); 140 });
119 // exe_unit_tests.root_module.addImport("ascii", ascii); 141 exe_unit_tests.root_module.addImport("ascii", ascii);
120 exe_unit_tests.root_module.addImport("code_point", code_point); 142 exe_unit_tests.root_module.addImport("code_point", code_point);
121 // exe_unit_tests.root_module.addImport("grapheme", grapheme); 143 // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data);
122 // exe_unit_tests.root_module.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); 144 exe_unit_tests.root_module.addImport("grapheme", grapheme);
123 // exe_unit_tests.root_module.addAnonymousImport("dwp", .{ .root_source_file = dwp_gen_out }); 145 // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));
124 exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 146 // exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out });
125 exe_unit_tests.root_module.addAnonymousImport("normp", .{ .root_source_file = normp_gen_out }); 147 exe_unit_tests.root_module.addImport("DisplayWidthData", dw_data);
148 // exe_unit_tests.root_module.addImport("CombiningClassData", ccc_data);
126 149
127 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); 150 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
128 151
diff --git a/codegen/normp.zig b/codegen/ccc.zig
index 25af65c..93da6a0 100644
--- a/codegen/normp.zig
+++ b/codegen/ccc.zig
@@ -1,6 +1,5 @@
1const std = @import("std"); 1const std = @import("std");
2 2const builtin = @import("builtin");
3const options = @import("options");
4 3
5const block_size = 256; 4const block_size = 256;
6const Block = [block_size]u8; 5const Block = [block_size]u8;
@@ -108,21 +107,19 @@ pub fn main() !void {
108 _ = args_iter.skip(); 107 _ = args_iter.skip();
109 const output_path = args_iter.next() orelse @panic("No output file arg!"); 108 const output_path = args_iter.next() orelse @panic("No output file arg!");
110 109
110 const compressor = std.compress.deflate.compressor;
111 var out_file = try std.fs.cwd().createFile(output_path, .{}); 111 var out_file = try std.fs.cwd().createFile(output_path, .{});
112 defer out_file.close(); 112 defer out_file.close();
113 var out_buf = std.io.bufferedWriter(out_file.writer()); 113 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
114 const writer = out_buf.writer(); 114 defer out_comp.deinit();
115 115 const writer = out_comp.writer();
116 try writer.writeAll("const std = @import(\"std\");\n");
117 116
118 try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); 117 const endian = builtin.cpu.arch.endian();
119 try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); 118 try writer.writeInt(u16, @intCast(stage1.items.len), endian);
120 for (stage1.items) |v| try writer.print("{},", .{v}); 119 for (stage1.items) |i| try writer.writeInt(u16, i, endian);
121 try writer.writeAll("};\n");
122 120
123 try writer.print("pub const stage_2 = [{}]u8{{", .{stage2.items.len}); 121 try writer.writeInt(u16, @intCast(stage2.items.len), endian);
124 for (stage2.items) |v| try writer.print("{},", .{v}); 122 try writer.writeAll(stage2.items);
125 try writer.writeAll("};\n");
126 123
127 try out_buf.flush(); 124 try out_comp.flush();
128} 125}
diff --git a/codegen/dwp.zig b/codegen/dwp.zig
index 9e387c6..76a14d3 100644
--- a/codegen/dwp.zig
+++ b/codegen/dwp.zig
@@ -1,4 +1,5 @@
1const std = @import("std"); 1const std = @import("std");
2const builtin = @import("builtin");
2 3
3const options = @import("options"); 4const options = @import("options");
4 5
@@ -229,21 +230,19 @@ pub fn main() !void {
229 _ = args_iter.skip(); 230 _ = args_iter.skip();
230 const output_path = args_iter.next() orelse @panic("No output file arg!"); 231 const output_path = args_iter.next() orelse @panic("No output file arg!");
231 232
233 const compressor = std.compress.deflate.compressor;
232 var out_file = try std.fs.cwd().createFile(output_path, .{}); 234 var out_file = try std.fs.cwd().createFile(output_path, .{});
233 defer out_file.close(); 235 defer out_file.close();
234 var out_buf = std.io.bufferedWriter(out_file.writer()); 236 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
235 const writer = out_buf.writer(); 237 defer out_comp.deinit();
238 const writer = out_comp.writer();
236 239
237 try writer.writeAll("const std = @import(\"std\");\n"); 240 const endian = builtin.cpu.arch.endian();
241 try writer.writeInt(u16, @intCast(stage1.items.len), endian);
242 for (stage1.items) |i| try writer.writeInt(u16, i, endian);
238 243
239 try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len}); 244 try writer.writeInt(u16, @intCast(stage2.items.len), endian);
240 try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len}); 245 for (stage2.items) |i| try writer.writeInt(i8, i, endian);
241 for (stage1.items) |v| try writer.print("{},", .{v});
242 try writer.writeAll("};\n");
243 246
244 try writer.print("pub const stage_2 = [{}]i3{{", .{stage2.items.len}); 247 try out_comp.flush();
245 for (stage2.items) |v| try writer.print("{},", .{v});
246 try writer.writeAll("};\n");
247
248 try out_buf.flush();
249} 248}
diff --git a/codegen/gbp.zig b/codegen/gbp.zig
index 3bd9a4d..39e0da3 100644
--- a/codegen/gbp.zig
+++ b/codegen/gbp.zig
@@ -1,4 +1,5 @@
1const std = @import("std"); 1const std = @import("std");
2const builtin = @import("builtin");
2 3
3const Indic = enum { 4const Indic = enum {
4 none, 5 none,
@@ -226,56 +227,23 @@ pub fn main() !void {
226 _ = args_iter.skip(); 227 _ = args_iter.skip();
227 const output_path = args_iter.next() orelse @panic("No output file arg!"); 228 const output_path = args_iter.next() orelse @panic("No output file arg!");
228 229
230 const compressor = std.compress.deflate.compressor;
229 var out_file = try std.fs.cwd().createFile(output_path, .{}); 231 var out_file = try std.fs.cwd().createFile(output_path, .{});
230 defer out_file.close(); 232 defer out_file.close();
231 var out_buf = std.io.bufferedWriter(out_file.writer()); 233 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
232 const writer = out_buf.writer(); 234 defer out_comp.deinit();
233 235 const writer = out_comp.writer();
234 const prop_code = 236
235 \\const std = @import("std"); 237 const endian = builtin.cpu.arch.endian();
236 \\ 238 try writer.writeInt(u16, @intCast(stage1.items.len), endian);
237 \\pub const Indic = enum { 239 for (stage1.items) |i| try writer.writeInt(u16, i, endian);
238 \\ none, 240
239 \\ 241 try writer.writeInt(u16, @intCast(stage2.items.len), endian);
240 \\ Consonant, 242 for (stage2.items) |i| try writer.writeInt(u16, i, endian);
241 \\ Extend, 243
242 \\ Linker, 244 const props_bytes = stage3.keys();
243 \\}; 245 try writer.writeInt(u16, @intCast(props_bytes.len), endian);
244 \\ 246 try writer.writeAll(props_bytes);
245 \\pub const Gbp = enum { 247
246 \\ none, 248 try out_comp.flush();
247 \\ Control,
248 \\ CR,
249 \\ Extend,
250 \\ L,
251 \\ LF,
252 \\ LV,
253 \\ LVT,
254 \\ Prepend,
255 \\ Regional_Indicator,
256 \\ SpacingMark,
257 \\ T,
258 \\ V,
259 \\ ZWJ,
260 \\};
261 \\
262 ;
263
264 try writer.writeAll(prop_code);
265
266 try writer.print("const Stage2Int = std.math.IntFittingRange(0, {});\n", .{stage2.items.len});
267 try writer.print("pub const stage_1 = [{}]Stage2Int{{", .{stage1.items.len});
268 for (stage1.items) |v| try writer.print("{},", .{v});
269 try writer.writeAll("};\n");
270
271 try writer.print("const Stage3Int = std.math.IntFittingRange(0, {});\n", .{stage3_len});
272 try writer.print("pub const stage_2 = [{}]Stage3Int{{", .{stage2.items.len});
273 for (stage2.items) |v| try writer.print("{},", .{v});
274 try writer.writeAll("};\n");
275
276 try writer.print("pub const stage_3 = [{}]u8{{", .{stage3_len});
277 for (stage3.keys()) |v| try writer.print("{},", .{v});
278 try writer.writeAll("};\n");
279
280 try out_buf.flush();
281} 249}
diff --git a/src/CombiningClassData.zig b/src/CombiningClassData.zig
new file mode 100644
index 0000000..95c947d
--- /dev/null
+++ b/src/CombiningClassData.zig
@@ -0,0 +1,48 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
6allocator: mem.Allocator,
7s1: []u16 = undefined,
8s2: []u8 = undefined,
9
10const Self = @This();
11
/// Decompresses the embedded `ccc` table data and loads the two lookup
/// stages into slices allocated with `allocator`.
///
/// The stream layout read here is: a `u16` stage-1 length, that many `u16`
/// stage-1 entries, a `u16` stage-2 length, then that many raw stage-2 bytes.
///
/// On success the caller owns the returned value and must release it with
/// `deinit`. On failure nothing allocated here is left behind.
/// Returns any allocation, decompression, or read error, including
/// `error.EndOfStream` when the embedded data is shorter than its headers
/// claim.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("ccc");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    // NOTE(review): integers are decoded in the target's native byte order.
    // The generator appears to encode them in its own native order, so a
    // cross-endian build would misread the table - confirm before relying
    // on cross-compilation.
    const endian = builtin.cpu.arch.endian();

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    const s1 = try allocator.alloc(u16, stage_1_len);
    // Every later `try` can fail; without this the stage-1 slice would leak.
    errdefer allocator.free(s1);
    for (s1) |*entry| entry.* = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    const s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(s2);
    // Require the full stage-2 payload; a short read would otherwise leave
    // part of the slice uninitialized while still reporting success.
    try reader.readNoEof(s2);

    return Self{
        .allocator = allocator,
        .s1 = s1,
        .s2 = s2,
    };
}
34
/// Frees the stage-1 and stage-2 slices using the allocator stored in
/// `self`. The value must not be used for lookups afterwards.
pub fn deinit(self: *Self) void {
    const owner = self.allocator;
    owner.free(self.s1);
    owner.free(self.s2);
}
39
/// Returns the canonical combining class for the code point `cp`.
///
/// Two-stage lookup: the high bits of `cp` (`cp >> 8`) select an entry in
/// stage 1, which is the start offset of a block in stage 2; the low 8 bits
/// select the entry within that block.
/// NOTE(review): assumes the loaded tables cover every `u21` value passed
/// in; out-of-range indices are only caught by slice bounds checks.
pub inline fn ccc(self: Self, cp: u21) u8 {
    const block_start = self.s1[cp >> 8];
    const offset_in_block = cp & 0xff;
    return self.s2[block_start + offset_in_block];
}
44
/// Reports whether `cp` is a starter, i.e. not a combining character.
///
/// Uses the same two-stage table lookup as `ccc` and returns true when the
/// stored combining class is zero.
pub inline fn isStarter(self: Self, cp: u21) bool {
    const block_start = self.s1[cp >> 8];
    const class = self.s2[block_start + (cp & 0xff)];
    return class == 0;
}
diff --git a/src/display_width.zig b/src/DisplayWidth.zig
index a916cac..85d04a0 100644
--- a/src/display_width.zig
+++ b/src/DisplayWidth.zig
@@ -1,68 +1,38 @@
1const std = @import("std"); 1const std = @import("std");
2const simd = std.simd; 2const builtin = @import("builtin");
3const ArrayList = std.ArrayList;
3const mem = std.mem; 4const mem = std.mem;
5const simd = std.simd;
4const testing = std.testing; 6const testing = std.testing;
5 7
6const ascii = @import("ascii"); 8const ascii = @import("ascii");
7const CodePointIterator = @import("code_point").Iterator; 9const CodePointIterator = @import("code_point").Iterator;
8const dwp = @import("dwp");
9const GraphemeIterator = @import("grapheme").Iterator; 10const GraphemeIterator = @import("grapheme").Iterator;
11pub const Data = @import("DisplayWidthData");
10 12
11/// codePointWidth returns the number of cells `cp` requires when rendered 13data: *Data,
12/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
13/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
14/// control codes return 0. If `cjk` is true, ambiguous code points return 2,
15/// otherwise they return 1.
16pub fn codePointWidth(cp: u21) i3 {
17 return dwp.stage_2[dwp.stage_1[cp >> 8] + (cp & 0xff)];
18}
19 14
20test "codePointWidth" { 15const Self = @This();
21 try testing.expectEqual(@as(i3, 0), codePointWidth(0x0000)); // null
22 try testing.expectEqual(@as(i3, -1), codePointWidth(0x8)); // \b
23 try testing.expectEqual(@as(i3, -1), codePointWidth(0x7f)); // DEL
24 try testing.expectEqual(@as(i3, 0), codePointWidth(0x0005)); // Cf
25 try testing.expectEqual(@as(i3, 0), codePointWidth(0x0007)); // \a BEL
26 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000A)); // \n LF
27 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000B)); // \v VT
28 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000C)); // \f FF
29 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000D)); // \r CR
30 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000E)); // SQ
31 try testing.expectEqual(@as(i3, 0), codePointWidth(0x000F)); // SI
32
33 try testing.expectEqual(@as(i3, 0), codePointWidth(0x070F)); // Cf
34 try testing.expectEqual(@as(i3, 1), codePointWidth(0x0603)); // Cf Arabic
35
36 try testing.expectEqual(@as(i3, 1), codePointWidth(0x00AD)); // soft-hyphen
37 try testing.expectEqual(@as(i3, 2), codePointWidth(0x2E3A)); // two-em dash
38 try testing.expectEqual(@as(i3, 3), codePointWidth(0x2E3B)); // three-em dash
39
40 try testing.expectEqual(@as(i3, 1), codePointWidth(0x00BD)); // ambiguous halfwidth
41
42 try testing.expectEqual(@as(i3, 1), codePointWidth('é'));
43 try testing.expectEqual(@as(i3, 2), codePointWidth('😊'));
44 try testing.expectEqual(@as(i3, 2), codePointWidth('统'));
45}
46 16
47/// strWidth returns the total display width of `str` as the number of cells 17/// strWidth returns the total display width of `str` as the number of cells
48/// required in a fixed-pitch font (i.e. a terminal screen). 18/// required in a fixed-pitch font (i.e. a terminal screen).
49pub fn strWidth(str: []const u8) usize { 19pub fn strWidth(self: Self, str: []const u8) usize {
50 var total: isize = 0; 20 var total: isize = 0;
51 21
52 // ASCII fast path 22 // ASCII fast path
53 if (ascii.isAsciiOnly(str)) { 23 if (ascii.isAsciiOnly(str)) {
54 for (str) |b| total += codePointWidth(b); 24 for (str) |b| total += self.data.codePointWidth(b);
55 return @intCast(@max(0, total)); 25 return @intCast(@max(0, total));
56 } 26 }
57 27
58 var giter = GraphemeIterator.init(str); 28 var giter = GraphemeIterator.init(str, &self.data.g_data);
59 29
60 while (giter.next()) |gc| { 30 while (giter.next()) |gc| {
61 var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; 31 var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) };
62 var gc_total: isize = 0; 32 var gc_total: isize = 0;
63 33
64 while (cp_iter.next()) |cp| { 34 while (cp_iter.next()) |cp| {
65 var w = codePointWidth(cp.code); 35 var w = self.data.codePointWidth(cp.code);
66 36
67 if (w != 0) { 37 if (w != 0) {
68 // Handle text emoji sequence. 38 // Handle text emoji sequence.
@@ -86,31 +56,35 @@ pub fn strWidth(str: []const u8) usize {
86} 56}
87 57
88test "strWidth" { 58test "strWidth" {
89 try testing.expectEqual(@as(usize, 5), strWidth("Hello\r\n")); 59 var data = try Data.init(testing.allocator);
90 try testing.expectEqual(@as(usize, 1), strWidth("\u{0065}\u{0301}")); 60 defer data.deinit();
91 try testing.expectEqual(@as(usize, 2), strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}")); 61 const self = Self{ .data = &data };
92 try testing.expectEqual(@as(usize, 8), strWidth("Hello 😊")); 62
93 try testing.expectEqual(@as(usize, 8), strWidth("Héllo 😊")); 63 try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n"));
94 try testing.expectEqual(@as(usize, 8), strWidth("Héllo :)")); 64 try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}"));
95 try testing.expectEqual(@as(usize, 8), strWidth("Héllo 🇪🇸")); 65 try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}"));
96 try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}")); // Lone emoji 66 try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊"));
97 try testing.expectEqual(@as(usize, 1), strWidth("\u{26A1}\u{FE0E}")); // Text sequence 67 try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊"));
98 try testing.expectEqual(@as(usize, 2), strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence 68 try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)"));
99 try testing.expectEqual(@as(usize, 0), strWidth("A\x08")); // Backspace 69 try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸"));
100 try testing.expectEqual(@as(usize, 0), strWidth("\x7FA")); // DEL 70 try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji
101 try testing.expectEqual(@as(usize, 0), strWidth("\x7FA\x08\x08")); // never less than o 71 try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence
72 try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence
73 try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace
74 try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL
75 try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than o
102 76
103 // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py 77 // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py
104 const empty = ""; 78 const empty = "";
105 try testing.expectEqual(@as(usize, 0), strWidth(empty)); 79 try testing.expectEqual(@as(usize, 0), self.strWidth(empty));
106 const with_null = "hello\x00world"; 80 const with_null = "hello\x00world";
107 try testing.expectEqual(@as(usize, 10), strWidth(with_null)); 81 try testing.expectEqual(@as(usize, 10), self.strWidth(with_null));
108 const hello_jp = "コンニチハ, セカイ!"; 82 const hello_jp = "コンニチハ, セカイ!";
109 try testing.expectEqual(@as(usize, 19), strWidth(hello_jp)); 83 try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp));
110 const control = "\x1b[0m"; 84 const control = "\x1b[0m";
111 try testing.expectEqual(@as(usize, 3), strWidth(control)); 85 try testing.expectEqual(@as(usize, 3), self.strWidth(control));
112 const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}"; 86 const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}";
113 try testing.expectEqual(@as(usize, 3), strWidth(balinese)); 87 try testing.expectEqual(@as(usize, 3), self.strWidth(balinese));
114 88
115 // These commented out tests require a new specification for complex scripts. 89 // These commented out tests require a new specification for complex scripts.
116 // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf 90 // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
@@ -124,17 +98,17 @@ test "strWidth" {
124 // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1)); 98 // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1));
125 // The following passes but as a mere coincidence. 99 // The following passes but as a mere coincidence.
126 const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}"; 100 const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
127 try testing.expectEqual(@as(usize, 2), strWidth(kannada_2)); 101 try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2));
128 102
129 // From Rust https://github.com/jameslanska/unicode-display-width 103 // From Rust https://github.com/jameslanska/unicode-display-width
130 try testing.expectEqual(@as(usize, 15), strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻")); 104 try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻"));
131 try testing.expectEqual(@as(usize, 2), strWidth("🦀")); 105 try testing.expectEqual(@as(usize, 2), self.strWidth("🦀"));
132 try testing.expectEqual(@as(usize, 2), strWidth("👨‍👩‍👧‍👧")); 106 try testing.expectEqual(@as(usize, 2), self.strWidth("👨‍👩‍👧‍👧"));
133 try testing.expectEqual(@as(usize, 2), strWidth("👩‍🔬")); 107 try testing.expectEqual(@as(usize, 2), self.strWidth("👩‍🔬"));
134 try testing.expectEqual(@as(usize, 9), strWidth("sane text")); 108 try testing.expectEqual(@as(usize, 9), self.strWidth("sane text"));
135 try testing.expectEqual(@as(usize, 9), strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ")); 109 try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
136 try testing.expectEqual(@as(usize, 17), strWidth("슬라바 우크라이나")); 110 try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나"));
137 try testing.expectEqual(@as(usize, 1), strWidth("\u{378}")); 111 try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}"));
138} 112}
139 113
140/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding. 114/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding.
@@ -142,16 +116,17 @@ test "strWidth" {
142/// receive one additional pad. This makes sure the returned string fills the requested width. 116/// receive one additional pad. This makes sure the returned string fills the requested width.
143/// Caller must free returned bytes with `allocator`. 117/// Caller must free returned bytes with `allocator`.
144pub fn center( 118pub fn center(
119 self: Self,
145 allocator: mem.Allocator, 120 allocator: mem.Allocator,
146 str: []const u8, 121 str: []const u8,
147 total_width: usize, 122 total_width: usize,
148 pad: []const u8, 123 pad: []const u8,
149) ![]u8 { 124) ![]u8 {
150 const str_width = strWidth(str); 125 const str_width = self.strWidth(str);
151 if (str_width > total_width) return error.StrTooLong; 126 if (str_width > total_width) return error.StrTooLong;
152 if (str_width == total_width) return try allocator.dupe(u8, str); 127 if (str_width == total_width) return try allocator.dupe(u8, str);
153 128
154 const pad_width = strWidth(pad); 129 const pad_width = self.strWidth(pad);
155 if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; 130 if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
156 131
157 const margin_width = @divFloor((total_width - str_width), 2); 132 const margin_width = @divFloor((total_width - str_width), 2);
@@ -181,59 +156,63 @@ pub fn center(
181} 156}
182 157
183test "center" { 158test "center" {
184 var allocator = std.testing.allocator; 159 const allocator = testing.allocator;
160 var data = try Data.init(allocator);
161 defer data.deinit();
162 const self = Self{ .data = &data };
185 163
186 // Input and width both have odd length 164 // Input and width both have odd length
187 var centered = try center(allocator, "abc", 9, "*"); 165 var centered = try self.center(allocator, "abc", 9, "*");
188 try testing.expectEqualSlices(u8, "***abc***", centered); 166 try testing.expectEqualSlices(u8, "***abc***", centered);
189 167
190 // Input and width both have even length 168 // Input and width both have even length
191 allocator.free(centered); 169 testing.allocator.free(centered);
192 centered = try center(allocator, "w😊w", 10, "-"); 170 centered = try self.center(allocator, "w😊w", 10, "-");
193 try testing.expectEqualSlices(u8, "---w😊w---", centered); 171 try testing.expectEqualSlices(u8, "---w😊w---", centered);
194 172
195 // Input has even length, width has odd length 173 // Input has even length, width has odd length
196 allocator.free(centered); 174 testing.allocator.free(centered);
197 centered = try center(allocator, "1234", 9, "-"); 175 centered = try self.center(allocator, "1234", 9, "-");
198 try testing.expectEqualSlices(u8, "--1234---", centered); 176 try testing.expectEqualSlices(u8, "--1234---", centered);
199 177
200 // Input has odd length, width has even length 178 // Input has odd length, width has even length
201 allocator.free(centered); 179 testing.allocator.free(centered);
202 centered = try center(allocator, "123", 8, "-"); 180 centered = try self.center(allocator, "123", 8, "-");
203 try testing.expectEqualSlices(u8, "--123---", centered); 181 try testing.expectEqualSlices(u8, "--123---", centered);
204 182
205 // Input is the same length as the width 183 // Input is the same length as the width
206 allocator.free(centered); 184 testing.allocator.free(centered);
207 centered = try center(allocator, "123", 3, "-"); 185 centered = try self.center(allocator, "123", 3, "-");
208 try testing.expectEqualSlices(u8, "123", centered); 186 try testing.expectEqualSlices(u8, "123", centered);
209 187
210 // Input is empty 188 // Input is empty
211 allocator.free(centered); 189 testing.allocator.free(centered);
212 centered = try center(allocator, "", 3, "-"); 190 centered = try self.center(allocator, "", 3, "-");
213 try testing.expectEqualSlices(u8, "---", centered); 191 try testing.expectEqualSlices(u8, "---", centered);
214 192
215 // Input is empty and width is zero 193 // Input is empty and width is zero
216 allocator.free(centered); 194 testing.allocator.free(centered);
217 centered = try center(allocator, "", 0, "-"); 195 centered = try self.center(allocator, "", 0, "-");
218 try testing.expectEqualSlices(u8, "", centered); 196 try testing.expectEqualSlices(u8, "", centered);
219 197
220 // Input is longer than the width, which is an error 198 // Input is longer than the width, which is an error
221 allocator.free(centered); 199 testing.allocator.free(centered);
222 try testing.expectError(error.StrTooLong, center(allocator, "123", 2, "-")); 200 try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-"));
223} 201}
224 202
225/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding 203/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding
226/// on the left side. Caller must free returned bytes with `allocator`. 204/// on the left side. Caller must free returned bytes with `allocator`.
227pub fn padLeft( 205pub fn padLeft(
228 allocator: std.mem.Allocator, 206 self: Self,
207 allocator: mem.Allocator,
229 str: []const u8, 208 str: []const u8,
230 total_width: usize, 209 total_width: usize,
231 pad: []const u8, 210 pad: []const u8,
232) ![]u8 { 211) ![]u8 {
233 const str_width = strWidth(str); 212 const str_width = self.strWidth(str);
234 if (str_width > total_width) return error.StrTooLong; 213 if (str_width > total_width) return error.StrTooLong;
235 214
236 const pad_width = strWidth(pad); 215 const pad_width = self.strWidth(pad);
237 if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; 216 if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
238 217
239 const margin_width = total_width - str_width; 218 const margin_width = total_width - str_width;
@@ -256,29 +235,33 @@ pub fn padLeft(
256} 235}
257 236
258test "padLeft" { 237test "padLeft" {
259 var allocator = std.testing.allocator; 238 const allocator = testing.allocator;
239 var data = try Data.init(allocator);
240 defer data.deinit();
241 const self = Self{ .data = &data };
260 242
261 var right_aligned = try padLeft(allocator, "abc", 9, "*"); 243 var right_aligned = try self.padLeft(allocator, "abc", 9, "*");
262 defer allocator.free(right_aligned); 244 defer testing.allocator.free(right_aligned);
263 try testing.expectEqualSlices(u8, "******abc", right_aligned); 245 try testing.expectEqualSlices(u8, "******abc", right_aligned);
264 246
265 allocator.free(right_aligned); 247 testing.allocator.free(right_aligned);
266 right_aligned = try padLeft(allocator, "w😊w", 10, "-"); 248 right_aligned = try self.padLeft(allocator, "w😊w", 10, "-");
267 try testing.expectEqualSlices(u8, "------w😊w", right_aligned); 249 try testing.expectEqualSlices(u8, "------w😊w", right_aligned);
268} 250}
269 251
270/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding 252/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding
271/// on the right side. Caller must free returned bytes with `allocator`. 253/// on the right side. Caller must free returned bytes with `allocator`.
272pub fn padRight( 254pub fn padRight(
273 allocator: std.mem.Allocator, 255 self: Self,
256 allocator: mem.Allocator,
274 str: []const u8, 257 str: []const u8,
275 total_width: usize, 258 total_width: usize,
276 pad: []const u8, 259 pad: []const u8,
277) ![]u8 { 260) ![]u8 {
278 const str_width = strWidth(str); 261 const str_width = self.strWidth(str);
279 if (str_width > total_width) return error.StrTooLong; 262 if (str_width > total_width) return error.StrTooLong;
280 263
281 const pad_width = strWidth(pad); 264 const pad_width = self.strWidth(pad);
282 if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong; 265 if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;
283 266
284 const margin_width = total_width - str_width; 267 const margin_width = total_width - str_width;
@@ -302,14 +285,17 @@ pub fn padRight(
302} 285}
303 286
304test "padRight" { 287test "padRight" {
305 var allocator = std.testing.allocator; 288 const allocator = testing.allocator;
289 var data = try Data.init(allocator);
290 defer data.deinit();
291 const self = Self{ .data = &data };
306 292
307 var left_aligned = try padRight(allocator, "abc", 9, "*"); 293 var left_aligned = try self.padRight(allocator, "abc", 9, "*");
308 defer allocator.free(left_aligned); 294 defer testing.allocator.free(left_aligned);
309 try testing.expectEqualSlices(u8, "abc******", left_aligned); 295 try testing.expectEqualSlices(u8, "abc******", left_aligned);
310 296
311 allocator.free(left_aligned); 297 testing.allocator.free(left_aligned);
312 left_aligned = try padRight(allocator, "w😊w", 10, "-"); 298 left_aligned = try self.padRight(allocator, "w😊w", 10, "-");
313 try testing.expectEqualSlices(u8, "w😊w------", left_aligned); 299 try testing.expectEqualSlices(u8, "w😊w------", left_aligned);
314} 300}
315 301
@@ -317,12 +303,13 @@ test "padRight" {
317/// `threshold` defines how far the last column of the last word can be 303/// `threshold` defines how far the last column of the last word can be
318/// from the edge. Caller must free returned bytes with `allocator`. 304/// from the edge. Caller must free returned bytes with `allocator`.
319pub fn wrap( 305pub fn wrap(
320 allocator: std.mem.Allocator, 306 self: Self,
307 allocator: mem.Allocator,
321 str: []const u8, 308 str: []const u8,
322 columns: usize, 309 columns: usize,
323 threshold: usize, 310 threshold: usize,
324) ![]u8 { 311) ![]u8 {
325 var result = std.ArrayList(u8).init(allocator); 312 var result = ArrayList(u8).init(allocator);
326 defer result.deinit(); 313 defer result.deinit();
327 314
328 var line_iter = mem.tokenizeAny(u8, str, "\r\n"); 315 var line_iter = mem.tokenizeAny(u8, str, "\r\n");
@@ -334,7 +321,7 @@ pub fn wrap(
334 while (word_iter.next()) |word| { 321 while (word_iter.next()) |word| {
335 try result.appendSlice(word); 322 try result.appendSlice(word);
336 try result.append(' '); 323 try result.append(' ');
337 line_width += strWidth(word) + 1; 324 line_width += self.strWidth(word) + 1;
338 325
339 if (line_width > columns or columns - line_width <= threshold) { 326 if (line_width > columns or columns - line_width <= threshold) {
340 try result.append('\n'); 327 try result.append('\n');
@@ -351,10 +338,14 @@ pub fn wrap(
351} 338}
352 339
353test "wrap" { 340test "wrap" {
354 var allocator = std.testing.allocator; 341 const allocator = testing.allocator;
342 var data = try Data.init(allocator);
343 defer data.deinit();
344 const self = Self{ .data = &data };
345
355 const input = "The quick brown fox\r\njumped over the lazy dog!"; 346 const input = "The quick brown fox\r\njumped over the lazy dog!";
356 const got = try wrap(allocator, input, 10, 3); 347 const got = try self.wrap(allocator, input, 10, 3);
357 defer allocator.free(got); 348 defer testing.allocator.free(got);
358 const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; 349 const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!";
359 try testing.expectEqualStrings(want, got); 350 try testing.expectEqualStrings(want, got);
360} 351}
diff --git a/src/DisplayWidthData.zig b/src/DisplayWidthData.zig
new file mode 100644
index 0000000..32f8658
--- /dev/null
+++ b/src/DisplayWidthData.zig
@@ -0,0 +1,82 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5const testing = std.testing;
6
7const GraphemeData = @import("GraphemeData");
8
9allocator: mem.Allocator,
10g_data: GraphemeData,
11s1: []u16 = undefined,
12s2: []i3 = undefined,
13
14const Self = @This();
15
/// Decompresses the embedded `dwp` table and loads its two lookup stages
/// into memory obtained from `allocator`. The grapheme data held in
/// `g_data` is initialized with the same allocator.
///
/// Decompression, read and allocation errors are propagated. On any error
/// everything acquired so far is released. On success the caller must call
/// `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("dwp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    // Integers are read using the byte order of the CPU this code runs on.
    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        .g_data = try GraphemeData.init(allocator),
    };
    // Without these errdefers a failed read or allocation below would leak
    // the grapheme data and any stage already allocated.
    errdefer self.g_data.deinit();

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.s1);
    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(i3, stage_2_len);
    errdefer allocator.free(self.s2);
    // Widths are stored as one i8 each and narrowed to i3 in memory.
    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian));

    return self;
}
41
/// Releases the grapheme data and both lookup stages loaded by `init`.
pub fn deinit(self: *Self) void {
    const gpa = self.allocator;
    self.g_data.deinit();
    gpa.free(self.s2);
    gpa.free(self.s1);
}
47
/// codePointWidth returns the number of cells `cp` requires when rendered
/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
/// control codes return 0.
///
/// This function takes no `cjk` argument: the width reported for ambiguous
/// code points is whatever the loaded table contains.
/// NOTE(review): presumably that is 2 when the table was generated with the
/// `cjk` build option and 1 otherwise; confirm against the generator.
pub inline fn codePointWidth(self: Self, cp: u21) i3 {
    // Two-stage table: the high bits of `cp` pick a block offset from `s1`,
    // the low 8 bits index into that block of `s2`.
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
}
56
// Spot checks of `codePointWidth` against the loaded width table.
// `codePointWidth` is a method, so the data has to be initialized first and
// the lookup called on that instance.
test "codePointWidth" {
    var data = try init(testing.allocator);
    defer data.deinit();

    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x0000)); // null
    try testing.expectEqual(@as(i3, -1), data.codePointWidth(0x8)); // \b
    try testing.expectEqual(@as(i3, -1), data.codePointWidth(0x7f)); // DEL
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x0005)); // Cf
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x0007)); // \a BEL
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000A)); // \n LF
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000B)); // \v VT
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000C)); // \f FF
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000D)); // \r CR
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000E)); // SQ
    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x000F)); // SI

    try testing.expectEqual(@as(i3, 0), data.codePointWidth(0x070F)); // Cf
    try testing.expectEqual(@as(i3, 1), data.codePointWidth(0x0603)); // Cf Arabic

    try testing.expectEqual(@as(i3, 1), data.codePointWidth(0x00AD)); // soft-hyphen
    try testing.expectEqual(@as(i3, 2), data.codePointWidth(0x2E3A)); // two-em dash
    try testing.expectEqual(@as(i3, 3), data.codePointWidth(0x2E3B)); // three-em dash

    try testing.expectEqual(@as(i3, 1), data.codePointWidth(0x00BD)); // ambiguous halfwidth

    try testing.expectEqual(@as(i3, 1), data.codePointWidth('é'));
    try testing.expectEqual(@as(i3, 2), data.codePointWidth('😊'));
    try testing.expectEqual(@as(i3, 2), data.codePointWidth('统'));
}
diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig
new file mode 100644
index 0000000..e418dea
--- /dev/null
+++ b/src/GraphemeData.zig
@@ -0,0 +1,86 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
/// Indic syllable type of a code point.
/// The ordinal of each member is the value stored in the property byte
/// that `indic` decodes, so the order must not change.
pub const Indic = enum(u2) {
    none = 0,

    Consonant = 1,
    Extend = 2,
    Linker = 3,
};
14
/// Grapheme break property of a code point.
/// The ordinal of each member is the value stored in the upper four bits of
/// the property byte that `gbp` decodes, so the order must not change.
pub const Gbp = enum(u4) {
    none = 0,
    Control = 1,
    CR = 2,
    Extend = 3,
    L = 4,
    LF = 5,
    LV = 6,
    LVT = 7,
    Prepend = 8,
    Regional_Indicator = 9,
    SpacingMark = 10,
    T = 11,
    V = 12,
    ZWJ = 13,
};
32
33allocator: mem.Allocator,
34s1: []u16 = undefined,
35s2: []u16 = undefined,
36s3: []u8 = undefined,
37
38const Self = @This();
39
/// Decompresses the embedded `gbp` table and loads its three lookup stages
/// into memory obtained from `allocator`.
///
/// Decompression, read and allocation errors are propagated, including
/// `error.EndOfStream` when the table is shorter than its declared lengths.
/// On any error every stage allocated so far is released. On success the
/// caller must call `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("gbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    // Integers are read using the byte order of the CPU this code runs on.
    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u16, s2_len);
    errdefer allocator.free(self.s2);
    for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian);

    const s3_len: u16 = try reader.readInt(u16, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    // Fail on a short read instead of leaving the tail of `s3` undefined.
    try reader.readNoEof(self.s3);

    return self;
}
66
/// Releases the three lookup stages loaded by `init`.
pub fn deinit(self: *Self) void {
    const gpa = self.allocator;
    gpa.free(self.s3);
    gpa.free(self.s2);
    gpa.free(self.s1);
}
72
/// Returns the grapheme break property of `cp`, decoded from the upper four
/// bits of its property byte.
pub inline fn gbp(self: Self, cp: u21) Gbp {
    const block_start = self.s1[cp >> 8];
    const props = self.s3[self.s2[block_start + (cp & 0xff)]];
    return @enumFromInt(props >> 4);
}
77
/// Returns the Indic syllable type of `cp`, decoded from bits 1 through 3 of
/// its property byte.
pub inline fn indic(self: Self, cp: u21) Indic {
    const block_start = self.s1[cp >> 8];
    const props = self.s3[self.s2[block_start + (cp & 0xff)]];
    return @enumFromInt((props >> 1) & 0x7);
}
82
/// Returns true when the emoji flag (lowest bit of the property byte) is set
/// for `cp`.
pub inline fn isEmoji(self: Self, cp: u21) bool {
    const block_start = self.s1[cp >> 8];
    const props = self.s3[self.s2[block_start + (cp & 0xff)]];
    return (props & 1) == 1;
}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 1b4a2d5..6a19f47 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -8,16 +8,18 @@ const CodePointIterator = @import("code_point").Iterator;
8const case_fold_map = @import("ziglyph").case_folding; 8const case_fold_map = @import("ziglyph").case_folding;
9const hangul_map = @import("ziglyph").hangul; 9const hangul_map = @import("ziglyph").hangul;
10const norm_props = @import("ziglyph").normalization_props; 10const norm_props = @import("ziglyph").normalization_props;
11const normp = @import("normp"); 11pub const Data = @import("CombiningClassData");
12
13const Self = @This();
14 12
13ccc_data: *Data,
15nfc_map: std.AutoHashMap([2]u21, u21), 14nfc_map: std.AutoHashMap([2]u21, u21),
16nfd_map: std.AutoHashMap(u21, [2]u21), 15nfd_map: std.AutoHashMap(u21, [2]u21),
17nfkd_map: std.AutoHashMap(u21, [18]u21), 16nfkd_map: std.AutoHashMap(u21, [18]u21),
18 17
19pub fn init(allocator: std.mem.Allocator) !Self { 18const Self = @This();
19
20pub fn init(allocator: std.mem.Allocator, data: *Data) !Self {
20 var self = Self{ 21 var self = Self{
22 .ccc_data = data,
21 .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator), 23 .nfc_map = std.AutoHashMap([2]u21, u21).init(allocator),
22 .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator), 24 .nfd_map = std.AutoHashMap(u21, [2]u21).init(allocator),
23 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator), 25 .nfkd_map = std.AutoHashMap(u21, [18]u21).init(allocator),
@@ -95,7 +97,9 @@ pub fn deinit(self: *Self) void {
95} 97}
96 98
97test "init / deinit" { 99test "init / deinit" {
98 var n = try init(std.testing.allocator); 100 var data = try Data.init(std.testing.allocator);
101 defer data.deinit();
102 var n = try init(std.testing.allocator, &data);
99 defer n.deinit(); 103 defer n.deinit();
100} 104}
101 105
@@ -241,7 +245,9 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
241 245
242test "decompose" { 246test "decompose" {
243 const allocator = std.testing.allocator; 247 const allocator = std.testing.allocator;
244 var n = try init(allocator); 248 var data = try Data.init(allocator);
249 defer data.deinit();
250 var n = try init(allocator, &data);
245 defer n.deinit(); 251 defer n.deinit();
246 252
247 var dc = n.decompose('é', .nfd); 253 var dc = n.decompose('é', .nfd);
@@ -307,19 +313,17 @@ pub const Result = struct {
307}; 313};
308 314
309// Compares code points by Canonical Combining Class order. 315// Compares code points by Canonical Combining Class order.
310fn cccLess(_: void, lhs: u21, rhs: u21) bool { 316fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
311 const lcc = normp.stage_2[normp.stage_1[lhs >> 8] + (lhs & 0xff)]; 317 return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs);
312 const rcc = normp.stage_2[normp.stage_1[rhs >> 8] + (rhs & 0xff)];
313 return lcc < rcc;
314} 318}
315 319
316// Applies the Canonical Sorting Algorithm. 320// Applies the Canonical Sorting Algorithm.
317fn canonicalSort(cps: []u21) void { 321fn canonicalSort(self: Self, cps: []u21) void {
318 var i: usize = 0; 322 var i: usize = 0;
319 while (i < cps.len) : (i += 1) { 323 while (i < cps.len) : (i += 1) {
320 const start: usize = i; 324 const start: usize = i;
321 while (i < cps.len and normp.stage_2[normp.stage_1[cps[i] >> 8] + (cps[i] & 0xff)] != 0) : (i += 1) {} 325 while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
322 std.mem.sort(u21, cps[start..i], {}, cccLess); 326 std.mem.sort(u21, cps[start..i], self, cccLess);
323 } 327 }
324} 328}
325 329
@@ -349,7 +353,7 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
349 try dcp_list.appendSlice(slice); 353 try dcp_list.appendSlice(slice);
350 } 354 }
351 355
352 canonicalSort(dcp_list.items); 356 self.canonicalSort(dcp_list.items);
353 357
354 var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4); 358 var dstr_list = try std.ArrayList(u8).initCapacity(allocator, dcp_list.items.len * 4);
355 defer dstr_list.deinit(); 359 defer dstr_list.deinit();
@@ -365,7 +369,9 @@ fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
365 369
366test "nfd ASCII / no-alloc" { 370test "nfd ASCII / no-alloc" {
367 const allocator = std.testing.allocator; 371 const allocator = std.testing.allocator;
368 var n = try init(allocator); 372 var data = try Data.init(allocator);
373 defer data.deinit();
374 var n = try init(allocator, &data);
369 defer n.deinit(); 375 defer n.deinit();
370 376
371 var result = try n.nfd(allocator, "Hello World!"); 377 var result = try n.nfd(allocator, "Hello World!");
@@ -376,7 +382,9 @@ test "nfd ASCII / no-alloc" {
376 382
377test "nfd !ASCII / alloc" { 383test "nfd !ASCII / alloc" {
378 const allocator = std.testing.allocator; 384 const allocator = std.testing.allocator;
379 var n = try init(allocator); 385 var data = try Data.init(allocator);
386 defer data.deinit();
387 var n = try init(allocator, &data);
380 defer n.deinit(); 388 defer n.deinit();
381 389
382 var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 390 var result = try n.nfd(allocator, "Héllo World! \u{3d3}");
@@ -387,7 +395,9 @@ test "nfd !ASCII / alloc" {
387 395
388test "nfkd ASCII / no-alloc" { 396test "nfkd ASCII / no-alloc" {
389 const allocator = std.testing.allocator; 397 const allocator = std.testing.allocator;
390 var n = try init(allocator); 398 var data = try Data.init(allocator);
399 defer data.deinit();
400 var n = try init(allocator, &data);
391 defer n.deinit(); 401 defer n.deinit();
392 402
393 var result = try n.nfkd(allocator, "Hello World!"); 403 var result = try n.nfkd(allocator, "Hello World!");
@@ -398,7 +408,9 @@ test "nfkd ASCII / no-alloc" {
398 408
399test "nfkd !ASCII / alloc" { 409test "nfkd !ASCII / alloc" {
400 const allocator = std.testing.allocator; 410 const allocator = std.testing.allocator;
401 var n = try init(allocator); 411 var data = try Data.init(allocator);
412 defer data.deinit();
413 var n = try init(allocator, &data);
402 defer n.deinit(); 414 defer n.deinit();
403 415
404 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 416 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
@@ -413,16 +425,8 @@ fn isHangul(cp: u21) bool {
413 return cp >= 0x1100 and hangul_map.syllableType(cp) != null; 425 return cp >= 0x1100 and hangul_map.syllableType(cp) != null;
414} 426}
415 427
416fn isStarter(cp: u21) bool { 428fn isNonHangulStarter(self: Self, cp: u21) bool {
417 return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] == 0; 429 return !isHangul(cp) and self.ccc_data.isStarter(cp);
418}
419
420fn isCombining(cp: u21) bool {
421 return normp.stage_2[normp.stage_1[cp >> 8] + (cp & 0xff)] != 0;
422}
423
424fn isNonHangulStarter(cp: u21) bool {
425 return !isHangul(cp) and isStarter(cp);
426} 430}
427 431
428/// Normalizes `str` to NFC. 432/// Normalizes `str` to NFC.
@@ -464,7 +468,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
464 468
465 block_check: while (i < d_list.items.len) : (i += 1) { 469 block_check: while (i < d_list.items.len) : (i += 1) {
466 const C = d_list.items[i]; 470 const C = d_list.items[i];
467 const cc_C = normp.stage_2[normp.stage_1[C >> 8] + (C & 0xff)]; 471 const cc_C = self.ccc_data.ccc(C);
468 var starter_index: ?usize = null; 472 var starter_index: ?usize = null;
469 var j: usize = i; 473 var j: usize = i;
470 474
@@ -472,14 +476,14 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
472 j -= 1; 476 j -= 1;
473 477
474 // Check for starter. 478 // Check for starter.
475 if (isStarter(d_list.items[j])) { 479 if (self.ccc_data.isStarter(d_list.items[j])) {
476 if (i - j > 1) { // If there's distance between the starting point and the current position. 480 if (i - j > 1) { // If there's distance between the starting point and the current position.
477 for (d_list.items[(j + 1)..i]) |B| { 481 for (d_list.items[(j + 1)..i]) |B| {
482 const cc_B = self.ccc_data.ccc(B);
478 // Check for blocking conditions. 483 // Check for blocking conditions.
479 if (isHangul(C)) { 484 if (isHangul(C)) {
480 if (isCombining(B) or isNonHangulStarter(B)) continue :block_check; 485 if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check;
481 } 486 }
482 const cc_B = normp.stage_2[normp.stage_1[B >> 8] + (B & 0xff)];
483 if (cc_B >= cc_C) continue :block_check; 487 if (cc_B >= cc_C) continue :block_check;
484 } 488 }
485 } 489 }
@@ -560,7 +564,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
560 564
561test "nfc" { 565test "nfc" {
562 const allocator = std.testing.allocator; 566 const allocator = std.testing.allocator;
563 var n = try init(allocator); 567 var data = try Data.init(allocator);
568 defer data.deinit();
569 var n = try init(allocator, &data);
564 defer n.deinit(); 570 defer n.deinit();
565 571
566 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 572 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
@@ -571,7 +577,9 @@ test "nfc" {
571 577
572test "nfkc" { 578test "nfkc" {
573 const allocator = std.testing.allocator; 579 const allocator = std.testing.allocator;
574 var n = try init(allocator); 580 var data = try Data.init(allocator);
581 defer data.deinit();
582 var n = try init(allocator, &data);
575 defer n.deinit(); 583 defer n.deinit();
576 584
577 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 585 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
@@ -630,7 +638,9 @@ pub fn eql(self: Self, allocator: std.mem.Allocator, a: []const u8, b: []const u
630 638
631test "eql" { 639test "eql" {
632 const allocator = std.testing.allocator; 640 const allocator = std.testing.allocator;
633 var n = try init(allocator); 641 var data = try Data.init(allocator);
642 defer data.deinit();
643 var n = try init(allocator, &data);
634 defer n.deinit(); 644 defer n.deinit();
635 645
636 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 646 try std.testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
@@ -697,7 +707,9 @@ pub fn eqlCaseless(self: Self, allocator: std.mem.Allocator, a: []const u8, b: [
697 707
698test "eqlCaseless" { 708test "eqlCaseless" {
699 const allocator = std.testing.allocator; 709 const allocator = std.testing.allocator;
700 var n = try init(allocator); 710 var data = try Data.init(allocator);
711 defer data.deinit();
712 var n = try init(allocator, &data);
701 defer n.deinit(); 713 defer n.deinit();
702 714
703 try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}")); 715 try std.testing.expect(try n.eqlCaseless(allocator, "Foϓ", "fo\u{03D2}\u{0301}"));
@@ -707,7 +719,7 @@ test "eqlCaseless" {
707// FCD 719// FCD
708fn getLeadCcc(self: Self, cp: u21) u8 { 720fn getLeadCcc(self: Self, cp: u21) u8 {
709 const dc = self.mapping(cp, .nfd); 721 const dc = self.mapping(cp, .nfd);
710 return normp.stage_2[normp.stage_1[dc.cps[0] >> 8] + (dc.cps[0] & 0xff)]; 722 return self.ccc_data.ccc(dc.cps[0]);
711} 723}
712 724
713fn getTrailCcc(self: Self, cp: u21) u8 { 725fn getTrailCcc(self: Self, cp: u21) u8 {
@@ -715,8 +727,7 @@ fn getTrailCcc(self: Self, cp: u21) u8 {
715 const len = for (dc.cps, 0..) |dcp, i| { 727 const len = for (dc.cps, 0..) |dcp, i| {
716 if (dcp == 0) break i; 728 if (dcp == 0) break i;
717 } else dc.cps.len; 729 } else dc.cps.len;
718 const tcp = dc.cps[len -| 1]; 730 return self.ccc_data.ccc(dc.cps[len - 1]);
719 return normp.stage_2[normp.stage_1[tcp >> 8] + (tcp & 0xff)];
720} 731}
721 732
722/// Fast check to detect if a string is already in NFC or NFD form. 733/// Fast check to detect if a string is already in NFC or NFD form.
@@ -733,7 +744,9 @@ pub fn isFcd(self: Self, str: []const u8) bool {
733 744
734test "isFcd" { 745test "isFcd" {
735 const allocator = std.testing.allocator; 746 const allocator = std.testing.allocator;
736 var n = try init(allocator); 747 var data = try Data.init(allocator);
748 defer data.deinit();
749 var n = try init(allocator, &data);
737 defer n.deinit(); 750 defer n.deinit();
738 751
739 const is_nfc = "José \u{3D3}"; 752 const is_nfc = "José \u{3D3}";
@@ -751,7 +764,9 @@ test "Unicode normalization tests" {
751 defer arena.deinit(); 764 defer arena.deinit();
752 var allocator = arena.allocator(); 765 var allocator = arena.allocator();
753 766
754 var n = try init(allocator); 767 var data = try Data.init(allocator);
768 defer data.deinit();
769 var n = try init(allocator, &data);
755 defer n.deinit(); 770 defer n.deinit();
756 771
757 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 772 var file = try std.fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
diff --git a/src/grapheme.zig b/src/grapheme.zig
index 3fdf10b..7125b5b 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -1,9 +1,10 @@
1const std = @import("std"); 1const std = @import("std");
2const mem = std.mem;
2const unicode = std.unicode; 3const unicode = std.unicode;
3 4
4const CodePoint = @import("code_point").CodePoint; 5const CodePoint = @import("code_point").CodePoint;
5const CodePointIterator = @import("code_point").Iterator; 6const CodePointIterator = @import("code_point").Iterator;
6const gbp = @import("gbp"); 7pub const Data = @import("GraphemeData");
7 8
8/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 9/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
9pub const Grapheme = struct { 10pub const Grapheme = struct {
@@ -21,12 +22,13 @@ pub const Grapheme = struct {
21pub const Iterator = struct { 22pub const Iterator = struct {
22 buf: [2]?CodePoint = .{ null, null }, 23 buf: [2]?CodePoint = .{ null, null },
23 cp_iter: CodePointIterator, 24 cp_iter: CodePointIterator,
25 data: *Data,
24 26
25 const Self = @This(); 27 const Self = @This();
26 28
27 /// Assumes `src` is valid UTF-8. 29 /// Assumes `src` is valid UTF-8.
28 pub fn init(str: []const u8) Self { 30 pub fn init(str: []const u8, data: *Data) Self {
29 var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; 31 var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
30 self.advance(); 32 self.advance();
31 return self; 33 return self;
32 } 34 }
@@ -55,6 +57,7 @@ pub const Iterator = struct {
55 if (graphemeBreak( 57 if (graphemeBreak(
56 self.buf[0].?.code, 58 self.buf[0].?.code,
57 self.buf[1].?.code, 59 self.buf[1].?.code,
60 self.data,
58 &state, 61 &state,
59 )) return Grapheme{ .len = gc_len, .offset = gc_start }; 62 )) return Grapheme{ .len = gc_len, .offset = gc_start };
60 63
@@ -67,6 +70,7 @@ pub const Iterator = struct {
67 if (graphemeBreak( 70 if (graphemeBreak(
68 self.buf[0].?.code, 71 self.buf[0].?.code,
69 if (self.buf[1]) |ncp| ncp.code else 0, 72 if (self.buf[1]) |ncp| ncp.code else 0,
73 self.data,
70 &state, 74 &state,
71 )) break; 75 )) break;
72 } 76 }
@@ -76,18 +80,12 @@ pub const Iterator = struct {
76}; 80};
77 81
78// Predicates 82// Predicates
79fn isBreaker(cp: u21) bool { 83fn isBreaker(cp: u21, data: *Data) bool {
80 // Extract relevant properties. 84 // Extract relevant properties.
81 const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; 85 const cp_gbp_prop = data.gbp(cp);
82 const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4);
83 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; 86 return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control;
84} 87}
85 88
86fn isIgnorable(cp: u21) bool {
87 const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
88 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
89}
90
91// Grapheme break state. 89// Grapheme break state.
92const State = struct { 90const State = struct {
93 bits: u3 = 0, 91 bits: u3 = 0,
@@ -135,18 +133,17 @@ const State = struct {
135pub fn graphemeBreak( 133pub fn graphemeBreak(
136 cp1: u21, 134 cp1: u21,
137 cp2: u21, 135 cp2: u21,
136 data: *Data,
138 state: *State, 137 state: *State,
139) bool { 138) bool {
140 // Extract relevant properties. 139 // Extract relevant properties.
141 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; 140 const cp1_gbp_prop = data.gbp(cp1);
142 const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); 141 const cp1_indic_prop = data.indic(cp1);
143 const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); 142 const cp1_is_emoji = data.isEmoji(cp1);
144 const cp1_is_emoji = cp1_props_byte & 1 == 1;
145 143
146 const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; 144 const cp2_gbp_prop = data.gbp(cp2);
147 const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); 145 const cp2_indic_prop = data.indic(cp2);
148 const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); 146 const cp2_is_emoji = data.isEmoji(cp2);
149 const cp2_is_emoji = cp2_props_byte & 1 == 1;
150 147
151 // GB11: Emoji Extend* ZWJ x Emoji 148 // GB11: Emoji Extend* ZWJ x Emoji
152 if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); 149 if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
@@ -157,7 +154,7 @@ pub fn graphemeBreak(
157 if (cp1 == '\r' and cp2 == '\n') return false; 154 if (cp1 == '\r' and cp2 == '\n') return false;
158 155
159 // GB4: Control 156 // GB4: Control
160 if (isBreaker(cp1)) return true; 157 if (isBreaker(cp1, data)) return true;
161 158
162 // GB11: Emoji Extend* ZWJ x Emoji 159 // GB11: Emoji Extend* ZWJ x Emoji
163 if (state.hasXpic() and 160 if (state.hasXpic() and
@@ -175,7 +172,7 @@ pub fn graphemeBreak(
175 if (cp2_gbp_prop == .SpacingMark) return false; 172 if (cp2_gbp_prop == .SpacingMark) return false;
176 173
177 // GB9b: Prepend x 174 // GB9b: Prepend x
178 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; 175 if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false;
179 176
180 // GB12, GB13: RI x RI 177 // GB12, GB13: RI x RI
181 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 178 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
@@ -240,6 +237,9 @@ test "Segmentation GraphemeIterator" {
240 var buf_reader = std.io.bufferedReader(file.reader()); 237 var buf_reader = std.io.bufferedReader(file.reader());
241 var input_stream = buf_reader.reader(); 238 var input_stream = buf_reader.reader();
242 239
240 var data = try Data.init(allocator);
241 defer data.deinit();
242
243 var buf: [4096]u8 = undefined; 243 var buf: [4096]u8 = undefined;
244 var line_no: usize = 1; 244 var line_no: usize = 1;
245 245
@@ -282,7 +282,7 @@ test "Segmentation GraphemeIterator" {
282 } 282 }
283 283
284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); 284 // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
285 var iter = Iterator.init(all_bytes.items); 285 var iter = Iterator.init(all_bytes.items, &data);
286 286
287 // Chaeck. 287 // Chaeck.
288 for (want.items) |want_gc| { 288 for (want.items) |want_gc| {
@@ -295,19 +295,6 @@ test "Segmentation GraphemeIterator" {
295 } 295 }
296} 296}
297 297
298test "Segmentation comptime GraphemeIterator" {
299 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
300
301 comptime {
302 const src = "Héllo";
303 var ct_iter = Iterator.init(src);
304 var i = 0;
305 while (ct_iter.next()) |grapheme| : (i += 1) {
306 try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]);
307 }
308 }
309}
310
311test "Segmentation ZWJ and ZWSP emoji sequences" { 298test "Segmentation ZWJ and ZWSP emoji sequences" {
312 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 299 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
313 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 300 const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -315,18 +302,22 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
315 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 302 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
316 const no_joiner = seq_1 ++ seq_2; 303 const no_joiner = seq_1 ++ seq_2;
317 304
318 var ct_iter = Iterator.init(with_zwj); 305 var data = try Data.init(std.testing.allocator);
306 defer data.deinit();
307
308 var iter = Iterator.init(with_zwj, &data);
309
319 var i: usize = 0; 310 var i: usize = 0;
320 while (ct_iter.next()) |_| : (i += 1) {} 311 while (iter.next()) |_| : (i += 1) {}
321 try std.testing.expectEqual(@as(usize, 1), i); 312 try std.testing.expectEqual(@as(usize, 1), i);
322 313
323 ct_iter = Iterator.init(with_zwsp); 314 iter = Iterator.init(with_zwsp, &data);
324 i = 0; 315 i = 0;
325 while (ct_iter.next()) |_| : (i += 1) {} 316 while (iter.next()) |_| : (i += 1) {}
326 try std.testing.expectEqual(@as(usize, 3), i); 317 try std.testing.expectEqual(@as(usize, 3), i);
327 318
328 ct_iter = Iterator.init(no_joiner); 319 iter = Iterator.init(no_joiner, &data);
329 i = 0; 320 i = 0;
330 while (ct_iter.next()) |_| : (i += 1) {} 321 while (iter.next()) |_| : (i += 1) {}
331 try std.testing.expectEqual(@as(usize, 2), i); 322 try std.testing.expectEqual(@as(usize, 2), i);
332} 323}
diff --git a/src/main.zig b/src/main.zig
index 946ae01..57db05b 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,29 +1,47 @@
1const std = @import("std"); 1const std = @import("std");
2 2
3// const GraphemeIterator = @import("ziglyph").GraphemeIterator; 3// const GraphemeIterator = @import("ziglyph").GraphemeIterator;
4// const GraphemeIterator = @import("Grapheme").GraphemeIterator; 4// const Data = @import("grapheme").Data;
5// const GraphemeIterator = @import("grapheme").Iterator;
6
5// const codePointWidth = @import("ziglyph").display_width.codePointWidth; 7// const codePointWidth = @import("ziglyph").display_width.codePointWidth;
6// const codePointWidth = @import("display_width").codePointWidth;
7// const strWidth = @import("ziglyph").display_width.strWidth; 8// const strWidth = @import("ziglyph").display_width.strWidth;
9// const Data = @import("display_width").Data;
10// const codePointWidth = @import("display_width").codePointWidth;
8// const strWidth = @import("display_width").strWidth; 11// const strWidth = @import("display_width").strWidth;
9// const CodePointIterator = @import("CodePoint").CodePointIterator; 12
13// const CodePointIterator = @import("ziglyph").CodePointIterator;
14// const CodePointIterator = @import("code_point").Iterator;
15
10// const ascii = @import("ascii"); 16// const ascii = @import("ascii");
11// const ascii = std.ascii; 17// const ascii = std.ascii;
18
12// const norm = @import("ziglyph").Normalizer; 19// const norm = @import("ziglyph").Normalizer;
20const Data = @import("Normalizer").Data;
13const norm = @import("Normalizer"); 21const norm = @import("Normalizer");
14 22
15pub fn main() !void { 23pub fn main() !void {
24 var args_iter = std.process.args();
25 _ = args_iter.skip();
26 const in_path = args_iter.next() orelse return error.MissingArg;
27
16 var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 28 var gpa = std.heap.GeneralPurposeAllocator(.{}){};
17 defer _ = gpa.deinit(); 29 defer _ = gpa.deinit();
18 const allocator = gpa.allocator(); 30 const allocator = gpa.allocator();
19 31
20 const input = try std.fs.cwd().readFileAlloc(allocator, "data/lang_mix.txt", std.math.maxInt(u32)); 32 const input = try std.fs.cwd().readFileAlloc(allocator, in_path, std.math.maxInt(u32));
21 defer allocator.free(input); 33 defer allocator.free(input);
22 34
23 var n = try norm.init(allocator); 35 var data = try Data.init(allocator);
36 defer data.deinit();
37
38 var n = try norm.init(allocator, &data);
24 defer n.deinit(); 39 defer n.deinit();
40 // var n = try norm.init(allocator);
41 // defer n.deinit();
25 42
26 // var iter = GraphemeIterator.init(input); 43 // var iter = GraphemeIterator.init(input, &data);
44 // defer iter.deinit();
27 // var iter = CodePointIterator{ .bytes = input }; 45 // var iter = CodePointIterator{ .bytes = input };
28 var iter = std.mem.splitScalar(u8, input, '\n'); 46 var iter = std.mem.splitScalar(u8, input, '\n');
29 47
@@ -33,7 +51,7 @@ pub fn main() !void {
33 51
34 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 52 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
35 // while (iter.next()) |_| result += 1; 53 // while (iter.next()) |_| result += 1;
36 // while (iter.next()) |line| result += strWidth(line); 54 // while (iter.next()) |line| result += strWidth(line, &data);
37 while (iter.next()) |line| { 55 while (iter.next()) |line| {
38 var nfc = try n.nfc(allocator, line); 56 var nfc = try n.nfc(allocator, line);
39 result += nfc.slice.len; 57 result += nfc.slice.len;