summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build.zig28
-rw-r--r--codegen/scripts.zig309
-rw-r--r--src/CanonData.zig2
-rw-r--r--src/CaseData.zig10
-rw-r--r--src/CaseFold.zig8
-rw-r--r--src/CombiningData.zig2
-rw-r--r--src/CompatData.zig2
-rw-r--r--src/DisplayWidth.zig10
-rw-r--r--src/FoldData.zig2
-rw-r--r--src/GenCatData.zig2
-rw-r--r--src/GraphemeData.zig2
-rw-r--r--src/HangulData.zig2
-rw-r--r--src/NormData.zig2
-rw-r--r--src/NormPropsData.zig2
-rw-r--r--src/Normalize.zig58
-rw-r--r--src/NumericData.zig4
-rw-r--r--src/ScriptsData.zig226
-rw-r--r--src/WidthData.zig2
-rw-r--r--src/grapheme.zig4
19 files changed, 615 insertions, 62 deletions
diff --git a/build.zig b/build.zig
index 0d002ad..58c3f21 100644
--- a/build.zig
+++ b/build.zig
@@ -137,6 +137,15 @@ pub fn build(b: *std.Build) void {
137 const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe); 137 const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe);
138 const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z"); 138 const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z");
139 139
140 const scripts_gen_exe = b.addExecutable(.{
141 .name = "scripts",
142 .root_source_file = .{ .path = "codegen/scripts.zig" },
143 .target = b.host,
144 .optimize = .Debug,
145 });
146 const run_scripts_gen_exe = b.addRunArtifact(scripts_gen_exe);
147 const scripts_gen_out = run_scripts_gen_exe.addOutputFileArg("scripts.bin.z");
148
140 // Modules we provide 149 // Modules we provide
141 // Code points 150 // Code points
142 const code_point = b.addModule("code_point", .{ 151 const code_point = b.addModule("code_point", .{
@@ -287,14 +296,22 @@ pub fn build(b: *std.Build) void {
287 case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); 296 case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
288 case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); 297 case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
289 298
299 // Scripts
300 const scripts_data = b.addModule("ScriptsData", .{
301 .root_source_file = .{ .path = "src/ScriptsData.zig" },
302 .target = target,
303 .optimize = optimize,
304 });
305 scripts_data.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out });
306
290 // Tests 307 // Tests
291 const exe_unit_tests = b.addTest(.{ 308 const exe_unit_tests = b.addTest(.{
292 .root_source_file = .{ .path = "src/CaseData.zig" }, 309 .root_source_file = .{ .path = "src/ScriptsData.zig" },
293 .target = target, 310 .target = target,
294 .optimize = optimize, 311 .optimize = optimize,
295 }); 312 });
296 // exe_unit_tests.root_module.addImport("ascii", ascii); 313 // exe_unit_tests.root_module.addImport("ascii", ascii);
297 exe_unit_tests.root_module.addImport("code_point", code_point); 314 // exe_unit_tests.root_module.addImport("code_point", code_point);
298 // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data); 315 // exe_unit_tests.root_module.addImport("GraphemeData", grapheme_data);
299 // exe_unit_tests.root_module.addImport("grapheme", grapheme); 316 // exe_unit_tests.root_module.addImport("grapheme", grapheme);
300 // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph")); 317 // exe_unit_tests.root_module.addImport("ziglyph", ziglyph.module("ziglyph"));
@@ -304,9 +321,10 @@ pub fn build(b: *std.Build) void {
304 // exe_unit_tests.root_module.addImport("Normalize", norm); 321 // exe_unit_tests.root_module.addImport("Normalize", norm);
305 // exe_unit_tests.root_module.addImport("FoldData", fold_data); 322 // exe_unit_tests.root_module.addImport("FoldData", fold_data);
306 // exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out }); 323 // exe_unit_tests.root_module.addAnonymousImport("numeric", .{ .root_source_file = num_gen_out });
307 exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); 324 // exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
308 exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); 325 // exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
309 exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); 326 // exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
327 exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out });
310 // exe_unit_tests.filter = "nfd !ASCII"; 328 // exe_unit_tests.filter = "nfd !ASCII";
311 329
312 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); 330 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
diff --git a/codegen/scripts.zig b/codegen/scripts.zig
new file mode 100644
index 0000000..e985c1e
--- /dev/null
+++ b/codegen/scripts.zig
@@ -0,0 +1,309 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
/// Unicode script identifiers recognised by the generator.
///
/// Member names must match the script names in `data/unicode/Scripts.txt`
/// exactly, because `main` resolves them with `std.meta.stringToEnum`.
/// The declaration order is significant: the ordinal of each member is the
/// value written into the generated table, so members must not be reordered.
/// `none` (ordinal 0) is the value used for code points that have no entry.
///
/// The tag type is spelled out as `u8` because ordinals are stored in `u8`
/// table cells.
const Script = enum(u8) {
    none,
    Adlam,
    Ahom,
    Anatolian_Hieroglyphs,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Bassa_Vah,
    Batak,
    Bengali,
    Bhaiksuki,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Buhid,
    Canadian_Aboriginal,
    Carian,
    Caucasian_Albanian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Common,
    Coptic,
    Cuneiform,
    Cypriot,
    Cypro_Minoan,
    Cyrillic,
    Deseret,
    Devanagari,
    Dives_Akuru,
    Dogra,
    Duployan,
    Egyptian_Hieroglyphs,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Grantha,
    Greek,
    Gujarati,
    Gunjala_Gondi,
    Gurmukhi,
    Han,
    Hangul,
    Hanifi_Rohingya,
    Hanunoo,
    Hatran,
    Hebrew,
    Hiragana,
    Imperial_Aramaic,
    Inherited,
    Inscriptional_Pahlavi,
    Inscriptional_Parthian,
    Javanese,
    Kaithi,
    Kannada,
    Katakana,
    Kawi,
    Kayah_Li,
    Kharoshthi,
    Khitan_Small_Script,
    Khmer,
    Khojki,
    Khudawadi,
    Lao,
    Latin,
    Lepcha,
    Limbu,
    Linear_A,
    Linear_B,
    Lisu,
    Lycian,
    Lydian,
    Mahajani,
    Makasar,
    Malayalam,
    Mandaic,
    Manichaean,
    Marchen,
    Masaram_Gondi,
    Medefaidrin,
    Meetei_Mayek,
    Mende_Kikakui,
    Meroitic_Cursive,
    Meroitic_Hieroglyphs,
    Miao,
    Modi,
    Mongolian,
    Mro,
    Multani,
    Myanmar,
    Nabataean,
    Nag_Mundari,
    Nandinagari,
    New_Tai_Lue,
    Newa,
    Nko,
    Nushu,
    Nyiakeng_Puachue_Hmong,
    Ogham,
    Ol_Chiki,
    Old_Hungarian,
    Old_Italic,
    Old_North_Arabian,
    Old_Permic,
    Old_Persian,
    Old_Sogdian,
    Old_South_Arabian,
    Old_Turkic,
    Old_Uyghur,
    Oriya,
    Osage,
    Osmanya,
    Pahawh_Hmong,
    Palmyrene,
    Pau_Cin_Hau,
    Phags_Pa,
    Phoenician,
    Psalter_Pahlavi,
    Rejang,
    Runic,
    Samaritan,
    Saurashtra,
    Sharada,
    Shavian,
    Siddham,
    SignWriting,
    Sinhala,
    Sogdian,
    Sora_Sompeng,
    Soyombo,
    Sundanese,
    Syloti_Nagri,
    Syriac,
    Tagalog,
    Tagbanwa,
    Tai_Le,
    Tai_Tham,
    Tai_Viet,
    Takri,
    Tamil,
    Tangsa,
    Tangut,
    Telugu,
    Thaana,
    Thai,
    Tibetan,
    Tifinagh,
    Tirhuta,
    Toto,
    Ugaritic,
    Vai,
    Vithkuqi,
    Wancho,
    Warang_Citi,
    Yezidi,
    Yi,
    Zanabazar_Square,
};
170
/// Number of consecutive code points covered by one block.
const block_size = 256;

/// One block of table data: a `u8` cell per code point in the block.
const Block = [block_size]u8;

/// Hash map keyed by the full contents of a `Block`, with a `u16` value.
/// `main` uses it to deduplicate identical blocks, storing for each distinct
/// block the offset at which it was appended to the stage-2 table.
const BlockMap = std.HashMap(
    Block,
    u16,
    struct {
        /// Hashes the whole block as one byte slice in a single pass.
        /// The hash only affects bucket placement inside the map; it is never
        /// written to the generated output.
        pub fn hash(_: @This(), k: Block) u64 {
            return std.hash.Wyhash.hash(0, &k);
        }

        /// Two blocks are equal when all of their cells are equal.
        pub fn eql(_: @This(), a: Block, b: Block) bool {
            return std.mem.eql(u8, &a, &b);
        }
    },
    std.hash_map.default_max_load_percentage,
);
190
/// Generates the compressed three-stage script lookup table.
///
/// Input: `data/unicode/Scripts.txt`, opened relative to the current working
/// directory. Output: the file named by the first command-line argument.
///
/// The output is a deflate stream containing, in order:
///   1. `u16` length of stage 1, then that many `u16` stage-2 offsets
///      (one per 256-code-point block of the range 0..0x10FFFF);
///   2. `u16` length of stage 2, then that many `u8` stage-3 indexes;
///   3. `u8` length of stage 3, then that many `u8` script ordinals.
///
/// Fails with `error.UnknownScript` when the data file names a script that
/// is not a member of `Script`; panics when no output path is given.
///
/// NOTE(review): integers are written in the byte order of the machine
/// running this generator; presumably the reader assumes the same order —
/// confirm for cross-endian builds.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Code point -> script ordinal, for every code point listed in the data file.
    var flat_map = std.AutoHashMap(u21, u8).init(allocator);
    defer flat_map.deinit();

    var line_buf: [4096]u8 = undefined;

    // Process Scripts.txt
    var in_file = try std.fs.cwd().openFile("data/unicode/Scripts.txt", .{});
    defer in_file.close();
    var in_buf = std.io.bufferedReader(in_file.reader());
    const in_reader = in_buf.reader();

    while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        // Skip blank lines and full-line comments.
        if (line.len == 0 or line[0] == '#') continue;

        // Drop a trailing `# ...` comment, if any.
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        // Fields are separated by any run of ';' and ' '.
        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
        // Inclusive [first, last] range set by field 0 and consumed by field 1.
        var current_code: [2]u21 = undefined;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    // Code point(s): either `XXXX..YYYY` or a single `XXXX`, in hex.
                    if (std.mem.indexOf(u8, field, "..")) |dots| {
                        current_code = .{
                            try std.fmt.parseInt(u21, field[0..dots], 16),
                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
                        };
                    } else {
                        const code = try std.fmt.parseInt(u21, field, 16);
                        current_code = .{ code, code };
                    }
                },
                1 => {
                    // Script name; must match a `Script` member exactly.
                    const script = std.meta.stringToEnum(Script, field) orelse {
                        std.debug.print("Unknown script: {s}\n", .{field});
                        return error.UnknownScript;
                    };
                    for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), @intFromEnum(script));
                },
                else => {},
            }
        }
    }

    // Distinct block contents -> offset of that block within stage 2.
    var blocks_map = BlockMap.init(allocator);
    defer blocks_map.deinit();

    var stage1 = std.ArrayList(u16).init(allocator);
    defer stage1.deinit();

    var stage2 = std.ArrayList(u8).init(allocator);
    defer stage2.deinit();

    var stage3 = std.ArrayList(u8).init(allocator);
    defer stage3.deinit();

    var block: Block = [_]u8{0} ** block_size;
    var block_len: u16 = 0;

    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        // Code points without an entry get ordinal 0 (`Script.none`).
        const script = flat_map.get(cp) orelse 0;

        // Find the script's slot in stage 3, appending it on first sight.
        const stage3_idx = blk: {
            for (stage3.items, 0..) |script_i, j| {
                if (script == script_i) break :blk j;
            }
            try stage3.append(script);
            break :blk stage3.items.len - 1;
        };

        // Process block
        block[block_len] = @intCast(stage3_idx);
        block_len += 1;

        // Keep filling until the block is complete or the last code point is reached.
        if (block_len < block_size and cp != 0x10ffff) continue;

        // Append the block to stage 2 only if an identical one is not already there.
        const gop = try blocks_map.getOrPut(block);
        if (!gop.found_existing) {
            gop.value_ptr.* = @intCast(stage2.items.len);
            try stage2.appendSlice(&block);
        }

        try stage1.append(gop.value_ptr.*);
        block_len = 0;
    }

    var args_iter = try std.process.argsWithAllocator(allocator);
    defer args_iter.deinit();
    // Skip the program name; the next argument is the output path.
    _ = args_iter.skip();
    const output_path = args_iter.next() orelse @panic("No output file arg!");

    const compressor = std.compress.deflate.compressor;
    var out_file = try std.fs.cwd().createFile(output_path, .{});
    defer out_file.close();
    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
    defer out_comp.deinit();
    const writer = out_comp.writer();

    // Each stage is written as a length prefix followed by its items.
    const endian = builtin.cpu.arch.endian();
    try writer.writeInt(u16, @intCast(stage1.items.len), endian);
    for (stage1.items) |i| try writer.writeInt(u16, i, endian);

    try writer.writeInt(u16, @intCast(stage2.items.len), endian);
    for (stage2.items) |i| try writer.writeInt(u8, i, endian);

    try writer.writeInt(u8, @intCast(stage3.items.len), endian);
    for (stage3.items) |i| try writer.writeInt(u8, i, endian);

    try out_comp.flush();
}
diff --git a/src/CanonData.zig b/src/CanonData.zig
index 36895ff..9f1deb8 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -42,7 +42,7 @@ pub fn init(allocator: mem.Allocator) !Self {
42 return self; 42 return self;
43} 43}
44 44
45pub fn deinit(self: *Self) void { 45pub fn deinit(self: *const Self) void {
46 self.nfc.deinit(); 46 self.nfc.deinit();
47 for (self.nfd) |slice| self.allocator.free(slice); 47 for (self.nfd) |slice| self.allocator.free(slice);
48 self.allocator.free(self.nfd); 48 self.allocator.free(self.nfd);
diff --git a/src/CaseData.zig b/src/CaseData.zig
index 4f06636..c9ccc1e 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
@@ -77,7 +77,7 @@ pub fn init(allocator: mem.Allocator) !Self {
77 return self; 77 return self;
78} 78}
79 79
80pub fn deinit(self: *Self) void { 80pub fn deinit(self: *const Self) void {
81 self.allocator.free(self.case_map); 81 self.allocator.free(self.case_map);
82 self.allocator.free(self.prop_s1); 82 self.allocator.free(self.prop_s1);
83 self.allocator.free(self.prop_s2); 83 self.allocator.free(self.prop_s2);
@@ -103,7 +103,7 @@ pub fn isUpperStr(self: Self, str: []const u8) bool {
103} 103}
104 104
105test "isUpperStr" { 105test "isUpperStr" {
106 var cd = try init(testing.allocator); 106 const cd = try init(testing.allocator);
107 defer cd.deinit(); 107 defer cd.deinit();
108 108
109 try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); 109 try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
@@ -138,7 +138,7 @@ pub fn toUpperStr(
138} 138}
139 139
140test "toUpperStr" { 140test "toUpperStr" {
141 var cd = try init(testing.allocator); 141 const cd = try init(testing.allocator);
142 defer cd.deinit(); 142 defer cd.deinit();
143 143
144 const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); 144 const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
@@ -161,7 +161,7 @@ pub fn isLowerStr(self: Self, str: []const u8) bool {
161} 161}
162 162
163test "isLowerStr" { 163test "isLowerStr" {
164 var cd = try init(testing.allocator); 164 const cd = try init(testing.allocator);
165 defer cd.deinit(); 165 defer cd.deinit();
166 166
167 try testing.expect(cd.isLowerStr("hello, world 2112!")); 167 try testing.expect(cd.isLowerStr("hello, world 2112!"));
@@ -196,7 +196,7 @@ pub fn toLowerStr(
196} 196}
197 197
198test "toLowerStr" { 198test "toLowerStr" {
199 var cd = try init(testing.allocator); 199 const cd = try init(testing.allocator);
200 defer cd.deinit(); 200 defer cd.deinit();
201 201
202 const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); 202 const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index e00d03b..9b10e16 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -90,11 +90,11 @@ pub fn compatCaselessMatch(
90test "compatCaselessMatch" { 90test "compatCaselessMatch" {
91 const allocator = testing.allocator; 91 const allocator = testing.allocator;
92 92
93 var norm_data = try Normalize.NormData.init(allocator); 93 const norm_data = try Normalize.NormData.init(allocator);
94 defer norm_data.deinit(); 94 defer norm_data.deinit();
95 const n = Normalize{ .norm_data = &norm_data }; 95 const n = Normalize{ .norm_data = &norm_data };
96 96
97 var fold_data = try FoldData.init(allocator); 97 const fold_data = try FoldData.init(allocator);
98 defer fold_data.deinit(); 98 defer fold_data.deinit();
99 const caser = Self{ .fold_data = &fold_data }; 99 const caser = Self{ .fold_data = &fold_data };
100 100
@@ -163,11 +163,11 @@ pub fn canonCaselessMatch(
163test "canonCaselessMatch" { 163test "canonCaselessMatch" {
164 const allocator = testing.allocator; 164 const allocator = testing.allocator;
165 165
166 var norm_data = try Normalize.NormData.init(allocator); 166 const norm_data = try Normalize.NormData.init(allocator);
167 defer norm_data.deinit(); 167 defer norm_data.deinit();
168 const n = Normalize{ .norm_data = &norm_data }; 168 const n = Normalize{ .norm_data = &norm_data };
169 169
170 var fold_data = try FoldData.init(allocator); 170 const fold_data = try FoldData.init(allocator);
171 defer fold_data.deinit(); 171 defer fold_data.deinit();
172 const caser = Self{ .fold_data = &fold_data }; 172 const caser = Self{ .fold_data = &fold_data };
173 173
diff --git a/src/CombiningData.zig b/src/CombiningData.zig
index 95c947d..c67638c 100644
--- a/src/CombiningData.zig
+++ b/src/CombiningData.zig
@@ -32,7 +32,7 @@ pub fn init(allocator: mem.Allocator) !Self {
32 return self; 32 return self;
33} 33}
34 34
35pub fn deinit(self: *Self) void { 35pub fn deinit(self: *const Self) void {
36 self.allocator.free(self.s1); 36 self.allocator.free(self.s1);
37 self.allocator.free(self.s2); 37 self.allocator.free(self.s2);
38} 38}
diff --git a/src/CompatData.zig b/src/CompatData.zig
index fd7f678..67c43e6 100644
--- a/src/CompatData.zig
+++ b/src/CompatData.zig
@@ -37,7 +37,7 @@ pub fn init(allocator: mem.Allocator) !Self {
37 return self; 37 return self;
38} 38}
39 39
40pub fn deinit(self: *Self) void { 40pub fn deinit(self: *const Self) void {
41 for (self.nfkd) |slice| { 41 for (self.nfkd) |slice| {
42 if (slice.len != 0) self.allocator.free(slice); 42 if (slice.len != 0) self.allocator.free(slice);
43 } 43 }
diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig
index 8d5eb0f..e547adf 100644
--- a/src/DisplayWidth.zig
+++ b/src/DisplayWidth.zig
@@ -56,7 +56,7 @@ pub fn strWidth(self: Self, str: []const u8) usize {
56} 56}
57 57
58test "strWidth" { 58test "strWidth" {
59 var data = try DisplayWidthData.init(testing.allocator); 59 const data = try DisplayWidthData.init(testing.allocator);
60 defer data.deinit(); 60 defer data.deinit();
61 const self = Self{ .data = &data }; 61 const self = Self{ .data = &data };
62 62
@@ -157,7 +157,7 @@ pub fn center(
157 157
158test "center" { 158test "center" {
159 const allocator = testing.allocator; 159 const allocator = testing.allocator;
160 var data = try DisplayWidthData.init(allocator); 160 const data = try DisplayWidthData.init(allocator);
161 defer data.deinit(); 161 defer data.deinit();
162 const self = Self{ .data = &data }; 162 const self = Self{ .data = &data };
163 163
@@ -236,7 +236,7 @@ pub fn padLeft(
236 236
237test "padLeft" { 237test "padLeft" {
238 const allocator = testing.allocator; 238 const allocator = testing.allocator;
239 var data = try DisplayWidthData.init(allocator); 239 const data = try DisplayWidthData.init(allocator);
240 defer data.deinit(); 240 defer data.deinit();
241 const self = Self{ .data = &data }; 241 const self = Self{ .data = &data };
242 242
@@ -286,7 +286,7 @@ pub fn padRight(
286 286
287test "padRight" { 287test "padRight" {
288 const allocator = testing.allocator; 288 const allocator = testing.allocator;
289 var data = try DisplayWidthData.init(allocator); 289 const data = try DisplayWidthData.init(allocator);
290 defer data.deinit(); 290 defer data.deinit();
291 const self = Self{ .data = &data }; 291 const self = Self{ .data = &data };
292 292
@@ -339,7 +339,7 @@ pub fn wrap(
339 339
340test "wrap" { 340test "wrap" {
341 const allocator = testing.allocator; 341 const allocator = testing.allocator;
342 var data = try DisplayWidthData.init(allocator); 342 const data = try DisplayWidthData.init(allocator);
343 defer data.deinit(); 343 defer data.deinit();
344 const self = Self{ .data = &data }; 344 const self = Self{ .data = &data };
345 345
diff --git a/src/FoldData.zig b/src/FoldData.zig
index 2a9a1f5..e387447 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -41,7 +41,7 @@ pub fn init(allocator: mem.Allocator) !Self {
41 return self; 41 return self;
42} 42}
43 43
44pub fn deinit(self: *Self) void { 44pub fn deinit(self: *const Self) void {
45 for (self.fold) |slice| self.allocator.free(slice); 45 for (self.fold) |slice| self.allocator.free(slice);
46 self.allocator.free(self.fold); 46 self.allocator.free(self.fold);
47 self.allocator.free(self.cwcf); 47 self.allocator.free(self.cwcf);
diff --git a/src/GenCatData.zig b/src/GenCatData.zig
index b45135b..37ae037 100644
--- a/src/GenCatData.zig
+++ b/src/GenCatData.zig
@@ -71,7 +71,7 @@ pub fn init(allocator: mem.Allocator) !Self {
71 return self; 71 return self;
72} 72}
73 73
74pub fn deinit(self: *Self) void { 74pub fn deinit(self: *const Self) void {
75 self.allocator.free(self.s1); 75 self.allocator.free(self.s1);
76 self.allocator.free(self.s2); 76 self.allocator.free(self.s2);
77 self.allocator.free(self.s3); 77 self.allocator.free(self.s3);
diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig
index e418dea..971929a 100644
--- a/src/GraphemeData.zig
+++ b/src/GraphemeData.zig
@@ -64,7 +64,7 @@ pub fn init(allocator: mem.Allocator) !Self {
64 return self; 64 return self;
65} 65}
66 66
67pub fn deinit(self: *Self) void { 67pub fn deinit(self: *const Self) void {
68 self.allocator.free(self.s1); 68 self.allocator.free(self.s1);
69 self.allocator.free(self.s2); 69 self.allocator.free(self.s2);
70 self.allocator.free(self.s3); 70 self.allocator.free(self.s3);
diff --git a/src/HangulData.zig b/src/HangulData.zig
index b97424c..ec360e9 100644
--- a/src/HangulData.zig
+++ b/src/HangulData.zig
@@ -41,7 +41,7 @@ pub fn init(allocator: mem.Allocator) !Self {
41 return self; 41 return self;
42} 42}
43 43
44pub fn deinit(self: *Self) void { 44pub fn deinit(self: *const Self) void {
45 self.allocator.free(self.s1); 45 self.allocator.free(self.s1);
46 self.allocator.free(self.s2); 46 self.allocator.free(self.s2);
47} 47}
diff --git a/src/NormData.zig b/src/NormData.zig
index 8a7fa49..413619a 100644
--- a/src/NormData.zig
+++ b/src/NormData.zig
@@ -26,7 +26,7 @@ pub fn init(allocator: std.mem.Allocator) !Self {
26 }; 26 };
27} 27}
28 28
29pub fn deinit(self: *Self) void { 29pub fn deinit(self: *const Self) void {
30 self.canon_data.deinit(); 30 self.canon_data.deinit();
31 self.ccc_data.deinit(); 31 self.ccc_data.deinit();
32 self.compat_data.deinit(); 32 self.compat_data.deinit();
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig
index 3c49712..893a8d0 100644
--- a/src/NormPropsData.zig
+++ b/src/NormPropsData.zig
@@ -32,7 +32,7 @@ pub fn init(allocator: mem.Allocator) !Self {
32 return self; 32 return self;
33} 33}
34 34
35pub fn deinit(self: *Self) void { 35pub fn deinit(self: *const Self) void {
36 self.allocator.free(self.s1); 36 self.allocator.free(self.s1);
37 self.allocator.free(self.s2); 37 self.allocator.free(self.s2);
38} 38}
diff --git a/src/Normalize.zig b/src/Normalize.zig
index 6ef7c90..daf774d 100644
--- a/src/Normalize.zig
+++ b/src/Normalize.zig
@@ -177,7 +177,7 @@ fn decompose(
177 177
178test "decompose" { 178test "decompose" {
179 const allocator = testing.allocator; 179 const allocator = testing.allocator;
180 var data = try NormData.init(allocator); 180 const data = try NormData.init(allocator);
181 defer data.deinit(); 181 defer data.deinit();
182 var n = Self{ .norm_data = &data }; 182 var n = Self{ .norm_data = &data };
183 183
@@ -225,7 +225,7 @@ pub const Result = struct {
225 allocator: ?mem.Allocator = null, 225 allocator: ?mem.Allocator = null,
226 slice: []const u8, 226 slice: []const u8,
227 227
228 pub fn deinit(self: *Result) void { 228 pub fn deinit(self: *const Result) void {
229 if (self.allocator) |allocator| allocator.free(self.slice); 229 if (self.allocator) |allocator| allocator.free(self.slice);
230 } 230 }
231}; 231};
@@ -297,11 +297,11 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu
297 297
298test "nfd ASCII / no-alloc" { 298test "nfd ASCII / no-alloc" {
299 const allocator = testing.allocator; 299 const allocator = testing.allocator;
300 var data = try NormData.init(allocator); 300 const data = try NormData.init(allocator);
301 defer data.deinit(); 301 defer data.deinit();
302 var n = Self{ .norm_data = &data }; 302 const n = Self{ .norm_data = &data };
303 303
304 var result = try n.nfd(allocator, "Hello World!"); 304 const result = try n.nfd(allocator, "Hello World!");
305 defer result.deinit(); 305 defer result.deinit();
306 306
307 try testing.expectEqualStrings("Hello World!", result.slice); 307 try testing.expectEqualStrings("Hello World!", result.slice);
@@ -309,11 +309,11 @@ test "nfd ASCII / no-alloc" {
309 309
310test "nfd !ASCII / alloc" { 310test "nfd !ASCII / alloc" {
311 const allocator = testing.allocator; 311 const allocator = testing.allocator;
312 var data = try NormData.init(allocator); 312 const data = try NormData.init(allocator);
313 defer data.deinit(); 313 defer data.deinit();
314 var n = Self{ .norm_data = &data }; 314 const n = Self{ .norm_data = &data };
315 315
316 var result = try n.nfd(allocator, "Héllo World! \u{3d3}"); 316 const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
317 defer result.deinit(); 317 defer result.deinit();
318 318
319 try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice); 319 try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
@@ -321,11 +321,11 @@ test "nfd !ASCII / alloc" {
321 321
322test "nfkd ASCII / no-alloc" { 322test "nfkd ASCII / no-alloc" {
323 const allocator = testing.allocator; 323 const allocator = testing.allocator;
324 var data = try NormData.init(allocator); 324 const data = try NormData.init(allocator);
325 defer data.deinit(); 325 defer data.deinit();
326 var n = Self{ .norm_data = &data }; 326 const n = Self{ .norm_data = &data };
327 327
328 var result = try n.nfkd(allocator, "Hello World!"); 328 const result = try n.nfkd(allocator, "Hello World!");
329 defer result.deinit(); 329 defer result.deinit();
330 330
331 try testing.expectEqualStrings("Hello World!", result.slice); 331 try testing.expectEqualStrings("Hello World!", result.slice);
@@ -333,11 +333,11 @@ test "nfkd ASCII / no-alloc" {
333 333
334test "nfkd !ASCII / alloc" { 334test "nfkd !ASCII / alloc" {
335 const allocator = testing.allocator; 335 const allocator = testing.allocator;
336 var data = try NormData.init(allocator); 336 const data = try NormData.init(allocator);
337 defer data.deinit(); 337 defer data.deinit();
338 var n = Self{ .norm_data = &data }; 338 const n = Self{ .norm_data = &data };
339 339
340 var result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); 340 const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
341 defer result.deinit(); 341 defer result.deinit();
342 342
343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
@@ -532,11 +532,11 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) !Resu
532 532
533test "nfc" { 533test "nfc" {
534 const allocator = testing.allocator; 534 const allocator = testing.allocator;
535 var data = try NormData.init(allocator); 535 const data = try NormData.init(allocator);
536 defer data.deinit(); 536 defer data.deinit();
537 var n = Self{ .norm_data = &data }; 537 const n = Self{ .norm_data = &data };
538 538
539 var result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); 539 const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
540 defer result.deinit(); 540 defer result.deinit();
541 541
542 try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice); 542 try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
@@ -544,11 +544,11 @@ test "nfc" {
544 544
545test "nfkc" { 545test "nfkc" {
546 const allocator = testing.allocator; 546 const allocator = testing.allocator;
547 var data = try NormData.init(allocator); 547 const data = try NormData.init(allocator);
548 defer data.deinit(); 548 defer data.deinit();
549 var n = Self{ .norm_data = &data }; 549 const n = Self{ .norm_data = &data };
550 550
551 var result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); 551 const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
552 defer result.deinit(); 552 defer result.deinit();
553 553
554 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); 554 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
@@ -556,9 +556,9 @@ test "nfkc" {
556 556
557/// Tests for equality of `a` and `b` after normalizing to NFC. 557/// Tests for equality of `a` and `b` after normalizing to NFC.
558pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { 558pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
559 var norm_result_a = try self.nfc(allocator, a); 559 const norm_result_a = try self.nfc(allocator, a);
560 defer norm_result_a.deinit(); 560 defer norm_result_a.deinit();
561 var norm_result_b = try self.nfc(allocator, b); 561 const norm_result_b = try self.nfc(allocator, b);
562 defer norm_result_b.deinit(); 562 defer norm_result_b.deinit();
563 563
564 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); 564 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
@@ -566,9 +566,9 @@ pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !
566 566
567test "eql" { 567test "eql" {
568 const allocator = testing.allocator; 568 const allocator = testing.allocator;
569 var data = try NormData.init(allocator); 569 const data = try NormData.init(allocator);
570 defer data.deinit(); 570 defer data.deinit();
571 var n = Self{ .norm_data = &data }; 571 const n = Self{ .norm_data = &data };
572 572
573 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); 573 try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
574 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); 574 try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
@@ -601,9 +601,9 @@ fn isFcd(self: Self, str: []const u8) bool {
601 601
602test "isFcd" { 602test "isFcd" {
603 const allocator = testing.allocator; 603 const allocator = testing.allocator;
604 var data = try NormData.init(allocator); 604 const data = try NormData.init(allocator);
605 defer data.deinit(); 605 defer data.deinit();
606 var n = Self{ .norm_data = &data }; 606 const n = Self{ .norm_data = &data };
607 607
608 const is_nfc = "José \u{3D3}"; 608 const is_nfc = "José \u{3D3}";
609 try testing.expect(n.isFcd(is_nfc)); 609 try testing.expect(n.isFcd(is_nfc));
@@ -620,9 +620,9 @@ test "Unicode normalization tests" {
620 defer arena.deinit(); 620 defer arena.deinit();
621 var allocator = arena.allocator(); 621 var allocator = arena.allocator();
622 622
623 var data = try NormData.init(allocator); 623 const data = try NormData.init(allocator);
624 defer data.deinit(); 624 defer data.deinit();
625 var n = Self{ .norm_data = &data }; 625 const n = Self{ .norm_data = &data };
626 626
627 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); 627 var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
628 defer file.close(); 628 defer file.close();
@@ -721,7 +721,7 @@ test "Unicode normalization tests" {
721 } 721 }
722 722
723 const want = w_buf.items; 723 const want = w_buf.items;
724 var got = try n.nfkd(allocator, input); 724 const got = try n.nfkd(allocator, input);
725 defer got.deinit(); 725 defer got.deinit();
726 726
727 try testing.expectEqualStrings(want, got.slice); 727 try testing.expectEqualStrings(want, got.slice);
diff --git a/src/NumericData.zig b/src/NumericData.zig
index baf8f11..210d623 100644
--- a/src/NumericData.zig
+++ b/src/NumericData.zig
@@ -33,7 +33,7 @@ pub fn init(allocator: mem.Allocator) !Self {
33 return self; 33 return self;
34} 34}
35 35
36pub fn deinit(self: *Self) void { 36pub fn deinit(self: *const Self) void {
37 self.allocator.free(self.s1); 37 self.allocator.free(self.s1);
38 self.allocator.free(self.s2); 38 self.allocator.free(self.s2);
39} 39}
@@ -59,7 +59,7 @@ pub inline fn isDecimal(self: Self, cp: u21) bool {
59} 59}
60 60
61test "isDecimal" { 61test "isDecimal" {
62 var self = try init(testing.allocator); 62 const self = try init(testing.allocator);
63 defer self.deinit(); 63 defer self.deinit();
64 64
65 try testing.expect(self.isNumber('\u{277f}')); 65 try testing.expect(self.isNumber('\u{277f}'));
diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig
new file mode 100644
index 0000000..ac1c46a
--- /dev/null
+++ b/src/ScriptsData.zig
@@ -0,0 +1,226 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5const testing = std.testing;
6
/// Script identifies the script that `script` reports for a code point.
///
/// Members carry no explicit values, so each ordinal is its position in this
/// list, starting at 0. `script` converts a byte from the `s3` table straight
/// into a member with `@enumFromInt`, so the order here has to agree with the
/// ordinals stored in the generated data.
/// NOTE(review): presumably the generator (codegen/scripts.zig) emits ordinals
/// in this same order; confirm before inserting or reordering members.
///
/// The tag type is pinned to `u8` to match the byte-sized table entries: if the
/// list ever outgrows a byte, compilation fails here instead of the tag
/// silently widening.
pub const Script = enum(u8) {
    /// Ordinal 0. `script` returns `null` for this value instead of `.none`.
    none,
    Adlam,
    Ahom,
    Anatolian_Hieroglyphs,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Bassa_Vah,
    Batak,
    Bengali,
    Bhaiksuki,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Buhid,
    Canadian_Aboriginal,
    Carian,
    Caucasian_Albanian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Common,
    Coptic,
    Cuneiform,
    Cypriot,
    Cypro_Minoan,
    Cyrillic,
    Deseret,
    Devanagari,
    Dives_Akuru,
    Dogra,
    Duployan,
    Egyptian_Hieroglyphs,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Grantha,
    Greek,
    Gujarati,
    Gunjala_Gondi,
    Gurmukhi,
    Han,
    Hangul,
    Hanifi_Rohingya,
    Hanunoo,
    Hatran,
    Hebrew,
    Hiragana,
    Imperial_Aramaic,
    Inherited,
    Inscriptional_Pahlavi,
    Inscriptional_Parthian,
    Javanese,
    Kaithi,
    Kannada,
    Katakana,
    Kawi,
    Kayah_Li,
    Kharoshthi,
    Khitan_Small_Script,
    Khmer,
    Khojki,
    Khudawadi,
    Lao,
    Latin,
    Lepcha,
    Limbu,
    Linear_A,
    Linear_B,
    Lisu,
    Lycian,
    Lydian,
    Mahajani,
    Makasar,
    Malayalam,
    Mandaic,
    Manichaean,
    Marchen,
    Masaram_Gondi,
    Medefaidrin,
    Meetei_Mayek,
    Mende_Kikakui,
    Meroitic_Cursive,
    Meroitic_Hieroglyphs,
    Miao,
    Modi,
    Mongolian,
    Mro,
    Multani,
    Myanmar,
    Nabataean,
    Nag_Mundari,
    Nandinagari,
    New_Tai_Lue,
    Newa,
    Nko,
    Nushu,
    Nyiakeng_Puachue_Hmong,
    Ogham,
    Ol_Chiki,
    Old_Hungarian,
    Old_Italic,
    Old_North_Arabian,
    Old_Permic,
    Old_Persian,
    Old_Sogdian,
    Old_South_Arabian,
    Old_Turkic,
    Old_Uyghur,
    Oriya,
    Osage,
    Osmanya,
    Pahawh_Hmong,
    Palmyrene,
    Pau_Cin_Hau,
    Phags_Pa,
    Phoenician,
    Psalter_Pahlavi,
    Rejang,
    Runic,
    Samaritan,
    Saurashtra,
    Sharada,
    Shavian,
    Siddham,
    SignWriting,
    Sinhala,
    Sogdian,
    Sora_Sompeng,
    Soyombo,
    Sundanese,
    Syloti_Nagri,
    Syriac,
    Tagalog,
    Tagbanwa,
    Tai_Le,
    Tai_Tham,
    Tai_Viet,
    Takri,
    Tamil,
    Tangsa,
    Tangut,
    Telugu,
    Thaana,
    Thai,
    Tibetan,
    Tifinagh,
    Tirhuta,
    Toto,
    Ugaritic,
    Vai,
    Vithkuqi,
    Wancho,
    Warang_Citi,
    Yezidi,
    Yi,
    Zanabazar_Square,
};
174
/// Allocator stored by `init`; `deinit` frees `s1`, `s2` and `s3` through it.
175allocator: mem.Allocator,
/// First lookup level: `script` indexes it with `cp >> 8`. Filled by `init`.
176s1: []u16 = undefined,
/// Second lookup level: `script` indexes it with the `s1` entry plus the low 8 bits of `cp`. Filled by `init`.
177s2: []u8 = undefined,
/// Third lookup level: `script` indexes it with the `s2` entry and converts the byte to a `Script` (0 yields `null`). Filled by `init`.
178s3: []u8 = undefined,
179
// Alias for the struct type this file defines.
180const Self = @This();
181
/// Decompress the embedded `scripts` data and load the three lookup tables
/// into memory allocated from `allocator`.
///
/// Stream layout, as read below: a `u16` count followed by that many `u16`
/// entries for `s1`; a `u16` length followed by the bytes of `s2`; a `u8`
/// length followed by the bytes of `s3`. Integers are read in the target
/// CPU's native endianness.
///
/// Errors: any allocation, decompression or read error is returned, and a
/// stream that ends before a table is complete yields `error.EndOfStream`.
/// On every error path the tables allocated so far are freed. On success the
/// caller owns the result and releases it with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("scripts");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    // From here on, any failure must give this table back.
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    const s2_len = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u8, s2_len);
    errdefer allocator.free(self.s2);
    // `readAll` reports a short read through its count, not through an error.
    if (try reader.readAll(self.s2) != self.s2.len) return error.EndOfStream;

    // NOTE(review): this length is read as a single byte, unlike the two
    // above; confirm it matches the width the generator writes.
    const s3_len = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    if (try reader.readAll(self.s3) != self.s3.len) return error.EndOfStream;

    return self;
}
208
/// Free the `s1`, `s2` and `s3` tables using the allocator held in `self`.
pub fn deinit(self: *const Self) void {
    const owner = self.allocator;
    owner.free(self.s1);
    owner.free(self.s2);
    owner.free(self.s3);
}
214
/// Return the `Script` for `cp`, or `null` when the table byte for `cp` is 0.
///
/// The lookup chains three tables: `s1` is indexed by `cp >> 8`, `s2` by that
/// entry plus the low 8 bits of `cp`, and `s3` by the `s2` entry.
pub fn script(self: Self, cp: u21) ?Script {
    const high = cp >> 8;
    const low = cp & 0xff;
    const s2_index = self.s1[high] + low;
    const ordinal = self.s3[self.s2[s2_index]];
    if (ordinal == 0) return null;
    return @enumFromInt(ordinal);
}
221
test "script" {
    // Load the tables with the leak-checking test allocator.
    const data = try init(testing.allocator);
    defer data.deinit();

    // 'A' must resolve to a script, and that script must be Latin.
    const found = data.script('A').?;
    try testing.expectEqual(Script.Latin, found);
}
diff --git a/src/WidthData.zig b/src/WidthData.zig
index 32f8658..d17f0cd 100644
--- a/src/WidthData.zig
+++ b/src/WidthData.zig
@@ -39,7 +39,7 @@ pub fn init(allocator: mem.Allocator) !Self {
39 return self; 39 return self;
40} 40}
41 41
42pub fn deinit(self: *Self) void { 42pub fn deinit(self: *const Self) void {
43 self.allocator.free(self.s1); 43 self.allocator.free(self.s1);
44 self.allocator.free(self.s2); 44 self.allocator.free(self.s2);
45 self.g_data.deinit(); 45 self.g_data.deinit();
diff --git a/src/grapheme.zig b/src/grapheme.zig
index e55a6a4..ad43cfd 100644
--- a/src/grapheme.zig
+++ b/src/grapheme.zig
@@ -237,7 +237,7 @@ test "Segmentation GraphemeIterator" {
237 var buf_reader = std.io.bufferedReader(file.reader()); 237 var buf_reader = std.io.bufferedReader(file.reader());
238 var input_stream = buf_reader.reader(); 238 var input_stream = buf_reader.reader();
239 239
240 var data = try GraphemeData.init(allocator); 240 const data = try GraphemeData.init(allocator);
241 defer data.deinit(); 241 defer data.deinit();
242 242
243 var buf: [4096]u8 = undefined; 243 var buf: [4096]u8 = undefined;
@@ -302,7 +302,7 @@ test "Segmentation ZWJ and ZWSP emoji sequences" {
302 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; 302 const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
303 const no_joiner = seq_1 ++ seq_2; 303 const no_joiner = seq_1 ++ seq_2;
304 304
305 var data = try GraphemeData.init(std.testing.allocator); 305 const data = try GraphemeData.init(std.testing.allocator);
306 defer data.deinit(); 306 defer data.deinit();
307 307
308 var iter = Iterator.init(with_zwj, &data); 308 var iter = Iterator.init(with_zwj, &data);