From 74be85ac145cc6de5d03348e07be8d982c2211cb Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Thu, 28 Mar 2024 10:06:00 -0400 Subject: PropsData and errdefers for init fns --- build.zig | 33 +++++++++++- codegen/core_props.zig | 139 +++++++++++++++++++++++++++++++++++++++++++++++++ codegen/props.zig | 136 +++++++++++++++++++++++++++++++++++++++++++++++ src/CanonData.zig | 10 +++- src/CombiningData.zig | 2 + src/CompatData.zig | 1 + src/FoldData.zig | 8 +++ src/GenCatData.zig | 3 ++ src/GraphemeData.zig | 3 ++ src/HangulData.zig | 2 + src/NormData.zig | 31 ++++++----- src/NormPropsData.zig | 2 + src/NumericData.zig | 10 +--- src/PropsData.zig | 123 +++++++++++++++++++++++++++++++++++++++++++ src/ScriptsData.zig | 3 ++ src/WidthData.zig | 3 ++ 16 files changed, 485 insertions(+), 24 deletions(-) create mode 100644 codegen/core_props.zig create mode 100644 codegen/props.zig create mode 100644 src/PropsData.zig diff --git a/build.zig b/build.zig index 58c3f21..a24181a 100644 --- a/build.zig +++ b/build.zig @@ -146,6 +146,24 @@ pub fn build(b: *std.Build) void { const run_scripts_gen_exe = b.addRunArtifact(scripts_gen_exe); const scripts_gen_out = run_scripts_gen_exe.addOutputFileArg("scripts.bin.z"); + const core_gen_exe = b.addExecutable(.{ + .name = "core", + .root_source_file = .{ .path = "codegen/core_props.zig" }, + .target = b.host, + .optimize = .Debug, + }); + const run_core_gen_exe = b.addRunArtifact(core_gen_exe); + const core_gen_out = run_core_gen_exe.addOutputFileArg("core_props.bin.z"); + + const props_gen_exe = b.addExecutable(.{ + .name = "props", + .root_source_file = .{ .path = "codegen/props.zig" }, + .target = b.host, + .optimize = .Debug, + }); + const run_props_gen_exe = b.addRunArtifact(props_gen_exe); + const props_gen_out = run_props_gen_exe.addOutputFileArg("props.bin.z"); + // Modules we provide // Code points const code_point = b.addModule("code_point", .{ @@ -304,9 +322,18 @@ pub fn build(b: *std.Build) void { }); 
scripts_data.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); + // Properties (core_props + props lookup tables) + const props_data = b.addModule("PropsData", .{ + .root_source_file = .{ .path = "src/PropsData.zig" }, + .target = target, + .optimize = optimize, + }); + props_data.addAnonymousImport("core_props", .{ .root_source_file = core_gen_out }); + props_data.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); + // Tests const exe_unit_tests = b.addTest(.{ - .root_source_file = .{ .path = "src/ScriptsData.zig" }, + .root_source_file = .{ .path = "src/PropsData.zig" }, .target = target, .optimize = optimize, }); @@ -324,7 +351,9 @@ pub fn build(b: *std.Build) void { // exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); // exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); // exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); - exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); + // exe_unit_tests.root_module.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); + exe_unit_tests.root_module.addAnonymousImport("core_props", .{ .root_source_file = core_gen_out }); + exe_unit_tests.root_module.addAnonymousImport("props", .{ .root_source_file = props_gen_out }); // exe_unit_tests.filter = "nfd !ASCII"; const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); diff --git a/codegen/core_props.zig b/codegen/core_props.zig new file mode 100644 index 0000000..1f46f9e --- /dev/null +++ b/codegen/core_props.zig @@ -0,0 +1,139 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; + +const block_size = 256; +const Block = [block_size]u8; + +const BlockMap = std.HashMap( + Block, + u16, + struct { + pub fn hash(_: @This(), k: Block) u64 { + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, 
.DeepRecursive); + return hasher.final(); + } + + pub fn eql(_: @This(), a: Block, b: Block) bool { + return mem.eql(u8, &a, &b); + } + }, + std.hash_map.default_max_load_percentage, +); + +pub fn main() !void { + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena.deinit(); + const allocator = arena.allocator(); + + var flat_map = std.AutoHashMap(u21, u8).init(allocator); + defer flat_map.deinit(); + + var line_buf: [4096]u8 = undefined; + + // Process DerivedCoreProperties.txt + var in_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{}); + defer in_file.close(); + var in_buf = std.io.bufferedReader(in_file.reader()); + const in_reader = in_buf.reader(); + + while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { + if (line.len == 0 or line[0] == '#') continue; + const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; + + var field_iter = mem.tokenizeAny(u8, no_comment, "; "); + var current_code: [2]u21 = undefined; + + var i: usize = 0; + while (field_iter.next()) |field| : (i += 1) { + switch (i) { + 0 => { + // Code point(s) + if (mem.indexOf(u8, field, "..")) |dots| { + current_code = .{ + try std.fmt.parseInt(u21, field[0..dots], 16), + try std.fmt.parseInt(u21, field[dots + 2 ..], 16), + }; + } else { + const code = try std.fmt.parseInt(u21, field, 16); + current_code = .{ code, code }; + } + }, + 1 => { + // Core property + var bit: u8 = 0; + + if (mem.eql(u8, field, "Math")) bit = 1; + if (mem.eql(u8, field, "Alphabetic")) bit = 2; + if (mem.eql(u8, field, "ID_Start")) bit = 4; + if (mem.eql(u8, field, "ID_Continue")) bit = 8; + if (mem.eql(u8, field, "XID_Start")) bit = 16; + if (mem.eql(u8, field, "XID_Continue")) bit = 32; + + if (bit != 0) { + for (current_code[0]..current_code[1] + 1) |cp| { + const gop = try flat_map.getOrPut(@intCast(cp)); + if (!gop.found_existing) gop.value_ptr.* = 0; + gop.value_ptr.* |= bit; + } + } + }, + else => {}, + } + 
} + } + + var blocks_map = BlockMap.init(allocator); + defer blocks_map.deinit(); + + var stage1 = std.ArrayList(u16).init(allocator); + defer stage1.deinit(); + + var stage2 = std.ArrayList(u8).init(allocator); + defer stage2.deinit(); + + var block: Block = [_]u8{0} ** block_size; + var block_len: u16 = 0; + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + const prop = flat_map.get(cp) orelse 0; + + // Process block + block[block_len] = prop; + block_len += 1; + + if (block_len < block_size and cp != 0x10ffff) continue; + + const gop = try blocks_map.getOrPut(block); + if (!gop.found_existing) { + gop.value_ptr.* = @intCast(stage2.items.len); + try stage2.appendSlice(&block); + } + + try stage1.append(gop.value_ptr.*); + block_len = 0; + } + + var args_iter = try std.process.argsWithAllocator(allocator); + defer args_iter.deinit(); + _ = args_iter.skip(); + const output_path = args_iter.next() orelse @panic("No output file arg!"); + + const compressor = std.compress.deflate.compressor; + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); + defer out_comp.deinit(); + const writer = out_comp.writer(); + + const endian = builtin.cpu.arch.endian(); + try writer.writeInt(u16, @intCast(stage1.items.len), endian); + for (stage1.items) |i| try writer.writeInt(u16, i, endian); + + try writer.writeInt(u16, @intCast(stage2.items.len), endian); + try writer.writeAll(stage2.items); + + try out_comp.flush(); +} diff --git a/codegen/props.zig b/codegen/props.zig new file mode 100644 index 0000000..57a205e --- /dev/null +++ b/codegen/props.zig @@ -0,0 +1,136 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; + +const block_size = 256; +const Block = [block_size]u8; + +const BlockMap = std.HashMap( + Block, + u16, + struct { + pub fn hash(_: @This(), k: Block) u64 { + var hasher = 
std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(_: @This(), a: Block, b: Block) bool { + return mem.eql(u8, &a, &b); + } + }, + std.hash_map.default_max_load_percentage, +); + +pub fn main() !void { + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena.deinit(); + const allocator = arena.allocator(); + + var flat_map = std.AutoHashMap(u21, u8).init(allocator); + defer flat_map.deinit(); + + var line_buf: [4096]u8 = undefined; + + // Process PropList.txt + var in_file = try std.fs.cwd().openFile("data/unicode/PropList.txt", .{}); + defer in_file.close(); + var in_buf = std.io.bufferedReader(in_file.reader()); + const in_reader = in_buf.reader(); + + while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { + if (line.len == 0 or line[0] == '#') continue; + const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; + + var field_iter = mem.tokenizeAny(u8, no_comment, "; "); + var current_code: [2]u21 = undefined; + + var i: usize = 0; + while (field_iter.next()) |field| : (i += 1) { + switch (i) { + 0 => { + // Code point(s) + if (mem.indexOf(u8, field, "..")) |dots| { + current_code = .{ + try std.fmt.parseInt(u21, field[0..dots], 16), + try std.fmt.parseInt(u21, field[dots + 2 ..], 16), + }; + } else { + const code = try std.fmt.parseInt(u21, field, 16); + current_code = .{ code, code }; + } + }, + 1 => { + // Core property + var bit: u8 = 0; + + if (mem.eql(u8, field, "White_Space")) bit = 1; + if (mem.eql(u8, field, "Hex_Digit")) bit = 2; + if (mem.eql(u8, field, "Diacritic")) bit = 4; + + if (bit != 0) { + for (current_code[0]..current_code[1] + 1) |cp| { + const gop = try flat_map.getOrPut(@intCast(cp)); + if (!gop.found_existing) gop.value_ptr.* = 0; + gop.value_ptr.* |= bit; + } + } + }, + else => {}, + } + } + } + + var blocks_map = BlockMap.init(allocator); + defer blocks_map.deinit(); + + var stage1 = 
std.ArrayList(u16).init(allocator); + defer stage1.deinit(); + + var stage2 = std.ArrayList(u8).init(allocator); + defer stage2.deinit(); + + var block: Block = [_]u8{0} ** block_size; + var block_len: u16 = 0; + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + const prop = flat_map.get(cp) orelse 0; + + // Process block + block[block_len] = prop; + block_len += 1; + + if (block_len < block_size and cp != 0x10ffff) continue; + + const gop = try blocks_map.getOrPut(block); + if (!gop.found_existing) { + gop.value_ptr.* = @intCast(stage2.items.len); + try stage2.appendSlice(&block); + } + + try stage1.append(gop.value_ptr.*); + block_len = 0; + } + + var args_iter = try std.process.argsWithAllocator(allocator); + defer args_iter.deinit(); + _ = args_iter.skip(); + const output_path = args_iter.next() orelse @panic("No output file arg!"); + + const compressor = std.compress.deflate.compressor; + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); + defer out_comp.deinit(); + const writer = out_comp.writer(); + + const endian = builtin.cpu.arch.endian(); + try writer.writeInt(u16, @intCast(stage1.items.len), endian); + for (stage1.items) |i| try writer.writeInt(u16, i, endian); + + try writer.writeInt(u16, @intCast(stage2.items.len), endian); + try writer.writeAll(stage2.items); + + try out_comp.flush(); +} diff --git a/src/CanonData.zig b/src/CanonData.zig index 9f1deb8..64d5555 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -24,6 +24,13 @@ pub fn init(allocator: mem.Allocator) !Self { .nfd = try allocator.alloc([]u21, 0x110000), }; + var slices: usize = 0; + errdefer { + self.nfc.deinit(); + for (self.nfd) |slice| self.allocator.free(slice); // entries live at index cp, so sweep the whole table + self.allocator.free(self.nfd); + } + @memset(self.nfd, &.{}); while (true) { @@ -31,6 +38,7 @@ pub fn init(allocator: mem.Allocator) !Self { if (len == 0) break;
const cp = try reader.readInt(u24, endian); self.nfd[cp] = try allocator.alloc(u21, len - 1); + slices += 1; for (0..len - 1) |i| { self.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); } @@ -42,7 +50,7 @@ pub fn init(allocator: mem.Allocator) !Self { return self; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: *Self) void { self.nfc.deinit(); for (self.nfd) |slice| self.allocator.free(slice); self.allocator.free(self.nfd); diff --git a/src/CombiningData.zig b/src/CombiningData.zig index c67638c..a40cbde 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig @@ -23,10 +23,12 @@ pub fn init(allocator: mem.Allocator) !Self { const stage_1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.s1); for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u8, stage_2_len); + errdefer allocator.free(self.s2); _ = try reader.readAll(self.s2); return self; diff --git a/src/CompatData.zig b/src/CompatData.zig index 67c43e6..a931cb3 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig @@ -21,6 +21,7 @@ pub fn init(allocator: mem.Allocator) !Self { .allocator = allocator, .nfkd = try allocator.alloc([]u21, 0x110000), }; + errdefer self.deinit(); @memset(self.nfkd, &.{}); diff --git a/src/FoldData.zig b/src/FoldData.zig index e387447..a06eefe 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig @@ -24,6 +24,13 @@ pub fn init(allocator: mem.Allocator) !Self { .cwcf = try allocator.alloc(bool, 0x110000), }; + var slices: usize = 0; + errdefer { + for (self.fold) |slice| self.allocator.free(slice); // entries live at index cp >> 1, so sweep the whole table + self.allocator.free(self.fold); + self.allocator.free(self.cwcf); + } + @memset(self.fold, &.{}); @memset(self.cwcf, false); @@ -32,6 +39,7 @@ pub fn init(allocator: mem.Allocator) !Self { if (len == 0) break; const cp = try reader.readInt(u24, endian); self.fold[cp >> 
1] = try allocator.alloc(u21, len - 1); + slices += 1; for (0..len - 1) |i| { self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian)); } diff --git a/src/GenCatData.zig b/src/GenCatData.zig index 37ae037..12501bf 100644 --- a/src/GenCatData.zig +++ b/src/GenCatData.zig @@ -58,14 +58,17 @@ pub fn init(allocator: mem.Allocator) !Self { const s1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(self.s1); for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const s2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u5, s2_len); + errdefer allocator.free(self.s2); for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); const s3_len: u16 = try reader.readInt(u8, endian); self.s3 = try allocator.alloc(u5, s3_len); + errdefer allocator.free(self.s3); for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian)); return self; diff --git a/src/GraphemeData.zig b/src/GraphemeData.zig index 971929a..500ffea 100644 --- a/src/GraphemeData.zig +++ b/src/GraphemeData.zig @@ -51,14 +51,17 @@ pub fn init(allocator: mem.Allocator) !Self { const s1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(self.s1); for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const s2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u16, s2_len); + errdefer allocator.free(self.s2); for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian); const s3_len: u16 = try reader.readInt(u16, endian); self.s3 = try allocator.alloc(u8, s3_len); + errdefer allocator.free(self.s3); _ = try reader.readAll(self.s3); return self; diff --git a/src/HangulData.zig b/src/HangulData.zig index ec360e9..99d91c1 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig @@ -32,10 +32,12 @@ pub fn init(allocator: mem.Allocator) !Self { const stage_1_len: u16 = try 
reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.s1); for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u3, stage_2_len); + errdefer allocator.free(self.s2); for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); return self; diff --git a/src/NormData.zig b/src/NormData.zig index 413619a..7ffe679 100644 --- a/src/NormData.zig +++ b/src/NormData.zig @@ -8,25 +8,30 @@ const FoldData = @import("FoldData"); const HangulData = @import("HangulData"); const NormPropsData = @import("NormPropsData"); -canon_data: CanonData, -ccc_data: CccData, -compat_data: CompatData, -hangul_data: HangulData, -normp_data: NormPropsData, +canon_data: CanonData = undefined, +ccc_data: CccData = undefined, +compat_data: CompatData = undefined, +hangul_data: HangulData = undefined, +normp_data: NormPropsData = undefined, const Self = @This(); pub fn init(allocator: std.mem.Allocator) !Self { - return Self{ - .canon_data = try CanonData.init(allocator), - .ccc_data = try CccData.init(allocator), - .compat_data = try CompatData.init(allocator), - .hangul_data = try HangulData.init(allocator), - .normp_data = try NormPropsData.init(allocator), - }; + var self = Self{}; + self.canon_data = try CanonData.init(allocator); + errdefer self.canon_data.deinit(); + self.ccc_data = try CccData.init(allocator); + errdefer self.ccc_data.deinit(); + self.compat_data = try CompatData.init(allocator); + errdefer self.compat_data.deinit(); + self.hangul_data = try HangulData.init(allocator); + errdefer self.hangul_data.deinit(); + self.normp_data = try NormPropsData.init(allocator); + + return self; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: *Self) void { self.canon_data.deinit(); self.ccc_data.deinit(); self.compat_data.deinit(); diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index 
893a8d0..86d497b 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig @@ -23,10 +23,12 @@ pub fn init(allocator: mem.Allocator) !Self { const stage_1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.s1); for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u4, stage_2_len); + errdefer allocator.free(self.s2); for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); return self; diff --git a/src/NumericData.zig b/src/NumericData.zig index 210d623..28e8206 100644 --- a/src/NumericData.zig +++ b/src/NumericData.zig @@ -24,10 +24,12 @@ pub fn init(allocator: mem.Allocator) !Self { const stage_1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.s1); for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u8, stage_2_len); + errdefer allocator.free(self.s2); _ = try reader.readAll(self.s2); return self; @@ -38,11 +40,6 @@ pub fn deinit(self: *const Self) void { self.allocator.free(self.s2); } -/// True if `cp` is any numeric type. -pub fn isNumber(self: Self, cp: u21) bool { - return self.isNumeric(cp) or self.isDigit(cp) or self.isDecimal(cp); -} - /// True if `cp` is numeric. 
pub inline fn isNumeric(self: Self, cp: u21) bool { return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 1; @@ -62,13 +59,10 @@ test "isDecimal" { const self = try init(testing.allocator); defer self.deinit(); - try testing.expect(self.isNumber('\u{277f}')); - try testing.expect(self.isNumber('3')); try testing.expect(self.isNumeric('\u{277f}')); try testing.expect(self.isDigit('\u{2070}')); try testing.expect(self.isDecimal('3')); - try testing.expect(!self.isNumber('z')); try testing.expect(!self.isNumeric('1')); try testing.expect(!self.isDigit('2')); try testing.expect(!self.isDecimal('g')); diff --git a/src/PropsData.zig b/src/PropsData.zig new file mode 100644 index 0000000..252462e --- /dev/null +++ b/src/PropsData.zig @@ -0,0 +1,123 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; + +allocator: mem.Allocator, +core_s1: []u16 = undefined, +core_s2: []u8 = undefined, +props_s1: []u16 = undefined, +props_s2: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.deflate.decompressor; + const endian = builtin.cpu.arch.endian(); + + // Process DerivedCoreProperties.txt + const core_bytes = @embedFile("core_props"); + var core_fbs = std.io.fixedBufferStream(core_bytes); + var core_decomp = try decompressor(allocator, core_fbs.reader(), null); + defer core_decomp.deinit(); + var core_reader = core_decomp.reader(); + + var self = Self{ .allocator = allocator }; + + const core_stage_1_len: u16 = try core_reader.readInt(u16, endian); + self.core_s1 = try allocator.alloc(u16, core_stage_1_len); + errdefer allocator.free(self.core_s1); + for (0..core_stage_1_len) |i| self.core_s1[i] = try core_reader.readInt(u16, endian); + + const core_stage_2_len: u16 = try core_reader.readInt(u16, endian); + self.core_s2 = try allocator.alloc(u8, core_stage_2_len); + errdefer allocator.free(self.core_s2); + _ = 
try core_reader.readAll(self.core_s2); + + // Process PropList.txt + const props_bytes = @embedFile("props"); + var props_fbs = std.io.fixedBufferStream(props_bytes); + var props_decomp = try decompressor(allocator, props_fbs.reader(), null); + defer props_decomp.deinit(); + var props_reader = props_decomp.reader(); + + const stage_1_len: u16 = try props_reader.readInt(u16, endian); + self.props_s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.props_s1); + for (0..stage_1_len) |i| self.props_s1[i] = try props_reader.readInt(u16, endian); + + const stage_2_len: u16 = try props_reader.readInt(u16, endian); + self.props_s2 = try allocator.alloc(u8, stage_2_len); + errdefer allocator.free(self.props_s2); + _ = try props_reader.readAll(self.props_s2); + + return self; +} + +pub fn deinit(self: *const Self) void { + self.allocator.free(self.core_s1); + self.allocator.free(self.core_s2); + self.allocator.free(self.props_s1); + self.allocator.free(self.props_s2); +} + +/// True if `cp` is a mathematical symbol. +pub inline fn isMath(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +} + +/// True if `cp` is an alphabetic character. +pub inline fn isAlphabetic(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +} + +/// True if `cp` is a valid identifier start character. +pub inline fn isIdStart(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +} + +/// True if `cp` is a valid identifier continuation character. +pub inline fn isIdContinue(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; +} + +/// True if `cp` is a valid extended identifier start character. +pub inline fn isXidStart(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; +} + +/// True if `cp` is a valid extended identifier continuation character. 
+pub inline fn isXidContinue(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; +} + +/// True if `cp` is a whitespace character. +pub inline fn isWhitespace(self: Self, cp: u21) bool { + return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +} + +/// True if `cp` is a hexadecimal digit. +pub inline fn isHexDigit(self: Self, cp: u21) bool { + return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +} + +/// True if `cp` is a diacritic mark. +pub inline fn isDiacritic(self: Self, cp: u21) bool { + return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +} + +test "Props" { + const self = try init(testing.allocator); + defer self.deinit(); + + try testing.expect(self.isHexDigit('F')); + try testing.expect(self.isHexDigit('a')); + try testing.expect(self.isHexDigit('8')); + try testing.expect(!self.isHexDigit('z')); + + try testing.expect(self.isDiacritic('\u{301}')); + try testing.expect(self.isAlphabetic('A')); + try testing.expect(!self.isAlphabetic('3')); + try testing.expect(self.isMath('+')); +} diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig index ac1c46a..4e371bf 100644 --- a/src/ScriptsData.zig +++ b/src/ScriptsData.zig @@ -193,14 +193,17 @@ pub fn init(allocator: mem.Allocator) !Self { const s1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(self.s1); for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const s2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(u8, s2_len); + errdefer allocator.free(self.s2); _ = try reader.readAll(self.s2); const s3_len: u16 = try reader.readInt(u8, endian); self.s3 = try allocator.alloc(u8, s3_len); + errdefer allocator.free(self.s3); _ = try reader.readAll(self.s3); return self; diff --git a/src/WidthData.zig b/src/WidthData.zig index d17f0cd..b9ef84e 100644 --- a/src/WidthData.zig +++ b/src/WidthData.zig @@ -27,13 +27,16 @@ 
pub fn init(allocator: mem.Allocator) !Self { .allocator = allocator, .g_data = try GraphemeData.init(allocator), }; + errdefer self.g_data.deinit(); const stage_1_len: u16 = try reader.readInt(u16, endian); self.s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.s1); for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); const stage_2_len: u16 = try reader.readInt(u16, endian); self.s2 = try allocator.alloc(i3, stage_2_len); + errdefer allocator.free(self.s2); for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian)); return self; -- cgit v1.2.3