From e476250ea9326b2550847b301c265115ff375a31 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 18:36:18 -0500 Subject: Rest of the 'easy' stuff This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it. --- build.zig | 57 ++++---------- codegen/case.zig | 145 ++++++++++++++++++++++++++++++++++ codegen/core_props.zig | 30 ++++--- codegen/gencat.zig | 37 ++++++--- codegen/numeric.zig | 24 ++++-- codegen/props.zig | 23 ++++-- codegen/scripts.zig | 31 ++++++-- src/GeneralCategories.zig | 102 +++++++----------------- src/LetterCasing.zig | 179 ++++++++++++------------------------------ src/Properties.zig | 195 +++++++++++++++------------------------------- src/Scripts.zig | 82 +++++-------------- 11 files changed, 430 insertions(+), 475 deletions(-) create mode 100644 codegen/case.zig diff --git a/build.zig b/build.zig index fa8c490..be91f50 100644 --- a/build.zig +++ b/build.zig @@ -151,7 +151,7 @@ pub fn build(b: *std.Build) void { }); gencat_gen_exe.root_module.addAnonymousImport("DerivedGeneralCategory.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedGeneralCategory.txt") }); const run_gencat_gen_exe = b.addRunArtifact(gencat_gen_exe); - const gencat_gen_out = run_gencat_gen_exe.addOutputFileArg("gencat.bin.z"); + const gencat_gen_out = run_gencat_gen_exe.addOutputFileArg("gencat.zig"); const fold_gen_exe = b.addExecutable(.{ .name = "fold", @@ -177,47 +177,21 @@ pub fn build(b: *std.Build) void { }); num_gen_exe.root_module.addAnonymousImport("DerivedNumericType.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedNumericType.txt") }); const run_num_gen_exe = b.addRunArtifact(num_gen_exe); - const num_gen_out = run_num_gen_exe.addOutputFileArg("numeric.bin.z"); + const num_gen_out = run_num_gen_exe.addOutputFileArg("numeric.zig"); - // Letter case 
properties - const case_prop_gen_exe = b.addExecutable(.{ - .name = "case_prop", + // Case mappings + const case_gen_exe = b.addExecutable(.{ + .name = "case", .root_module = b.createModule(.{ - .root_source_file = b.path("codegen/case_prop.zig"), + .root_source_file = b.path("codegen/case.zig"), .target = b.graph.host, .optimize = .Debug, }), }); - case_prop_gen_exe.root_module.addAnonymousImport("DerivedCoreProperties.txt", .{ .root_source_file = b.path("data/unicode/DerivedCoreProperties.txt") }); - const run_case_prop_gen_exe = b.addRunArtifact(case_prop_gen_exe); - const case_prop_gen_out = run_case_prop_gen_exe.addOutputFileArg("case_prop.bin.z"); - - // Uppercase mappings - const upper_gen_exe = b.addExecutable(.{ - .name = "upper", - .root_module = b.createModule(.{ - .root_source_file = b.path("codegen/upper.zig"), - .target = b.graph.host, - .optimize = .Debug, - }), - }); - upper_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); - const run_upper_gen_exe = b.addRunArtifact(upper_gen_exe); - const upper_gen_out = run_upper_gen_exe.addOutputFileArg("upper.bin.z"); - - // Lowercase mappings - const lower_gen_exe = b.addExecutable(.{ - .name = "lower", - .root_module = b.createModule(.{ - .root_source_file = b.path("codegen/lower.zig"), - .target = b.graph.host, - .optimize = .Debug, - }), - }); - lower_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); - const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe); - const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z"); + case_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); + const run_case_gen_exe = b.addRunArtifact(case_gen_exe); + const case_gen_out = run_case_gen_exe.addOutputFileArg("case.zig"); const scripts_gen_exe = b.addExecutable(.{ .name = "scripts", 
.root_module = b.createModule(.{ @@ -226,9 +200,10 @@ pub fn build(b: *std.Build) void { .optimize = .Debug, }), }); + scripts_gen_exe.root_module.addAnonymousImport("Scripts.txt", .{ .root_source_file = b.path("data/unicode/Scripts.txt") }); const run_scripts_gen_exe = b.addRunArtifact(scripts_gen_exe); - const scripts_gen_out = run_scripts_gen_exe.addOutputFileArg("scripts.bin.z"); + const scripts_gen_out = run_scripts_gen_exe.addOutputFileArg("scripts.zig"); const core_gen_exe = b.addExecutable(.{ .name = "core", @@ -240,7 +215,7 @@ pub fn build(b: *std.Build) void { }); core_gen_exe.root_module.addAnonymousImport("DerivedCoreProperties.txt", .{ .root_source_file = b.path("data/unicode/DerivedCoreProperties.txt") }); const run_core_gen_exe = b.addRunArtifact(core_gen_exe); - const core_gen_out = run_core_gen_exe.addOutputFileArg("core_props.bin.z"); + const core_gen_out = run_core_gen_exe.addOutputFileArg("core_props.zig"); const props_gen_exe = b.addExecutable(.{ .name = "props", @@ -253,7 +228,7 @@ pub fn build(b: *std.Build) void { props_gen_exe.root_module.addAnonymousImport("PropList.txt", .{ .root_source_file = b.path("data/unicode/PropList.txt") }); const run_props_gen_exe = b.addRunArtifact(props_gen_exe); - const props_gen_out = run_props_gen_exe.addOutputFileArg("props.bin.z"); + const props_gen_out = run_props_gen_exe.addOutputFileArg("props.zig"); // Modules we provide @@ -457,9 +432,7 @@ pub fn build(b: *std.Build) void { .optimize = optimize, }); letter_case.addImport("code_point", code_point); - letter_case.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); - letter_case.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); - letter_case.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); + letter_case.addAnonymousImport("case", .{ .root_source_file = case_gen_out }); const letter_case_t = b.addTest(.{ .name = "lettercase", @@ -473,7 +446,7 @@ pub fn build(b: *std.Build) void { .target = 
target, .optimize = optimize, }); - scripts.addAnonymousImport("scripts", .{ .root_source_file = scripts_gen_out }); + scripts.addAnonymousImport("script", .{ .root_source_file = scripts_gen_out }); const scripts_t = b.addTest(.{ .name = "scripts", diff --git a/codegen/case.zig b/codegen/case.zig new file mode 100644 index 0000000..9dffc7c --- /dev/null +++ b/codegen/case.zig @@ -0,0 +1,145 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +const block_size = 256; +const Block = [block_size]u44; + +comptime { + if (@bitSizeOf(u44) != 2 * @bitSizeOf(u21) + 2) { + @compileError("u44 doesn't have expected bit size."); + } +} + +const BlockMap = std.HashMap( + Block, + u16, + struct { + pub fn hash(_: @This(), k: Block) u64 { + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(_: @This(), a: Block, b: Block) bool { + return std.mem.eql(u44, &a, &b); + } + }, + std.hash_map.default_max_load_percentage, +); + +pub fn main() !void { + var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena.deinit(); + const allocator = arena.allocator(); + + var lower_map = std.AutoHashMap(u21, u21).init(allocator); + defer lower_map.deinit(); + + var upper_map = std.AutoHashMap(u21, u21).init(allocator); + defer upper_map.deinit(); + + // Process UnicodeData.txt + + var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); + while (in_reader.takeDelimiterInclusive('\n')) |line| { + if (line.len == 0) continue; + + var field_iter = std.mem.splitScalar(u8, line, ';'); + var cp: u21 = undefined; + + var i: usize = 0; + while (field_iter.next()) |field| : (i += 1) { + if (field.len == 0) continue; + + switch (i) { + 0 => cp = try std.fmt.parseInt(u21, field, 16), + + 12 => { + // Uppercase mapping + try upper_map.put(cp, try std.fmt.parseInt(u21, field, 16)); + }, + + 13 => { + // Lowercase mapping + try lower_map.put(cp, try std.fmt.parseInt(u21, 
field, 16)); + }, + + else => {}, + } + } + } else |err| switch (err) { + error.EndOfStream => {}, + else => { + return err; + }, + } + + var blocks_map = BlockMap.init(allocator); + defer blocks_map.deinit(); + + var stage1 = std.array_list.Managed(u16).init(allocator); + defer stage1.deinit(); + + var stage2 = std.array_list.Managed(u44).init(allocator); + defer stage2.deinit(); + + var block: Block = [_]u44{0} ** block_size; + var block_len: u16 = 0; + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + var case_prop: u44 = 0; + + if (lower_map.get(cp)) |lower| { + case_prop |= @as(u44, lower) << 2 | 1; + } + + if (upper_map.get(cp)) |upper| { + case_prop |= @as(u44, upper) << (2 + 21) | 2; + } + + block[block_len] = case_prop; + block_len += 1; + + if (block_len < block_size and cp != 0x10ffff) continue; + + const gop = try blocks_map.getOrPut(block); + if (!gop.found_existing) { + gop.value_ptr.* = @intCast(stage2.items.len); + try stage2.appendSlice(&block); + } + + try stage1.append(gop.value_ptr.*); + block_len = 0; + } + + var args_iter = try std.process.argsWithAllocator(allocator); + defer args_iter.deinit(); + _ = args_iter.skip(); + const output_path = args_iter.next() orelse @panic("No output file arg!"); + + var write_buf: [4096]u8 = undefined; + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + var writer = out_file.writer(&write_buf); + + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u44 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); + try writer.interface.flush(); +} diff --git a/codegen/core_props.zig b/codegen/core_props.zig index 6ffdf91..99a55e2 100644 --- a/codegen/core_props.zig +++ b/codegen/core_props.zig @@ -120,17 +120,29 @@ pub fn main() anyerror!void { _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - var out_buf: [4096]u8 = undefined; + var write_buf: [4096]u8 = undefined; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); - var writer = out_file.writer(&out_buf); - - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - try writer.interface.writeAll(stage2.items); + var writer = out_file.writer(&write_buf); + + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/gencat.zig b/codegen/gencat.zig index 9800f1d..12c8373 100644 --- a/codegen/gencat.zig +++ b/codegen/gencat.zig @@ -150,21 +150,38 @@ pub fn main() !void { defer args_iter.deinit(); _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - var write_buf: [4096]u8 = undefined; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); - - try writer.interface.writeInt(u8, @intCast(stage3.items.len), endian); - for (stage3.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u5 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s3: [{}]u5 = .{{ + , .{stage3.items.len}); + for (stage3.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); + try writer.interface.flush(); } diff --git a/codegen/numeric.zig b/codegen/numeric.zig index b304349..e7b4861 100644 --- a/codegen/numeric.zig +++ b/codegen/numeric.zig @@ -123,12 +123,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - try writer.interface.writeAll(stage2.items); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/props.zig b/codegen/props.zig index 35c7dfb..ebd5116 100644 --- a/codegen/props.zig +++ b/codegen/props.zig @@ -123,11 +123,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - try writer.interface.writeAll(stage2.items); try writer.interface.flush(); } diff --git a/codegen/scripts.zig b/codegen/scripts.zig index 0f0194c..6bd5866 100644 --- a/codegen/scripts.zig +++ b/codegen/scripts.zig @@ -299,15 +299,32 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); - try writer.interface.writeInt(u8, @intCast(stage3.items.len), endian); - for (stage3.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s3: [{}]u8 = .{{ + , .{stage3.items.len}); + for (stage3.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig index eee7e56..9a383bf 100644 --- a/src/GeneralCategories.zig +++ b/src/GeneralCategories.zig @@ -1,8 +1,19 @@ //! 
General Categories -s1: []u16 = undefined, -s2: []u5 = undefined, -s3: []u5 = undefined, +const Data = struct { + s1: []const u16 = undefined, + s2: []const u5 = undefined, + s3: []const u5 = undefined, +}; + +const general_categories = general_categories: { + const data = @import("gencat"); + break :general_categories Data{ + .s1 = &data.s1, + .s2 = &data.s2, + .s3 = &data.s3, + }; +}; /// General Category pub const Gc = enum { @@ -38,51 +49,14 @@ pub const Gc = enum { Zs, // Separator, Space }; -const GeneralCategories = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories { - var gencat = GeneralCategories{}; - try gencat.setup(allocator); - return gencat; -} - -pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void { - const in_bytes = @embedFile("gencat"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; - gencat.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(gencat.s1); - for (0..s1_len) |i| gencat.s1[i] = reader.readInt(u16, endian) catch unreachable; - - const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; - gencat.s2 = try allocator.alloc(u5, s2_len); - errdefer allocator.free(gencat.s2); - for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable); - - const s3_len: u16 = reader.readInt(u8, endian) catch unreachable; - gencat.s3 = try allocator.alloc(u5, s3_len); - errdefer allocator.free(gencat.s3); - for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable); -} - -pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void { - allocator.free(gencat.s1); - allocator.free(gencat.s2); - allocator.free(gencat.s3); -} - /// Lookup the General Category for `cp`. 
-pub fn gc(gencat: GeneralCategories, cp: u21) Gc { - return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); +pub fn gc(cp: u21) Gc { + return @enumFromInt(general_categories.s3[general_categories.s2[general_categories.s1[cp >> 8] + (cp & 0xff)]]); } /// True if `cp` has an C general category. -pub fn isControl(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isControl(cp: u21) bool { + return switch (gc(cp)) { .Cc, .Cf, .Cn, @@ -94,8 +68,8 @@ pub fn isControl(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an L general category. -pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isLetter(cp: u21) bool { + return switch (gc(cp)) { .Ll, .Lm, .Lo, @@ -107,8 +81,8 @@ pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an M general category. -pub fn isMark(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isMark(cp: u21) bool { + return switch (gc(cp)) { .Mc, .Me, .Mn, @@ -118,8 +92,8 @@ pub fn isMark(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an N general category. -pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isNumber(cp: u21) bool { + return switch (gc(cp)) { .Nd, .Nl, .No, @@ -129,8 +103,8 @@ pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an P general category. -pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isPunctuation(cp: u21) bool { + return switch (gc(cp)) { .Pc, .Pd, .Pe, @@ -144,8 +118,8 @@ pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an S general category. 
-pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isSymbol(cp: u21) bool { + return switch (gc(cp)) { .Sc, .Sk, .Sm, @@ -156,8 +130,8 @@ pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an Z general category. -pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isSeparator(cp: u21) bool { + return switch (gc(cp)) { .Zl, .Zp, .Zs, @@ -165,19 +139,3 @@ pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { else => false, }; } - -fn testAllocator(allocator: Allocator) !void { - var gen_cat = try GeneralCategories.init(allocator); - gen_cat.deinit(allocator); -} - -test "Allocation failure" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); -} - -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; -const Allocator = mem.Allocator; diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig index 33096fc..24b67a0 100644 --- a/src/LetterCasing.zig +++ b/src/LetterCasing.zig @@ -1,120 +1,58 @@ const CodePointIterator = @import("code_point").Iterator; - -case_map: [][2]u21 = undefined, -prop_s1: []u16 = undefined, -prop_s2: []u8 = undefined, - -const LetterCasing = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { - var case = LetterCasing{}; - try case.setup(allocator); - return case; -} - -pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void { - case.setupInner(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } +const GeneralCategories = @import("GeneralCategories"); + +const Data = struct { + s1: []const u16 = undefined, + s2: []const u44 = undefined, +}; + +const letter_casing = letter_casing: { + const data = @import("case"); + break :letter_casing Data{ + .s1 = &data.s1, + .s2 = &data.s2, }; -} - 
-inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void { - const endian = builtin.cpu.arch.endian(); - - self.case_map = try allocator.alloc([2]u21, 0x110000); - errdefer allocator.free(self.case_map); - - for (0..0x110000) |i| { - const cp: u21 = @intCast(i); - self.case_map[cp] = .{ cp, cp }; - } - - // Uppercase - const upper_bytes = @embedFile("upper"); - var upper_fbs = std.io.fixedBufferStream(upper_bytes); - var upper_reader = upper_fbs.reader(); - - while (true) { - const cp = try upper_reader.readInt(i24, endian); - if (cp == 0) break; - const diff = try upper_reader.readInt(i24, endian); - self.case_map[@intCast(cp)][0] = @intCast(cp + diff); - } - - // Lowercase - const lower_bytes = @embedFile("lower"); - var lower_fbs = std.io.fixedBufferStream(lower_bytes); - var lower_reader = lower_fbs.reader(); - - while (true) { - const cp = try lower_reader.readInt(i24, endian); - if (cp == 0) break; - const diff = try lower_reader.readInt(i24, endian); - self.case_map[@intCast(cp)][1] = @intCast(cp + diff); - } - - // Case properties - const cp_bytes = @embedFile("case_prop"); - var cp_fbs = std.io.fixedBufferStream(cp_bytes); - var cp_reader = cp_fbs.reader(); - - const stage_1_len: u16 = try cp_reader.readInt(u16, endian); - self.prop_s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(self.prop_s1); - for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian); - - const stage_2_len: u16 = try cp_reader.readInt(u16, endian); - self.prop_s2 = try allocator.alloc(u8, stage_2_len); - errdefer allocator.free(self.prop_s2); - _ = try cp_reader.readAll(self.prop_s2); -} - -pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void { - allocator.free(self.case_map); - allocator.free(self.prop_s1); - allocator.free(self.prop_s2); -} +}; // Returns true if `cp` is either upper, lower, or title case. 
-pub fn isCased(self: LetterCasing, cp: u21) bool { - return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +pub fn isCased(cp: u21) bool { + return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt; } // Returns true if `cp` is uppercase. -pub fn isUpper(self: LetterCasing, cp: u21) bool { - return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +pub fn isUpper(cp: u21) bool { + // isUpper is true if we have a mapping to a lower character (bit 1) + return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1; } -/// Returns true if `str` is all uppercase. -pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { +/// Returns true if `str` is all non-lowercase. +pub fn isUpperStr(str: []const u8) bool { var iter = CodePointIterator{ .bytes = str }; return while (iter.next()) |cp| { - if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; + if (isLower(cp.code)) break false; } else true; } test "isUpperStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); - try testing.expect(!cd.isUpperStr("hello, world 2112!")); - try testing.expect(!cd.isUpperStr("Hello, World 2112!")); + try testing.expect(isUpperStr("HELLO, WORLD 2112!")); + try testing.expect(!isUpperStr("hello, world 2112!")); + try testing.expect(!isUpperStr("Hello, World 2112!")); } /// Returns uppercase mapping for `cp`. -pub fn toUpper(self: LetterCasing, cp: u21) u21 { - return self.case_map[cp][0]; +pub fn toUpper(cp: u21) u21 { + const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; + if (case_prop & 2 == 2) { + return @intCast(case_prop >> (21 + 2)); + } else { + return cp; + } } /// Returns a new string with all letters in uppercase. /// Caller must free returned bytes with `allocator`. 
pub fn toUpperStr( - self: LetterCasing, allocator: mem.Allocator, str: []const u8, ) ![]u8 { @@ -125,7 +63,7 @@ pub fn toUpperStr( var buf: [4]u8 = undefined; while (iter.next()) |cp| { - const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); + const len = try unicode.utf8Encode(toUpper(cp.code), &buf); try bytes.appendSlice(buf[0..len]); } @@ -133,46 +71,45 @@ pub fn toUpperStr( } test "toUpperStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); + const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!"); defer testing.allocator.free(uppered); try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); } // Returns true if `cp` is lowercase. -pub fn isLower(self: LetterCasing, cp: u21) bool { - return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +pub fn isLower(cp: u21) bool { + // isLower is true if we have a mapping to an upper character (bit 2) + return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2; } -/// Returns true if `str` is all lowercase. -pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { +/// Returns true if `str` is all non-uppercase. +pub fn isLowerStr(str: []const u8) bool { var iter = CodePointIterator{ .bytes = str }; return while (iter.next()) |cp| { - if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; + if (isUpper(cp.code)) break false; } else true; } test "isLowerStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - try testing.expect(cd.isLowerStr("hello, world 2112!")); - try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!")); - try testing.expect(!cd.isLowerStr("Hello, World 2112!")); + try testing.expect(isLowerStr("hello, world 2112!")); + try testing.expect(!isLowerStr("HELLO, WORLD 2112!")); + try testing.expect(!isLowerStr("Hello, World 2112!")); } /// Returns lowercase mapping for `cp`. 
-pub fn toLower(self: LetterCasing, cp: u21) u21 { - return self.case_map[cp][1]; +pub fn toLower(cp: u21) u21 { + const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; + if (case_prop & 1 == 1) { + return @intCast((case_prop >> 2) & 0x1FFFFF); + } else { + return cp; + } } /// Returns a new string with all letters in lowercase. /// Caller must free returned bytes with `allocator`. pub fn toLowerStr( - self: LetterCasing, allocator: mem.Allocator, str: []const u8, ) ![]u8 { @@ -183,7 +120,7 @@ pub fn toLowerStr( var buf: [4]u8 = undefined; while (iter.next()) |cp| { - const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); + const len = try unicode.utf8Encode(toLower(cp.code), &buf); try bytes.appendSlice(buf[0..len]); } @@ -191,27 +128,13 @@ pub fn toLowerStr( } test "toLowerStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); + const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!"); defer testing.allocator.free(lowered); try testing.expectEqualStrings("hello, world 2112!", lowered); } -fn testAllocator(allocator: Allocator) !void { - var prop = try LetterCasing.init(allocator); - prop.deinit(allocator); -} - -test "Allocation failure" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); -} - const std = @import("std"); const builtin = @import("builtin"); -const compress = std.compress; const mem = std.mem; -const Allocator = std.mem.Allocator; const testing = std.testing; const unicode = std.unicode; diff --git a/src/Properties.zig b/src/Properties.zig index 432d176..f8c7cfc 100644 --- a/src/Properties.zig +++ b/src/Properties.zig @@ -1,177 +1,108 @@ //! 
Properties module -core_s1: []u16 = undefined, -core_s2: []u8 = undefined, -props_s1: []u16 = undefined, -props_s2: []u8 = undefined, -num_s1: []u16 = undefined, -num_s2: []u8 = undefined, - -const Properties = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!Properties { - var props = Properties{}; - try props.setup(allocator); - return props; -} - -pub fn setup(props: *Properties, allocator: Allocator) Allocator.Error!void { - props.setupInner(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } +const Data = struct { + core_s1: []const u16 = undefined, + core_s2: []const u8 = undefined, + props_s1: []const u16 = undefined, + props_s2: []const u8 = undefined, + num_s1: []const u16 = undefined, + num_s2: []const u8 = undefined, +}; + +const properties = properties: { + const core_props = @import("core_props"); + const props_data = @import("props"); + const numeric = @import("numeric"); + break :properties Data{ + .core_s1 = &core_props.s1, + .core_s2 = &core_props.s2, + .props_s1 = &props_data.s1, + .props_s2 = &props_data.s2, + .num_s1 = &numeric.s1, + .num_s2 = &numeric.s2, }; -} - -inline fn setupInner(props: *Properties, allocator: Allocator) !void { - const endian = builtin.cpu.arch.endian(); - - // Process DerivedCoreProperties.txt - const core_bytes = @embedFile("core_props"); - var core_fbs = std.io.fixedBufferStream(core_bytes); - var core_reader = core_fbs.reader(); - - const core_stage_1_len: u16 = try core_reader.readInt(u16, endian); - props.core_s1 = try allocator.alloc(u16, core_stage_1_len); - errdefer allocator.free(props.core_s1); - for (0..core_stage_1_len) |i| props.core_s1[i] = try core_reader.readInt(u16, endian); - - const core_stage_2_len: u16 = try core_reader.readInt(u16, endian); - props.core_s2 = try allocator.alloc(u8, core_stage_2_len); - errdefer allocator.free(props.core_s2); - _ = try core_reader.readAll(props.core_s2); - - // Process PropList.txt - const props_bytes 
= @embedFile("props"); - var props_fbs = std.io.fixedBufferStream(props_bytes); - var props_reader = props_fbs.reader(); - - const stage_1_len: u16 = try props_reader.readInt(u16, endian); - props.props_s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(props.props_s1); - for (0..stage_1_len) |i| props.props_s1[i] = try props_reader.readInt(u16, endian); - - const stage_2_len: u16 = try props_reader.readInt(u16, endian); - props.props_s2 = try allocator.alloc(u8, stage_2_len); - errdefer allocator.free(props.props_s2); - _ = try props_reader.readAll(props.props_s2); - - // Process DerivedNumericType.txt - const num_bytes = @embedFile("numeric"); - var num_fbs = std.io.fixedBufferStream(num_bytes); - var num_reader = num_fbs.reader(); - - const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); - props.num_s1 = try allocator.alloc(u16, num_stage_1_len); - errdefer allocator.free(props.num_s1); - for (0..num_stage_1_len) |i| props.num_s1[i] = try num_reader.readInt(u16, endian); - - const num_stage_2_len: u16 = try num_reader.readInt(u16, endian); - props.num_s2 = try allocator.alloc(u8, num_stage_2_len); - errdefer allocator.free(props.num_s2); - _ = try num_reader.readAll(props.num_s2); -} +}; -pub fn deinit(self: *const Properties, allocator: Allocator) void { - allocator.free(self.core_s1); - allocator.free(self.core_s2); - allocator.free(self.props_s1); - allocator.free(self.props_s2); - allocator.free(self.num_s1); - allocator.free(self.num_s2); -} +const Properties = @This(); /// True if `cp` is a mathematical symbol. -pub fn isMath(self: Properties, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +pub fn isMath(cp: u21) bool { + return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; } /// True if `cp` is an alphabetic character. 
-pub fn isAlphabetic(self: Properties, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +pub fn isAlphabetic(cp: u21) bool { + return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; } /// True if `cp` is a valid identifier start character. -pub fn isIdStart(self: Properties, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +pub fn isIdStart(cp: u21) bool { + return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; } /// True if `cp` is a valid identifier continuation character. -pub fn isIdContinue(self: Properties, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; +pub fn isIdContinue(cp: u21) bool { + return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; } /// True if `cp` is a valid extended identifier start character. -pub fn isXidStart(self: Properties, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; +pub fn isXidStart(cp: u21) bool { + return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; } /// True if `cp` is a valid extended identifier continuation character. -pub fn isXidContinue(self: Properties, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; +pub fn isXidContinue(cp: u21) bool { + return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; } /// True if `cp` is a whitespace character. -pub fn isWhitespace(self: Properties, cp: u21) bool { - return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +pub fn isWhitespace(cp: u21) bool { + return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; } /// True if `cp` is a hexadecimal digit. 
-pub fn isHexDigit(self: Properties, cp: u21) bool { - return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +pub fn isHexDigit(cp: u21) bool { + return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; } /// True if `cp` is a diacritic mark. -pub fn isDiacritic(self: Properties, cp: u21) bool { - return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +pub fn isDiacritic(cp: u21) bool { + return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; } /// True if `cp` is numeric. -pub fn isNumeric(self: Properties, cp: u21) bool { - return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +pub fn isNumeric(cp: u21) bool { + return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; } /// True if `cp` is a digit. -pub fn isDigit(self: Properties, cp: u21) bool { - return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +pub fn isDigit(cp: u21) bool { + return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; } /// True if `cp` is decimal. 
-pub fn isDecimal(self: Properties, cp: u21) bool { - return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +pub fn isDecimal(cp: u21) bool { + return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; } test "Props" { - const self = try init(testing.allocator); - defer self.deinit(testing.allocator); - - try testing.expect(self.isHexDigit('F')); - try testing.expect(self.isHexDigit('a')); - try testing.expect(self.isHexDigit('8')); - try testing.expect(!self.isHexDigit('z')); - - try testing.expect(self.isDiacritic('\u{301}')); - try testing.expect(self.isAlphabetic('A')); - try testing.expect(!self.isAlphabetic('3')); - try testing.expect(self.isMath('+')); - - try testing.expect(self.isNumeric('\u{277f}')); - try testing.expect(self.isDigit('\u{2070}')); - try testing.expect(self.isDecimal('3')); - - try testing.expect(!self.isNumeric('1')); - try testing.expect(!self.isDigit('2')); - try testing.expect(!self.isDecimal('g')); -} - -fn testAllocator(allocator: Allocator) !void { - var prop = try Properties.init(allocator); - prop.deinit(allocator); -} - -test "Allocation failure" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); + try testing.expect(Properties.isHexDigit('F')); + try testing.expect(Properties.isHexDigit('a')); + try testing.expect(Properties.isHexDigit('8')); + try testing.expect(!Properties.isHexDigit('z')); + + try testing.expect(Properties.isDiacritic('\u{301}')); + try testing.expect(Properties.isAlphabetic('A')); + try testing.expect(!Properties.isAlphabetic('3')); + try testing.expect(Properties.isMath('+')); + + try testing.expect(Properties.isNumeric('\u{277f}')); + try testing.expect(Properties.isDigit('\u{2070}')); + try testing.expect(Properties.isDecimal('3')); + + try testing.expect(!Properties.isNumeric('1')); + try testing.expect(!Properties.isDigit('2')); + try testing.expect(!Properties.isDecimal('g')); } const std = @import("std"); diff --git a/src/Scripts.zig 
b/src/Scripts.zig index 719b01f..4938318 100644 --- a/src/Scripts.zig +++ b/src/Scripts.zig @@ -1,8 +1,18 @@ //! Scripts Module +const Data = struct { + s1: []const u16 = undefined, + s2: []const u8 = undefined, + s3: []const u8 = undefined, +}; -s1: []u16 = undefined, -s2: []u8 = undefined, -s3: []u8 = undefined, +const scripts = scripts: { + const data = @import("script"); + break :scripts Data{ + .s1 = &data.s1, + .s2 = &data.s2, + .s3 = &data.s3, + }; +}; /// Scripts enum pub const Script = enum { @@ -178,76 +188,20 @@ pub const Script = enum { Yi, Zanabazar_Square, }; -const Scripts = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!Scripts { - var scripts = Scripts{}; - try scripts.setup(allocator); - return scripts; -} - -pub fn setup(scripts: *Scripts, allocator: Allocator) Allocator.Error!void { - scripts.setupInner(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; -} - -inline fn setupInner(scripts: *Scripts, allocator: mem.Allocator) !void { - const in_bytes = @embedFile("scripts"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - const s1_len: u16 = try reader.readInt(u16, endian); - scripts.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(scripts.s1); - for (0..s1_len) |i| scripts.s1[i] = try reader.readInt(u16, endian); - - const s2_len: u16 = try reader.readInt(u16, endian); - scripts.s2 = try allocator.alloc(u8, s2_len); - errdefer allocator.free(scripts.s2); - _ = try reader.readAll(scripts.s2); - - const s3_len: u16 = try reader.readInt(u8, endian); - scripts.s3 = try allocator.alloc(u8, s3_len); - errdefer allocator.free(scripts.s3); - _ = try reader.readAll(scripts.s3); -} - -pub fn deinit(self: *const Scripts, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); - allocator.free(self.s3); -} /// Lookup the Script type for `cp`. 
-pub fn script(self: Scripts, cp: u21) ?Script { - const byte = self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]; +pub fn script(cp: u21) ?Script { + const byte = scripts.s3[scripts.s2[scripts.s1[cp >> 8] + (cp & 0xff)]]; if (byte == 0) return null; return @enumFromInt(byte); } test "script" { - const self = try init(std.testing.allocator); - defer self.deinit(std.testing.allocator); - try testing.expectEqual(Script.Latin, self.script('A').?); -} - -fn testAllocator(allocator: Allocator) !void { - var prop = try Scripts.init(allocator); - prop.deinit(allocator); -} - -test "Allocation failure" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); + try testing.expectEqual(Script.Latin, script('A').?); + // try testing.expectEqual(Script.Deseret, script('𐐌').?); } const std = @import("std"); const builtin = @import("builtin"); -const mem = std.mem; -const Allocator = mem.Allocator; +const unicode = std.unicode; const testing = std.testing; -- cgit v1.2.3