From e476250ea9326b2550847b301c265115ff375a31 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Wed, 4 Feb 2026 18:36:18 -0500
Subject: Rest of the 'easy' stuff

This gets us up to feature parity with Jacob's work. I want to eliminate
that last allocation using the comptime hash map, and then see about
eliminating allocations from case comparisons as well. That should just
about do it.
---
 codegen/case.zig       | 162 +++++++++++++++++++++++++++++++++++++++++++++++++
 codegen/core_props.zig |  30 +++++++---
 codegen/gencat.zig     |  37 +++++++++----
 codegen/numeric.zig    |  24 ++++++--
 codegen/props.zig      |  23 ++++++--
 codegen/scripts.zig    |  31 ++++++++---
 6 files changed, 270 insertions(+), 37 deletions(-)
 create mode 100644 codegen/case.zig

(limited to 'codegen')

diff --git a/codegen/case.zig b/codegen/case.zig
new file mode 100644
index 0000000..9dffc7c
--- /dev/null
+++ b/codegen/case.zig
@@ -0,0 +1,162 @@
+//! Generates the two-stage case-mapping lookup tables from UnicodeData.txt.
+//!
+//! Each table entry is a u44 laid out as
+//! `upper << 23 | lower << 2 | flags`: flag bit 0 is set when a lowercase
+//! mapping exists and flag bit 1 when an uppercase mapping exists.
+
+const std = @import("std");
+const builtin = @import("builtin");
+
+/// Number of code points covered by one stage-2 block.
+const block_size = 256;
+const Block = [block_size]u44;
+
+comptime {
+    if (@bitSizeOf(u44) != 2 * @bitSizeOf(u21) + 2) {
+        @compileError("u44 doesn't have expected bit size.");
+    }
+}
+
+/// Maps the contents of a block to the offset at which that block was first
+/// stored in stage 2, so identical blocks are emitted only once.
+const BlockMap = std.HashMap(
+    Block,
+    u16,
+    struct {
+        pub fn hash(_: @This(), k: Block) u64 {
+            var hasher = std.hash.Wyhash.init(0);
+            std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
+            return hasher.final();
+        }
+
+        pub fn eql(_: @This(), a: Block, b: Block) bool {
+            return std.mem.eql(u44, &a, &b);
+        }
+    },
+    std.hash_map.default_max_load_percentage,
+);
+
+/// Parses the embedded UnicodeData.txt, builds the deduplicated two-stage
+/// tables, and writes them as Zig source to the path given as the first
+/// command-line argument. Panics when that argument is missing.
+pub fn main() !void {
+    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena.deinit();
+    const allocator = arena.allocator();
+
+    var lower_map = std.AutoHashMap(u21, u21).init(allocator);
+    defer lower_map.deinit();
+
+    var upper_map = std.AutoHashMap(u21, u21).init(allocator);
+    defer upper_map.deinit();
+
+    // Process UnicodeData.txt
+
+    var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt"));
+    while (in_reader.takeDelimiterInclusive('\n')) |raw_line| {
+        // The delimiter is part of `raw_line`; strip the line ending so the
+        // blank-line guard below can actually trigger.
+        const line = std.mem.trim(u8, raw_line, "\r\n");
+        if (line.len == 0) continue;
+
+        var field_iter = std.mem.splitScalar(u8, line, ';');
+        // Field 0 is the code point. Parsing it up front means `cp` is never
+        // read while still undefined.
+        const cp = try std.fmt.parseInt(u21, field_iter.first(), 16);
+
+        var i: usize = 1;
+        while (field_iter.next()) |field| : (i += 1) {
+            if (field.len == 0) continue;
+
+            switch (i) {
+                12 => {
+                    // Uppercase mapping
+                    try upper_map.put(cp, try std.fmt.parseInt(u21, field, 16));
+                },
+
+                13 => {
+                    // Lowercase mapping
+                    try lower_map.put(cp, try std.fmt.parseInt(u21, field, 16));
+                },
+
+                else => {},
+            }
+        }
+    } else |err| switch (err) {
+        error.EndOfStream => {},
+        else => {
+            return err;
+        },
+    }
+
+    var blocks_map = BlockMap.init(allocator);
+    defer blocks_map.deinit();
+
+    var stage1 = std.array_list.Managed(u16).init(allocator);
+    defer stage1.deinit();
+
+    var stage2 = std.array_list.Managed(u44).init(allocator);
+    defer stage2.deinit();
+
+    var block: Block = [_]u44{0} ** block_size;
+    var block_len: u16 = 0;
+
+    for (0..0x110000) |i| {
+        const cp: u21 = @intCast(i);
+        var case_prop: u44 = 0;
+
+        if (lower_map.get(cp)) |lower| {
+            case_prop |= @as(u44, lower) << 2 | 1;
+        }
+
+        if (upper_map.get(cp)) |upper| {
+            case_prop |= @as(u44, upper) << (2 + 21) | 2;
+        }
+
+        block[block_len] = case_prop;
+        block_len += 1;
+
+        // Keep filling until the block is full or the last code point is in.
+        if (block_len < block_size and cp != 0x10ffff) continue;
+
+        // Only blocks with previously unseen contents are appended to stage 2.
+        const gop = try blocks_map.getOrPut(block);
+        if (!gop.found_existing) {
+            gop.value_ptr.* = @intCast(stage2.items.len);
+            try stage2.appendSlice(&block);
+        }
+
+        try stage1.append(gop.value_ptr.*);
+        block_len = 0;
+    }
+
+    var args_iter = try std.process.argsWithAllocator(allocator);
+    defer args_iter.deinit();
+    _ = args_iter.skip();
+    const output_path = args_iter.next() orelse @panic("No output file arg!");
+
+    var write_buf: [4096]u8 = undefined;
+    var out_file = try std.fs.cwd().createFile(output_path, .{});
+    defer out_file.close();
+    var writer = out_file.writer(&write_buf);
+
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+        \\
+        \\pub const s1: [{}]u16 = .{{
+    , .{stage1.items.len});
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const s2: [{}]u44 = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.writeAll(
+        \\};
+    );
+    try writer.interface.flush();
+}
diff --git a/codegen/core_props.zig b/codegen/core_props.zig
index 6ffdf91..99a55e2 100644
--- a/codegen/core_props.zig
+++ b/codegen/core_props.zig
@@ -120,17 +120,29 @@ pub fn main() anyerror!void {
     _ = args_iter.skip();
     const output_path = args_iter.next() orelse @panic("No output file arg!");
 
-    var out_buf: [4096]u8 = undefined;
+    var write_buf: [4096]u8 = undefined;
     var out_file = try std.fs.cwd().createFile(output_path, .{});
     defer out_file.close();
-    var writer = out_file.writer(&out_buf);
-
-    const endian = builtin.cpu.arch.endian();
-    try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian);
-    for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian);
-
-    try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian);
-    try writer.interface.writeAll(stage2.items);
+    var writer = out_file.writer(&write_buf);
+
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/gencat.zig b/codegen/gencat.zig index 9800f1d..12c8373 100644 --- a/codegen/gencat.zig +++ b/codegen/gencat.zig @@ -150,21 +150,38 @@ pub fn main() !void { defer args_iter.deinit(); _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - var write_buf: [4096]u8 = undefined; var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); - - try writer.interface.writeInt(u8, @intCast(stage3.items.len), endian); - for (stage3.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+        \\
+        \\pub const s1: [{}]u16 = .{{
+    , .{stage1.items.len});
+    for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const stage2: [{}]u5 = .{{
+    , .{stage2.items.len});
+    for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.print(
+        \\
+        \\}};
+        \\
+        \\pub const stage3: [{}]u5 = .{{
+    , .{stage3.items.len});
+    for (stage3.items) |entry| try writer.interface.print("{}, ", .{entry});
+
+    try writer.interface.writeAll(
+        \\};
+    );
 
     try writer.interface.flush();
+    try writer.interface.flush();
 }
diff --git a/codegen/numeric.zig b/codegen/numeric.zig
index b304349..e7b4861 100644
--- a/codegen/numeric.zig
+++ b/codegen/numeric.zig
@@ -123,12 +123,24 @@ pub fn main() anyerror!void {
     defer out_file.close();
     var writer = out_file.writer(&write_buf);
 
-    const endian = builtin.cpu.arch.endian();
-    try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian);
-    for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian);
-
-    try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian);
-    try writer.interface.writeAll(stage2.items);
+    try writer.interface.print(
+        \\//! This file is auto-generated. Do not edit.
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/props.zig b/codegen/props.zig index 35c7dfb..ebd5116 100644 --- a/codegen/props.zig +++ b/codegen/props.zig @@ -123,11 +123,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - try writer.interface.writeAll(stage2.items); try writer.interface.flush(); } diff --git a/codegen/scripts.zig b/codegen/scripts.zig index 0f0194c..6bd5866 100644 --- a/codegen/scripts.zig +++ b/codegen/scripts.zig @@ -299,15 +299,32 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); - try writer.interface.writeInt(u8, @intCast(stage3.items.len), endian); - for (stage3.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s3: [{}]u8 = .{{ + , .{stage3.items.len}); + for (stage3.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } -- cgit v1.2.3