From ba5d9081b479e95ffa7f3baf751beedd370cec14 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 18:01:36 -0500 Subject: Normalization and case folding Both of which deserve some further attention. --- codegen/ccc.zig | 24 ++++++++--- codegen/compat.zig | 121 ++++++++++++++++++++++++++++++++++++++++++++--------- codegen/fold.zig | 57 ++++++++++++++++--------- codegen/hangul.zig | 24 ++++++++--- codegen/normp.zig | 24 ++++++++--- 5 files changed, 193 insertions(+), 57 deletions(-) (limited to 'codegen') diff --git a/codegen/ccc.zig b/codegen/ccc.zig index 4e470ae..e76222f 100644 --- a/codegen/ccc.zig +++ b/codegen/ccc.zig @@ -112,12 +112,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - try writer.interface.writeAll(stage2.items); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/compat.zig b/codegen/compat.zig index debb83d..a9d1f92 100644 --- a/codegen/compat.zig +++ b/codegen/compat.zig @@ -1,58 +1,82 @@ const std = @import("std"); const builtin = @import("builtin"); +const block_size = 256; +const Block = [block_size][]const u21; + +const BlockMap = std.HashMap( + Block, + u16, + struct { + pub fn hash(_: @This(), k: Block) u64 { + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool { + return for (aBlock, bBlock) |a, b| { + if (a.len != b.len) return false; + for (a, b) |a_cp, b_cp| { + if (a_cp != b_cp) return false; + } + } else true; + } + }, + std.hash_map.default_max_load_percentage, +); + pub fn main() anyerror!void { var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); defer arena.deinit(); const allocator = arena.allocator(); // Process UnicodeData.txt - var write_buf: [4096]u8 = undefined; - var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); var args_iter = try std.process.argsWithAllocator(allocator); defer args_iter.deinit(); _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - var out_file = try std.fs.cwd().createFile(output_path, .{}); - defer out_file.close(); - var writer = out_file.writer(&write_buf); + var compat_map = std.AutoHashMap(u21, []u21).init(allocator); + defer compat_map.deinit(); - const endian = builtin.cpu.arch.endian(); - - lines: while (in_reader.takeDelimiterInclusive('\n')) |took| { - 
const line = std.mem.trimRight(u8, took, "\n"); + while (in_reader.takeDelimiterInclusive('\n')) |line| { if (line.len == 0) continue; var field_iter = std.mem.splitScalar(u8, line, ';'); - var cps: [19]u24 = undefined; - var len: u8 = 1; + var cp: u21 = undefined; var i: usize = 0; while (field_iter.next()) |field| : (i += 1) { + if (field.len == 0) continue; + switch (i) { - 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), + 0 => { + cp = try std.fmt.parseInt(u21, field, 16); + }, 5 => { // Not compatibility. - if (field.len == 0 or field[0] != '<') continue :lines; + if (field[0] != '<') continue; + var cp_iter = std.mem.tokenizeScalar(u8, field, ' '); _ = cp_iter.next(); // <compat type> + var cps: [18]u21 = undefined; + var len: u8 = 0; + while (cp_iter.next()) |cp_str| : (len += 1) { - cps[len] = try std.fmt.parseInt(u24, cp_str, 16); + cps[len] = try std.fmt.parseInt(u21, cp_str, 16); } - }, - 2 => if (line[0] == '<') continue :lines, + const slice = try allocator.dupe(u21, cps[0..len]); + try compat_map.put(cp, slice); + }, else => {}, } } - - try writer.interface.writeInt(u8, @intCast(len), endian); - for (cps[0..len]) |cp| try writer.interface.writeInt(u24, cp, endian); } else |err| switch (err) { error.EndOfStream => {}, else => { @@ -60,6 +84,63 @@ pub fn main() anyerror!void { }, } - try writer.interface.writeInt(u16, 0, endian); + // Build multi-tiered lookup tables for compatibility decompositions + var blocks_map = BlockMap.init(allocator); + defer blocks_map.deinit(); + + var stage1 = std.array_list.Managed(u16).init(allocator); + defer stage1.deinit(); + + var stage2 = std.array_list.Managed([]const u21).init(allocator); + defer stage2.deinit(); + + var block: Block = [_][]const u21{&[_]u21{}} ** block_size; + var block_len: u16 = 0; + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + const compat: []const u21 = compat_map.get(cp) orelse &[_]u21{}; + + block[block_len] = compat; + block_len += 1; + + if (block_len < block_size and cp != 0x10ffff) 
continue; + + const gop = try blocks_map.getOrPut(block); + if (!gop.found_existing) { + gop.value_ptr.* = @intCast(stage2.items.len); + try stage2.appendSlice(&block); + } + + try stage1.append(gop.value_ptr.*); + block_len = 0; + } + // Write out + var write_buf: [4096]u8 = undefined; + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + var writer = out_file.writer(&write_buf); + + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}][]const u21 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| { + try writer.interface.print("&.{any}, ", .{entry}); + } + + try writer.interface.writeAll( + \\}; + ); + try writer.interface.flush(); } diff --git a/codegen/fold.zig b/codegen/fold.zig index 366ed79..c5f54eb 100644 --- a/codegen/fold.zig +++ b/codegen/fold.zig @@ -228,26 +228,45 @@ pub fn main() anyerror!void { var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); var writer = out_file.writer(&write_buf); - - const endian = builtin.cpu.arch.endian(); // Table metadata. 
- try writer.interface.writeInt(u24, @intCast(codepoint_cutoff), endian); - try writer.interface.writeInt(u24, @intCast(multiple_codepoint_start), endian); - // Stage 1 - try writer.interface.writeInt(u16, @intCast(meaningful_stage1.len), endian); - try writer.interface.writeAll(meaningful_stage1); - // Stage 2 - try writer.interface.writeInt(u16, @intCast(stage2.len), endian); - try writer.interface.writeAll(stage2); - // Stage 3 - try writer.interface.writeInt(u16, @intCast(stage3.len), endian); - for (stage3) |offset| try writer.interface.writeInt(i24, offset, endian); - // Changes when case folded - // Min and max - try writer.interface.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian); - try writer.interface.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian); - try writer.interface.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian); - for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.writeInt(u24, cp, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const cutoff: u21 = {}; + \\pub const cwcf_exceptions_min: u21 = {}; + \\pub const cwcf_exceptions_max: u21 = {}; + \\pub const cwcf_exceptions: [{}]u21 = .{{ + , .{ codepoint_cutoff, std.mem.min(u21, changes_when_casefolded_exceptions.items), std.mem.max(u21, changes_when_casefolded_exceptions.items), changes_when_casefolded_exceptions.items.len }); + for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.print("{}, ", .{cp}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const multiple_start: u21 = {}; + \\pub const stage1: [{}]u8 = .{{ + , .{ multiple_codepoint_start, meaningful_stage1.len }); + for (meaningful_stage1) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const stage2: [{}]u8 = .{{ + , .{stage2.len}); + for (stage2) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const stage3: [{}]i24 = .{{ + , .{stage3.len}); + for (stage3) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/hangul.zig b/codegen/hangul.zig index 2e4c175..d7504a9 100644 --- a/codegen/hangul.zig +++ b/codegen/hangul.zig @@ -120,12 +120,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. 
+ \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u3 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/normp.zig b/codegen/normp.zig index eaf6989..343f03e 100644 --- a/codegen/normp.zig +++ b/codegen/normp.zig @@ -121,12 +121,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u3 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } -- cgit v1.2.3