From ba5d9081b479e95ffa7f3baf751beedd370cec14 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 18:01:36 -0500 Subject: Normalization and case folding Both of which deserve some further attention. --- build.zig | 12 ++- codegen/ccc.zig | 24 +++-- codegen/compat.zig | 121 +++++++++++++++++++---- codegen/fold.zig | 57 +++++++---- codegen/hangul.zig | 24 +++-- codegen/normp.zig | 24 +++-- src/CaseFolding.zig | 258 +++++++++++++++++++------------------------------- src/CombiningData.zig | 49 ++++------ src/CompatData.zig | 64 +++---------- src/HangulData.zig | 42 +++----- src/NormPropsData.zig | 46 +++------ src/Normalize.zig | 119 ++++++++--------------- src/comptime_map.zig | 176 ++++++++++++++++++++++++++++++++++ 13 files changed, 571 insertions(+), 445 deletions(-) create mode 100644 src/comptime_map.zig diff --git a/build.zig b/build.zig index 1ae7005..fa8c490 100644 --- a/build.zig +++ b/build.zig @@ -103,7 +103,7 @@ pub fn build(b: *std.Build) void { }); compat_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); const run_compat_gen_exe = b.addRunArtifact(compat_gen_exe); - const compat_gen_out = run_compat_gen_exe.addOutputFileArg("compat.bin.z"); + const compat_gen_out = run_compat_gen_exe.addOutputFileArg("compat.zig"); const hangul_gen_exe = b.addExecutable(.{ .name = "hangul", @@ -115,7 +115,7 @@ pub fn build(b: *std.Build) void { }); hangul_gen_exe.root_module.addAnonymousImport("HangulSyllableType.txt", .{ .root_source_file = b.path("data/unicode/HangulSyllableType.txt") }); const run_hangul_gen_exe = b.addRunArtifact(hangul_gen_exe); - const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.bin.z"); + const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.zig"); const normp_gen_exe = b.addExecutable(.{ .name = "normp", @@ -127,7 +127,7 @@ pub fn build(b: *std.Build) void { }); normp_gen_exe.root_module.addAnonymousImport("DerivedNormalizationProps.txt", .{ .root_source_file = b.path("data/unicode/DerivedNormalizationProps.txt") }); const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); - const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.bin.z"); + const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.zig"); const ccc_gen_exe = b.addExecutable(.{ .name = "ccc", @@ -139,7 +139,7 @@ pub fn build(b: *std.Build) void { }); ccc_gen_exe.root_module.addAnonymousImport("DerivedCombiningClass.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedCombiningClass.txt") }); const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); - const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); + const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.zig"); const gencat_gen_exe = b.addExecutable(.{ .name = "gencat", @@ -164,7 +164,7 @@ pub fn build(b: *std.Build) void { fold_gen_exe.root_module.addAnonymousImport("DerivedCoreProperties.txt", .{ .root_source_file = b.path("data/unicode/DerivedCoreProperties.txt") }); fold_gen_exe.root_module.addAnonymousImport("CaseFolding.txt", .{ .root_source_file = b.path("data/unicode/CaseFolding.txt") }); const run_fold_gen_exe = b.addRunArtifact(fold_gen_exe); - const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.bin.z"); + const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.zig"); // Numeric types const num_gen_exe = b.addExecutable(.{ @@ -518,6 +518,8 @@ pub fn build(b: *std.Build) void { test_unicode_step.dependOn(&run_unicode_tests.step); test_unicode_step.dependOn(&display_width_tr.step); test_unicode_step.dependOn(&words_tr.step); + test_unicode_step.dependOn(&norm_tr.step); + test_unicode_step.dependOn(&case_fold_tr.step); const test_step = b.step("test", "Run all module tests"); test_step.dependOn(&run_unicode_tests.step); diff --git a/codegen/ccc.zig b/codegen/ccc.zig index 4e470ae..e76222f 100644 --- a/codegen/ccc.zig +++ b/codegen/ccc.zig @@ -112,12 +112,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - try writer.interface.writeAll(stage2.items); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u8 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/compat.zig b/codegen/compat.zig index debb83d..a9d1f92 100644 --- a/codegen/compat.zig +++ b/codegen/compat.zig @@ -1,58 +1,82 @@ const std = @import("std"); const builtin = @import("builtin"); +const block_size = 256; +const Block = [block_size][]const u21; + +const BlockMap = std.HashMap( + Block, + u16, + struct { + pub fn hash(_: @This(), k: Block) u64 { + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool { + return for (aBlock, bBlock) |a, b| { + if (a.len != b.len) return false; + for (a, b) |a_cp, b_cp| { + if (a_cp != b_cp) return false; + } + } else true; + } + }, + std.hash_map.default_max_load_percentage, +); + pub fn main() anyerror!void { var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); defer arena.deinit(); const allocator = arena.allocator(); // Process UnicodeData.txt - var write_buf: [4096]u8 = undefined; - var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); var args_iter = try std.process.argsWithAllocator(allocator); defer args_iter.deinit(); _ = args_iter.skip(); const output_path = args_iter.next() orelse @panic("No output file arg!"); - var out_file = try std.fs.cwd().createFile(output_path, .{}); - defer out_file.close(); - var writer = out_file.writer(&write_buf); + var compat_map = std.AutoHashMap(u21, []u21).init(allocator); + defer compat_map.deinit(); - const endian = builtin.cpu.arch.endian(); - - lines: while (in_reader.takeDelimiterInclusive('\n')) |took| { - const line = std.mem.trimRight(u8, took, "\n"); + while (in_reader.takeDelimiterInclusive('\n')) |line| { if (line.len == 0) continue; var field_iter = std.mem.splitScalar(u8, line, ';'); - var cps: [19]u24 = undefined; - var len: u8 = 1; + var cp: u21 = undefined; var i: usize = 0; while (field_iter.next()) |field| : (i += 1) { + if (field.len == 0) continue; + switch (i) { - 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), + 0 => { + cp = try std.fmt.parseInt(u21, field, 16); + }, 5 => { // Not compatibility. - if (field.len == 0 or field[0] != '<') continue :lines; + if (field[0] != '<') continue; + var cp_iter = std.mem.tokenizeScalar(u8, field, ' '); _ = cp_iter.next(); // + var cps: [18]u21 = undefined; + var len: u8 = 0; + while (cp_iter.next()) |cp_str| : (len += 1) { - cps[len] = try std.fmt.parseInt(u24, cp_str, 16); + cps[len] = try std.fmt.parseInt(u21, cp_str, 16); } - }, - 2 => if (line[0] == '<') continue :lines, + const slice = try allocator.dupe(u21, cps[0..len]); + try compat_map.put(cp, slice); + }, else => {}, } } - - try writer.interface.writeInt(u8, @intCast(len), endian); - for (cps[0..len]) |cp| try writer.interface.writeInt(u24, cp, endian); } else |err| switch (err) { error.EndOfStream => {}, else => { @@ -60,6 +84,63 @@ pub fn main() anyerror!void { }, } - try writer.interface.writeInt(u16, 0, endian); + // Build multi-tiered lookup tables for compatibility decompositions + var blocks_map = BlockMap.init(allocator); + defer blocks_map.deinit(); + + var stage1 = std.array_list.Managed(u16).init(allocator); + defer stage1.deinit(); + + var stage2 = std.array_list.Managed([]const u21).init(allocator); + defer stage2.deinit(); + + var block: Block = [_][]const u21{&[_]u21{}} ** block_size; + var block_len: u16 = 0; + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + const compat: []const u21 = compat_map.get(cp) orelse &[_]u21{}; + + block[block_len] = compat; + block_len += 1; + + if (block_len < block_size and cp != 0x10ffff) continue; + + const gop = try blocks_map.getOrPut(block); + if (!gop.found_existing) { + gop.value_ptr.* = @intCast(stage2.items.len); + try stage2.appendSlice(&block); + } + + try stage1.append(gop.value_ptr.*); + block_len = 0; + } + // Write out + var write_buf: [4096]u8 = undefined; + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + var writer = out_file.writer(&write_buf); + + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}][]const u21 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| { + try writer.interface.print("&.{any}, ", .{entry}); + } + + try writer.interface.writeAll( + \\}; + ); + try writer.interface.flush(); } diff --git a/codegen/fold.zig b/codegen/fold.zig index 366ed79..c5f54eb 100644 --- a/codegen/fold.zig +++ b/codegen/fold.zig @@ -228,26 +228,45 @@ pub fn main() anyerror!void { var out_file = try std.fs.cwd().createFile(output_path, .{}); defer out_file.close(); var writer = out_file.writer(&write_buf); - - const endian = builtin.cpu.arch.endian(); // Table metadata. - try writer.interface.writeInt(u24, @intCast(codepoint_cutoff), endian); - try writer.interface.writeInt(u24, @intCast(multiple_codepoint_start), endian); - // Stage 1 - try writer.interface.writeInt(u16, @intCast(meaningful_stage1.len), endian); - try writer.interface.writeAll(meaningful_stage1); - // Stage 2 - try writer.interface.writeInt(u16, @intCast(stage2.len), endian); - try writer.interface.writeAll(stage2); - // Stage 3 - try writer.interface.writeInt(u16, @intCast(stage3.len), endian); - for (stage3) |offset| try writer.interface.writeInt(i24, offset, endian); - // Changes when case folded - // Min and max - try writer.interface.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian); - try writer.interface.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian); - try writer.interface.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian); - for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.writeInt(u24, cp, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const cutoff: u21 = {}; + \\pub const cwcf_exceptions_min: u21 = {}; + \\pub const cwcf_exceptions_max: u21 = {}; + \\pub const cwcf_exceptions: [{}]u21 = .{{ + , .{ codepoint_cutoff, std.mem.min(u21, changes_when_casefolded_exceptions.items), std.mem.max(u21, changes_when_casefolded_exceptions.items), changes_when_casefolded_exceptions.items.len }); + for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.print("{}, ", .{cp}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const multiple_start: u21 = {}; + \\pub const stage1: [{}]u8 = .{{ + , .{ multiple_codepoint_start, meaningful_stage1.len }); + for (meaningful_stage1) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const stage2: [{}]u8 = .{{ + , .{stage2.len}); + for (stage2) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const stage3: [{}]i24 = .{{ + , .{stage3.len}); + for (stage3) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/hangul.zig b/codegen/hangul.zig index 2e4c175..d7504a9 100644 --- a/codegen/hangul.zig +++ b/codegen/hangul.zig @@ -120,12 +120,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u3 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/codegen/normp.zig b/codegen/normp.zig index eaf6989..343f03e 100644 --- a/codegen/normp.zig +++ b/codegen/normp.zig @@ -121,12 +121,24 @@ pub fn main() anyerror!void { defer out_file.close(); var writer = out_file.writer(&write_buf); - const endian = builtin.cpu.arch.endian(); - try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); - for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); - - try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); - for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); + try writer.interface.print( + \\//! This file is auto-generated. Do not edit. + \\ + \\pub const s1: [{}]u16 = .{{ + , .{stage1.items.len}); + for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.print( + \\ + \\}}; + \\ + \\pub const s2: [{}]u3 = .{{ + , .{stage2.items.len}); + for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); + + try writer.interface.writeAll( + \\}; + ); try writer.interface.flush(); } diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index df86b92..88f047c 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig @@ -1,113 +1,53 @@ -cutoff: u21 = undefined, -cwcf_exceptions_min: u21 = undefined, -cwcf_exceptions_max: u21 = undefined, -cwcf_exceptions: []u21 = undefined, -multiple_start: u21 = undefined, -stage1: []u8 = undefined, -stage2: []u8 = undefined, -stage3: []i24 = undefined, -normalize: Normalize, -owns_normalize: bool, - const CaseFolding = @This(); -pub fn init(allocator: Allocator) Allocator.Error!CaseFolding { - var case_fold: CaseFolding = undefined; - try case_fold.setup(allocator); - return case_fold; -} - -pub fn initWithNormalize(allocator: Allocator, norm: Normalize) Allocator.Error!CaseFolding { - var casefold: CaseFolding = undefined; - try casefold.setupWithNormalize(allocator, norm); - return casefold; -} - -pub fn setup(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void { - try casefold.setupImpl(allocator); - // Handle normalize memory separately during setup: - casefold.owns_normalize = false; - errdefer casefold.deinit(allocator); - try casefold.normalize.setup(allocator); - casefold.owns_normalize = true; -} - -pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { - try casefold.setupImpl(allocator); - casefold.normalize = norm; - casefold.owns_normalize = false; -} - -fn setupImpl(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void { - casefold.setupImplInner(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } +const Data = struct { + cutoff: u21 = undefined, + cwcf_exceptions_min: u21 = undefined, + cwcf_exceptions_max: u21 = undefined, + cwcf_exceptions: []const u21 = undefined, + multiple_start: u21 = undefined, + stage1: []const u8 = undefined, + stage2: []const u8 = undefined, + stage3: []const i24 = undefined, +}; + +const casefold = casefold: { + const data = @import("fold"); + break :casefold Data{ + .cutoff = data.cutoff, + .multiple_start = data.multiple_start, + .stage1 = &data.stage1, + .stage2 = &data.stage2, + .stage3 = &data.stage3, + .cwcf_exceptions_min = data.cwcf_exceptions_min, + .cwcf_exceptions_max = data.cwcf_exceptions_max, + .cwcf_exceptions = &data.cwcf_exceptions, }; -} - -inline fn setupImplInner(casefold: *CaseFolding, allocator: Allocator) !void { - const in_bytes = @embedFile("fold"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - casefold.cutoff = @intCast(try reader.readInt(u24, endian)); - casefold.multiple_start = @intCast(try reader.readInt(u24, endian)); - - var len = try reader.readInt(u16, endian); - casefold.stage1 = try allocator.alloc(u8, len); - errdefer allocator.free(casefold.stage1); - for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian); - - len = try reader.readInt(u16, endian); - casefold.stage2 = try allocator.alloc(u8, len); - errdefer allocator.free(casefold.stage2); - for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian); - - len = try reader.readInt(u16, endian); - casefold.stage3 = try allocator.alloc(i24, len); - errdefer allocator.free(casefold.stage3); - for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian); - - casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); - casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); - len = try reader.readInt(u16, endian); - casefold.cwcf_exceptions = try allocator.alloc(u21, len); - errdefer allocator.free(casefold.cwcf_exceptions); - for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); -} - -pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void { - allocator.free(fdata.stage1); - allocator.free(fdata.stage2); - allocator.free(fdata.stage3); - allocator.free(fdata.cwcf_exceptions); - if (fdata.owns_normalize) fdata.normalize.deinit(allocator); -} +}; /// Returns the case fold for `cp`. -pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { - if (cp >= fdata.cutoff) return &.{}; +pub fn caseFold(cp: u21, buf: []u21) []const u21 { + // Unmatched code points fold to themselves, so we default to this. + buf[0] = cp; - const stage1_val = fdata.stage1[cp >> 8]; - if (stage1_val == 0) return &.{}; + if (cp >= casefold.cutoff) return buf[0..1]; + + const stage1_val = casefold.stage1[cp >> 8]; + if (stage1_val == 0) return buf[0..1]; const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); - const stage3_index = fdata.stage2[stage2_index]; + const stage3_index = casefold.stage2[stage2_index]; if (stage3_index & 0x80 != 0) { - const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; - const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); + const real_index = @as(usize, casefold.multiple_start) + (stage3_index ^ 0x80) * 3; + const mapping = mem.sliceTo(casefold.stage3[real_index..][0..3], 0); for (mapping, 0..) |c, i| buf[i] = @intCast(c); return buf[0..mapping.len]; } - const offset = fdata.stage3[stage3_index]; - if (offset == 0) return &.{}; + const offset = casefold.stage3[stage3_index]; + if (offset == 0) return buf[0..1]; buf[0] = @intCast(@as(i32, cp) + offset); @@ -117,7 +57,6 @@ pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { /// Produces the case folded code points for `cps`. Caller must free returned /// slice with `allocator`. pub fn caseFoldAlloc( - casefold: *const CaseFolding, allocator: Allocator, cps: []const u21, ) Allocator.Error![]const u21 { @@ -126,7 +65,7 @@ pub fn caseFoldAlloc( var buf: [3]u21 = undefined; for (cps) |cp| { - const cf = casefold.caseFold(cp, &buf); + const cf = CaseFolding.caseFold(cp, &buf); if (cf.len == 0) { try cfcps.append(cp); @@ -139,19 +78,19 @@ pub fn caseFoldAlloc( } /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). -pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { +pub fn cpChangesWhenCaseFolded(cp: u21) bool { var buf: [3]u21 = undefined; - const has_mapping = casefold.caseFold(cp, &buf).len != 0; - return has_mapping and !casefold.isCwcfException(cp); + const has_mapping = CaseFolding.caseFold(cp, &buf).len != 0; + return has_mapping and !CaseFolding.isCwcfException(cp); } -pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { +pub fn changesWhenCaseFolded(cps: []const u21) bool { return for (cps) |cp| { - if (casefold.cpChangesWhenCaseFolded(cp)) break true; + if (CaseFolding.cpChangesWhenCaseFolded(cp)) break true; } else false; } -fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { +fn isCwcfException(cp: u21) bool { return cp >= casefold.cwcf_exceptions_min and cp <= casefold.cwcf_exceptions_max and std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; @@ -160,88 +99,114 @@ fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most /// comprehensive comparison possible, but slower than `canonCaselessMatch`. pub fn compatCaselessMatch( - casefold: *const CaseFolding, allocator: Allocator, + normalize: Normalize, a: []const u8, b: []const u8, ) Allocator.Error!bool { if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); // Process a - const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); + const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd); defer allocator.free(nfd_a); var need_free_cf_nfd_a = false; var cf_nfd_a: []const u21 = nfd_a; - if (casefold.changesWhenCaseFolded(nfd_a)) { - cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); + if (CaseFolding.changesWhenCaseFolded(nfd_a)) { + cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a); need_free_cf_nfd_a = true; } defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); - const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); + const nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfd_a); defer allocator.free(nfkd_cf_nfd_a); - const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); + const cf_nfkd_cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_a); defer allocator.free(cf_nfkd_cf_nfd_a); - const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); + const nfkd_cf_nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); // Process b - const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); + const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd); defer allocator.free(nfd_b); var need_free_cf_nfd_b = false; var cf_nfd_b: []const u21 = nfd_b; - if (casefold.changesWhenCaseFolded(nfd_b)) { - cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); + if (CaseFolding.changesWhenCaseFolded(nfd_b)) { + cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b); need_free_cf_nfd_b = true; } defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); - const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); + const nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfd_b); defer allocator.free(nfkd_cf_nfd_b); - const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); + const cf_nfkd_cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_b); defer allocator.free(cf_nfkd_cf_nfd_b); - const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); + const nfkd_cf_nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); } +test "caseFold" { + var buf: [3]u21 = undefined; + + // Folds '1' to '1' + try testing.expectEqual(1, caseFold('1', &buf).len); + try testing.expectEqual('1', caseFold('1', &buf)[0]); + + // Folds '2' to '2' + try testing.expectEqual(1, caseFold('2', &buf).len); + try testing.expectEqual('2', caseFold('2', &buf)[0]); + + // Folds Armenian capital letter 'Zhe' (U+053A) + try testing.expectEqual(1, caseFold('Ժ', &buf).len); + // Armenian small letter 'Zhe' (U+056A) + try testing.expectEqual('ժ', caseFold('Ժ', &buf)[0]); + + // Folds Greek small letter Upsilon with Dialytika and Perispomeni (U+1FE7) + try testing.expectEqual(3, caseFold('ῧ', &buf).len); + // Greek small letter Upsilon (U+03C5) + try testing.expectEqual('υ', caseFold('ῧ', &buf)[0]); + // Combining Diaeresis + try testing.expectEqual('\u{0308}', caseFold('ῧ', &buf)[1]); + // Combining Greek Perispomeni + try testing.expectEqual('\u{0342}', caseFold('ῧ', &buf)[2]); +} + test "compatCaselessMatch" { const allocator = testing.allocator; - const caser = try CaseFolding.init(allocator); - defer caser.deinit(allocator); + var normalize = try Normalize.init(allocator); + defer normalize.deinit(allocator); - try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); + try testing.expect(try compatCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!")); const a = "Héllo World! \u{3d3}"; const b = "He\u{301}llo World! \u{3a5}\u{301}"; - try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); + try testing.expect(try compatCaselessMatch(allocator, normalize, a, b)); const c = "He\u{301}llo World! \u{3d2}\u{301}"; - try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); + try testing.expect(try compatCaselessMatch(allocator, normalize, a, c)); } /// Performs canonical caseless string matching by decomposing to NFD. This is /// faster than `compatCaselessMatch`, but less comprehensive. pub fn canonCaselessMatch( - casefold: *const CaseFolding, allocator: Allocator, + normalize: Normalize, a: []const u8, b: []const u8, ) Allocator.Error!bool { if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); // Process a - const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); + const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd); defer allocator.free(nfd_a); var need_free_cf_nfd_a = false; var cf_nfd_a: []const u21 = nfd_a; - if (casefold.changesWhenCaseFolded(nfd_a)) { - cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); + if (CaseFolding.changesWhenCaseFolded(nfd_a)) { + cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a); need_free_cf_nfd_a = true; } defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); @@ -249,19 +214,19 @@ pub fn canonCaselessMatch( var need_free_nfd_cf_nfd_a = false; var nfd_cf_nfd_a = cf_nfd_a; if (!need_free_cf_nfd_a) { - nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); + nfd_cf_nfd_a = try normalize.nfdCodePoints(allocator, cf_nfd_a); need_free_nfd_cf_nfd_a = true; } defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); // Process b - const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); + const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd); defer allocator.free(nfd_b); var need_free_cf_nfd_b = false; var cf_nfd_b: []const u21 = nfd_b; - if (casefold.changesWhenCaseFolded(nfd_b)) { - cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); + if (CaseFolding.changesWhenCaseFolded(nfd_b)) { + cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b); need_free_cf_nfd_b = true; } defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); @@ -269,7 +234,7 @@ pub fn canonCaselessMatch( var need_free_nfd_cf_nfd_b = false; var nfd_cf_nfd_b = cf_nfd_b; if (!need_free_cf_nfd_b) { - nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); + nfd_cf_nfd_b = try normalize.nfdCodePoints(allocator, cf_nfd_b); need_free_nfd_cf_nfd_b = true; } defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); @@ -280,40 +245,17 @@ pub fn canonCaselessMatch( test "canonCaselessMatch" { const allocator = testing.allocator; - const caser = try CaseFolding.init(allocator); - defer caser.deinit(allocator); + var normalize = try Normalize.init(allocator); + defer normalize.deinit(allocator); - try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); + try testing.expect(try canonCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!")); const a = "Héllo World! \u{3d3}"; const b = "He\u{301}llo World! \u{3a5}\u{301}"; - try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); + try testing.expect(!try canonCaselessMatch(allocator, normalize, a, b)); const c = "He\u{301}llo World! \u{3d2}\u{301}"; - try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); -} - -fn testAllocations(allocator: Allocator) !void { - // With normalize provided - { - const normalize = try Normalize.init(allocator); - defer normalize.deinit(allocator); - const caser = try CaseFolding.initWithNormalize(allocator, normalize); - defer caser.deinit(allocator); - } - // With normalize owned - { - const caser = try CaseFolding.init(allocator); - defer caser.deinit(allocator); - } -} - -test "Allocation Failures" { - try testing.checkAllAllocationFailures( - testing.allocator, - testAllocations, - .{}, - ); + try testing.expect(try canonCaselessMatch(allocator, normalize, a, c)); } const std = @import("std"); diff --git a/src/CombiningData.zig b/src/CombiningData.zig index f58e0de..660dc14 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig @@ -1,45 +1,28 @@ //! Combining Class Data -s1: []u16 = undefined, -s2: []u8 = undefined, +const Data = struct { + s1: []const u16 = undefined, + s2: []const u8 = undefined, +}; + +const combining_data = combining_data: { + const data = @import("ccc"); + break :combining_data Data{ + .s1 = &data.s1, + .s2 = &data.s2, + }; +}; const CombiningData = @This(); -pub fn init(allocator: mem.Allocator) !CombiningData { - const in_bytes = @embedFile("ccc"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - var cbdata = CombiningData{}; - - const stage_1_len: u16 = try reader.readInt(u16, endian); - cbdata.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(cbdata.s1); - for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - cbdata.s2 = try allocator.alloc(u8, stage_2_len); - errdefer allocator.free(cbdata.s2); - _ = try reader.readAll(cbdata.s2); - - return cbdata; -} - -pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void { - allocator.free(cbdata.s1); - allocator.free(cbdata.s2); -} - /// Returns the canonical combining class for a code point. -pub fn ccc(cbdata: CombiningData, cp: u21) u8 { - return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)]; +pub fn ccc(cp: u21) u8 { + return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)]; } /// True if `cp` is a starter code point, not a combining character. -pub fn isStarter(cbdata: CombiningData, cp: u21) bool { - return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0; +pub fn isStarter(cp: u21) bool { + return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)] == 0; } const std = @import("std"); diff --git a/src/CompatData.zig b/src/CompatData.zig index 40ecd12..68d47f2 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig @@ -1,57 +1,19 @@ //! Compatibility Data -nfkd: [][]u21 = undefined, -cps: []u21 = undefined, - -const CompatData = @This(); - -pub fn init(allocator: mem.Allocator) !CompatData { - const in_bytes = @embedFile("compat"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - var cpdata = CompatData{ - .nfkd = try allocator.alloc([]u21, 0x110000), +const Data = struct { + s1: []const u16 = undefined, + s2: []const []const u21 = undefined, +}; + +const compat_data = compat_data: { + const data = @import("compat"); + break :compat_data Data{ + .s1 = &data.s1, + .s2 = &data.s2, }; - { - errdefer allocator.free(cpdata.nfkd); - cpdata.cps = try allocator.alloc(u21, magic.compat_size); - } - errdefer cpdata.deinit(allocator); - - @memset(cpdata.nfkd, &.{}); - - var total_len: usize = 0; - - while (true) { - const len: u8 = try reader.readInt(u8, endian); - if (len == 0) break; - const cp = try reader.readInt(u24, endian); - const nk_s = cpdata.cps[total_len..][0 .. len - 1]; - for (0..len - 1) |i| { - nk_s[i] = @intCast(try reader.readInt(u24, endian)); - } - cpdata.nfkd[cp] = nk_s; - total_len += len - 1; - } - - if (comptime magic.print) std.debug.print("CompatData magic number: {d}", .{total_len}); - - return cpdata; -} - -pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void { - allocator.free(cpdata.cps); - allocator.free(cpdata.nfkd); -} +}; /// Returns compatibility decomposition for `cp`. -pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 { - return cpdata.nfkd[cp]; +pub fn toNfkd(cp: u21) []const u21 { + return compat_data.s2[compat_data.s1[cp >> 8] + (cp & 0xff)]; } - -const std = @import("std"); -const builtin = @import("builtin"); -const mem = std.mem; -const magic = @import("magic"); diff --git a/src/HangulData.zig b/src/HangulData.zig index cae8b97..9c17421 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig @@ -9,39 +9,23 @@ pub const Syllable = enum { T, }; -s1: []u16 = undefined, -s2: []u3 = undefined, - -const Hangul = @This(); - -pub fn init(allocator: mem.Allocator) !Hangul { - const in_bytes = @embedFile("hangul"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - var hangul = Hangul{}; - - const stage_1_len: u16 = try reader.readInt(u16, endian); - hangul.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(hangul.s1); - for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - hangul.s2 = try allocator.alloc(u3, stage_2_len); - errdefer allocator.free(hangul.s2); - for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian)); +const Data = struct { + s1: []const u16 = undefined, + s2: []const u3 = undefined, +}; - return hangul; -} +const hangul = hangul_data: { + const data = @import("hangul"); + break :hangul_data Data{ + .s1 = &data.s1, + .s2 = &data.s2, + }; +}; -pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void { - allocator.free(hangul.s1); - allocator.free(hangul.s2); -} +const Hangul = @This(); /// Returns the Hangul syllable type for `cp`. -pub fn syllable(hangul: *const Hangul, cp: u21) Syllable { +pub fn syllable(cp: u21) Syllable { return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]); } diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index 7b53542..cca3556 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig @@ -1,48 +1,32 @@ //! Normalization Properties Data -s1: []u16 = undefined, -s2: []u4 = undefined, +const Data = struct { + s1: []const u16 = undefined, + s2: []const u3 = undefined, +}; + +const norms = norm_props_data: { + const data = @import("normp"); + break :norm_props_data Data{ + .s1 = &data.s1, + .s2 = &data.s2, + }; +}; const NormProps = @This(); -pub fn init(allocator: mem.Allocator) !NormProps { - const in_bytes = @embedFile("normp"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - var norms = NormProps{}; - - const stage_1_len: u16 = try reader.readInt(u16, endian); - norms.s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(norms.s1); - for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian); - - const stage_2_len: u16 = try reader.readInt(u16, endian); - norms.s2 = try allocator.alloc(u4, stage_2_len); - errdefer allocator.free(norms.s2); - for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian)); - - return norms; -} - -pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void { - allocator.free(norms.s1); - allocator.free(norms.s2); -} - /// Returns true if `cp` is already in NFD form. -pub fn isNfd(norms: *const NormProps, cp: u21) bool { +pub fn isNfd(cp: u21) bool { return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; } /// Returns true if `cp` is already in NFKD form. -pub fn isNfkd(norms: *const NormProps, cp: u21) bool { +pub fn isNfkd(cp: u21) bool { return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; } /// Returns true if `cp` is not allowed in any normalized form. -pub fn isFcx(norms: *const NormProps, cp: u21) bool { +pub fn isFcx(cp: u21) bool { return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; } diff --git a/src/Normalize.zig b/src/Normalize.zig index 4a1bae8..3191a8c 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -3,64 +3,22 @@ //! NFKC, NFD, and NFKD normalization forms. canon_data: CanonData = undefined, -ccc_data: CccData = undefined, -compat_data: CompatData = undefined, -hangul_data: HangulData = undefined, -normp_data: NormPropsData = undefined, const Normalize = @This(); -pub fn init(allocator: Allocator) Allocator.Error!Normalize { +pub fn init(allocator: Allocator) !Normalize { var norm: Normalize = undefined; try norm.setup(allocator); return norm; } -pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void { - self.canon_data = CanonData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.canon_data.deinit(allocator); - self.ccc_data = CccData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.ccc_data.deinit(allocator); - self.compat_data = CompatData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.compat_data.deinit(allocator); - self.hangul_data = HangulData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; - errdefer self.hangul_data.deinit(allocator); - self.normp_data = NormPropsData.init(allocator) catch |err| { - switch (err) { - error.OutOfMemory => |e| return e, - else => unreachable, - } - }; +pub fn setup(self: *Normalize, allocator: Allocator) !void { + self.canon_data = try CanonData.init(allocator); } pub fn deinit(norm: *const Normalize, allocator: Allocator) void { - // Reasonably safe (?) - var mut_norm = @constCast(norm); + const mut_norm = @constCast(norm); mut_norm.canon_data.deinit(allocator); - mut_norm.ccc_data.deinit(allocator); - mut_norm.compat_data.deinit(allocator); - mut_norm.hangul_data.deinit(allocator); - mut_norm.normp_data.deinit(allocator); } const SBase: u21 = 0xAC00; @@ -73,8 +31,8 @@ const TCount: u21 = 28; const NCount: u21 = 588; // VCount * TCount const SCount: u21 = 11172; // LCount * NCount -fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { - const kind = self.hangul_data.syllable(cp); +fn decomposeHangul(cp: u21, buf: []u21) ?Decomp { + const kind = HangulData.syllable(cp); if (kind != .LV and kind != .LVT) return null; const SIndex: u21 = cp - SBase; @@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { }, .nfkd => { - dc.cps = self.compat_data.toNfkd(cp); + dc.cps = CompatData.toNfkd(cp); if (dc.cps.len != 0) { dc.form = .nfkd; } else { @@ -170,13 +128,13 @@ fn decompose( // NFD / NFKD quick checks. switch (form) { - .nfd => if (self.normp_data.isNfd(cp)) return .{}, - .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, + .nfd => if (NormPropsData.isNfd(cp)) return .{}, + .nfkd => if (NormPropsData.isNfkd(cp)) return .{}, else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), } // Hangul precomposed syllable full decomposition. - if (self.decomposeHangul(cp, buf)) |dc| return dc; + if (decomposeHangul(cp, buf)) |dc| return dc; // Full decomposition. var dc = Decomp{ .form = form }; @@ -218,9 +176,8 @@ fn decompose( test "decompose" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); - var buf: [18]u21 = undefined; var dc = n.decompose('é', .nfd, &buf); @@ -280,17 +237,17 @@ pub const Result = struct { }; // Compares code points by Canonical Combining Class order. -fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { - return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); +fn cccLess(_: void, lhs: u21, rhs: u21) bool { + return CombiningData.ccc(lhs) < CombiningData.ccc(rhs); } // Applies the Canonical Sorting Algorithm. -fn canonicalSort(self: Normalize, cps: []u21) void { +fn canonicalSort(cps: []u21) void { var i: usize = 0; while (i < cps.len) : (i += 1) { const start: usize = i; - while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} - mem.sort(u21, cps[start..i], self, cccLess); + while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {} + mem.sort(u21, cps[start..i], {}, cccLess); } } @@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo } } - self.canonicalSort(dcp_list.items); + canonicalSort(dcp_list.items); return try dcp_list.toOwnedSlice(); } @@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo test "nfd ASCII / no-alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfd(allocator, "Hello World!"); @@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" { test "nfd !ASCII / alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); @@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" { test "nfkd ASCII / no-alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfkd(allocator, "Hello World!"); @@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" { test "nfkd !ASCII / alloc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); @@ -408,7 +365,7 @@ pub fn nfdCodePoints( } } - self.canonicalSort(dcp_list.items); + canonicalSort(dcp_list.items); return try dcp_list.toOwnedSlice(); } @@ -433,15 +390,15 @@ pub fn nfkdCodePoints( } } - self.canonicalSort(dcp_list.items); + canonicalSort(dcp_list.items); return try dcp_list.toOwnedSlice(); } // Composition (NFC, NFKC) -fn isHangul(self: Normalize, cp: u21) bool { - return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; +fn isHangul(cp: u21) bool { + return cp >= 0x1100 and HangulData.syllable(cp) != .none; } /// Normalizes `str` to NFC. @@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo block_check: while (i < dcps.len) : (i += 1) { const C = dcps[i]; if (C == tombstone) continue :block_check; - const cc_C = self.ccc_data.ccc(C); + const cc_C = CombiningData.ccc(C); var starter_index: ?usize = null; var j: usize = i; @@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo if (dcps[j] == tombstone) continue; // Check for starter. - if (self.ccc_data.isStarter(dcps[j])) { + if (CombiningData.isStarter(dcps[j])) { // Check for blocking conditions. for (dcps[(j + 1)..i]) |B| { if (B == tombstone) continue; - const cc_B = self.ccc_data.ccc(B); - if (cc_B != 0 and self.isHangul(C)) continue :block_check; + const cc_B = CombiningData.ccc(B); + if (cc_B != 0 and isHangul(C)) continue :block_check; if (cc_B >= cc_C) continue :block_check; } @@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo // If L and C are Hangul syllables, we can compose // them algorithmically if possible. - if (self.isHangul(L) and self.isHangul(C)) { + if (isHangul(L) and isHangul(C)) { // Get Hangul syllable types. - const l_stype = self.hangul_data.syllable(L); - const c_stype = self.hangul_data.syllable(C); + const l_stype = HangulData.syllable(L); + const c_stype = HangulData.syllable(C); if (l_stype == .LV and c_stype == .T) { // LV, T canonical composition. @@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo // Composition Exclusions (FCX) list, // preventing it from appearing in any // composed form (NFC, NFKC). - if (!self.normp_data.isFcx(P)) { + if (!NormPropsData.isFcx(P)) { dcps[sidx] = P; dcps[i] = tombstone; // Mark for deletion. deleted += 1; @@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo test "nfc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); @@ -588,7 +545,7 @@ test "nfc" { test "nfkc" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); @@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) test "eql" { const allocator = testing.allocator; - const n = try Normalize.init(allocator); + var n = try Normalize.init(allocator); defer n.deinit(allocator); try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); @@ -666,13 +623,13 @@ const mem = std.mem; const simd = std.simd; const testing = std.testing; const unicode = std.unicode; -const Allocator = std.mem.Allocator; +const Allocator = mem.Allocator; const ascii = @import("ascii"); const CodePointIterator = @import("code_point").Iterator; const CanonData = @import("CanonData"); -const CccData = @import("CombiningData"); +const CombiningData = @import("CombiningData"); const CompatData = @import("CompatData"); const HangulData = @import("HangulData"); const NormPropsData = @import("NormPropsData"); diff --git a/src/comptime_map.zig b/src/comptime_map.zig new file mode 100644 index 0000000..b2c4d11 --- /dev/null +++ b/src/comptime_map.zig @@ -0,0 +1,176 @@ +//! A copypasta of Vexu's comptime hashmap: +//! +//! https://github.com/Vexu/comptime_hash_map +//! +//! (License at base of file, thanks Vexu!) +//! + +/// A comptime hashmap constructed with automatically selected hash and eql functions. +pub fn AutoComptimeHashMap(comptime K: type, comptime V: type, comptime values: anytype) type { + return ComptimeHashMap(K, V, hash_map.AutoContext(K), values); +} + +/// Builtin hashmap for strings as keys. +pub fn ComptimeStringHashMap(comptime V: type, comptime values: anytype) type { + return ComptimeHashMap([]const u8, V, hash_map.StringContext, values); +} + +/// A hashmap which is constructed at compile time from constant values. +/// Intended to be used as a faster lookup table. +pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, comptime values: anytype) type { + std.debug.assert(values.len != 0); + @setEvalBranchQuota(1000 * values.len); + + const Entry = struct { + key: K = undefined, + val: V = undefined, + used: bool = false, + }; + + // ensure that the hash map will be at most 60% full + const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable; + comptime var slots = [1]Entry{.{}} ** size; + comptime var distance: [size]usize = .{0} ** size; + + comptime var max_distance_from_start_index = 0; + + slot_loop: for (values) |kv| { + var key: K = kv.@"0"; + var value: V = kv.@"1"; + + const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1); + + var roll_over = 0; + var distance_from_start_index = 0; + while (roll_over < size) : ({ + roll_over += 1; + distance_from_start_index += 1; + }) { + const index = (start_index + roll_over) & (size - 1); + const entry = &slots[index]; + + if (entry.used and !ctx.eql(undefined, entry.key, key)) { + if (distance[index] < distance_from_start_index) { + // robin hood to the rescue + const tmp = slots[index]; + max_distance_from_start_index = @max(max_distance_from_start_index, distance_from_start_index); + entry.* = .{ + .used = true, + .key = key, + .val = value, + }; + const tmp_distance = distance[index]; + distance[index] = distance_from_start_index; + key = tmp.key; + value = tmp.val; + distance_from_start_index = tmp_distance; + } + continue; + } + + max_distance_from_start_index = @max(distance_from_start_index, max_distance_from_start_index); + entry.* = .{ + .used = true, + .key = key, + .val = value, + }; + distance[index] = distance_from_start_index; + continue :slot_loop; + } + unreachable; // put into a full map + } + + return struct { + const entries = slots; + + pub fn has(key: K) bool { + return get(key) != null; + } + + pub fn get(key: K) ?*const V { + const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1); + { + var roll_over: usize = 0; + while (roll_over <= max_distance_from_start_index) : (roll_over += 1) { + const index = (start_index + roll_over) & (size - 1); + const entry = &entries[index]; + + if (!entry.used) return null; + if (ctx.eql(undefined, entry.key, key)) return &entry.val; + } + } + return null; + } + }; +} + +test "basic usage" { + const map = ComptimeStringHashMap(usize, .{ + .{ "foo", 1 }, + .{ "bar", 2 }, + .{ "baz", 3 }, + .{ "quux", 4 }, + .{ "Foo", 1 }, + .{ "Bar", 2 }, + .{ "Baz", 3 }, + .{ "Quux", 4 }, + }); + + try testing.expect(map.has("foo")); + try testing.expect(map.has("bar")); + try testing.expect(map.has("Foo")); + try testing.expect(map.has("Bar")); + try testing.expect(!map.has("zig")); + try testing.expect(!map.has("ziguana")); + + try testing.expect(map.get("baz").?.* == 3); + try testing.expect(map.get("quux").?.* == 4); + try testing.expect(map.get("nah") == null); + try testing.expect(map.get("...") == null); +} + +test "auto comptime hash map" { + const map = AutoComptimeHashMap(usize, []const u8, .{ + .{ 1, "foo" }, + .{ 2, "bar" }, + .{ 3, "baz" }, + .{ 45, "quux" }, + }); + + try testing.expect(map.has(1)); + try testing.expect(map.has(2)); + try testing.expect(!map.has(4)); + try testing.expect(!map.has(1_000_000)); + + try testing.expectEqualStrings("foo", map.get(1).?.*); + try testing.expectEqualStrings("bar", map.get(2).?.*); + try testing.expect(map.get(4) == null); + try testing.expect(map.get(4_000_000) == null); +} + +const std = @import("std"); +const hash_map = std.hash_map; +const testing = std.testing; +const math = std.math; + +// MIT License +// +// Copyright (c) 2020 Veikka Tuominen +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. -- cgit v1.2.3