diff options
| author | 2026-02-04 18:01:36 -0500 | |
|---|---|---|
| committer | 2026-02-04 18:01:36 -0500 | |
| commit | ba5d9081b479e95ffa7f3baf751beedd370cec14 (patch) | |
| tree | c12041d8aab9f9ff68b25a2e2c9042073c3d5f61 | |
| parent | Convert Words module to no-allocation (diff) | |
| download | zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.tar.gz zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.tar.xz zg-ba5d9081b479e95ffa7f3baf751beedd370cec14.zip | |
Normalization and case folding
Both of which deserve some further attention.
| -rw-r--r-- | build.zig | 12 | ||||
| -rw-r--r-- | codegen/ccc.zig | 24 | ||||
| -rw-r--r-- | codegen/compat.zig | 121 | ||||
| -rw-r--r-- | codegen/fold.zig | 57 | ||||
| -rw-r--r-- | codegen/hangul.zig | 24 | ||||
| -rw-r--r-- | codegen/normp.zig | 24 | ||||
| -rw-r--r-- | src/CaseFolding.zig | 258 | ||||
| -rw-r--r-- | src/CombiningData.zig | 49 | ||||
| -rw-r--r-- | src/CompatData.zig | 64 | ||||
| -rw-r--r-- | src/HangulData.zig | 42 | ||||
| -rw-r--r-- | src/NormPropsData.zig | 46 | ||||
| -rw-r--r-- | src/Normalize.zig | 119 | ||||
| -rw-r--r-- | src/comptime_map.zig | 176 |
13 files changed, 571 insertions, 445 deletions
| @@ -103,7 +103,7 @@ pub fn build(b: *std.Build) void { | |||
| 103 | }); | 103 | }); |
| 104 | compat_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); | 104 | compat_gen_exe.root_module.addAnonymousImport("UnicodeData.txt", .{ .root_source_file = b.path("data/unicode/UnicodeData.txt") }); |
| 105 | const run_compat_gen_exe = b.addRunArtifact(compat_gen_exe); | 105 | const run_compat_gen_exe = b.addRunArtifact(compat_gen_exe); |
| 106 | const compat_gen_out = run_compat_gen_exe.addOutputFileArg("compat.bin.z"); | 106 | const compat_gen_out = run_compat_gen_exe.addOutputFileArg("compat.zig"); |
| 107 | 107 | ||
| 108 | const hangul_gen_exe = b.addExecutable(.{ | 108 | const hangul_gen_exe = b.addExecutable(.{ |
| 109 | .name = "hangul", | 109 | .name = "hangul", |
| @@ -115,7 +115,7 @@ pub fn build(b: *std.Build) void { | |||
| 115 | }); | 115 | }); |
| 116 | hangul_gen_exe.root_module.addAnonymousImport("HangulSyllableType.txt", .{ .root_source_file = b.path("data/unicode/HangulSyllableType.txt") }); | 116 | hangul_gen_exe.root_module.addAnonymousImport("HangulSyllableType.txt", .{ .root_source_file = b.path("data/unicode/HangulSyllableType.txt") }); |
| 117 | const run_hangul_gen_exe = b.addRunArtifact(hangul_gen_exe); | 117 | const run_hangul_gen_exe = b.addRunArtifact(hangul_gen_exe); |
| 118 | const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.bin.z"); | 118 | const hangul_gen_out = run_hangul_gen_exe.addOutputFileArg("hangul.zig"); |
| 119 | 119 | ||
| 120 | const normp_gen_exe = b.addExecutable(.{ | 120 | const normp_gen_exe = b.addExecutable(.{ |
| 121 | .name = "normp", | 121 | .name = "normp", |
| @@ -127,7 +127,7 @@ pub fn build(b: *std.Build) void { | |||
| 127 | }); | 127 | }); |
| 128 | normp_gen_exe.root_module.addAnonymousImport("DerivedNormalizationProps.txt", .{ .root_source_file = b.path("data/unicode/DerivedNormalizationProps.txt") }); | 128 | normp_gen_exe.root_module.addAnonymousImport("DerivedNormalizationProps.txt", .{ .root_source_file = b.path("data/unicode/DerivedNormalizationProps.txt") }); |
| 129 | const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); | 129 | const run_normp_gen_exe = b.addRunArtifact(normp_gen_exe); |
| 130 | const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.bin.z"); | 130 | const normp_gen_out = run_normp_gen_exe.addOutputFileArg("normp.zig"); |
| 131 | 131 | ||
| 132 | const ccc_gen_exe = b.addExecutable(.{ | 132 | const ccc_gen_exe = b.addExecutable(.{ |
| 133 | .name = "ccc", | 133 | .name = "ccc", |
| @@ -139,7 +139,7 @@ pub fn build(b: *std.Build) void { | |||
| 139 | }); | 139 | }); |
| 140 | ccc_gen_exe.root_module.addAnonymousImport("DerivedCombiningClass.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedCombiningClass.txt") }); | 140 | ccc_gen_exe.root_module.addAnonymousImport("DerivedCombiningClass.txt", .{ .root_source_file = b.path("data/unicode/extracted/DerivedCombiningClass.txt") }); |
| 141 | const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); | 141 | const run_ccc_gen_exe = b.addRunArtifact(ccc_gen_exe); |
| 142 | const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.bin.z"); | 142 | const ccc_gen_out = run_ccc_gen_exe.addOutputFileArg("ccc.zig"); |
| 143 | 143 | ||
| 144 | const gencat_gen_exe = b.addExecutable(.{ | 144 | const gencat_gen_exe = b.addExecutable(.{ |
| 145 | .name = "gencat", | 145 | .name = "gencat", |
| @@ -164,7 +164,7 @@ pub fn build(b: *std.Build) void { | |||
| 164 | fold_gen_exe.root_module.addAnonymousImport("DerivedCoreProperties.txt", .{ .root_source_file = b.path("data/unicode/DerivedCoreProperties.txt") }); | 164 | fold_gen_exe.root_module.addAnonymousImport("DerivedCoreProperties.txt", .{ .root_source_file = b.path("data/unicode/DerivedCoreProperties.txt") }); |
| 165 | fold_gen_exe.root_module.addAnonymousImport("CaseFolding.txt", .{ .root_source_file = b.path("data/unicode/CaseFolding.txt") }); | 165 | fold_gen_exe.root_module.addAnonymousImport("CaseFolding.txt", .{ .root_source_file = b.path("data/unicode/CaseFolding.txt") }); |
| 166 | const run_fold_gen_exe = b.addRunArtifact(fold_gen_exe); | 166 | const run_fold_gen_exe = b.addRunArtifact(fold_gen_exe); |
| 167 | const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.bin.z"); | 167 | const fold_gen_out = run_fold_gen_exe.addOutputFileArg("fold.zig"); |
| 168 | 168 | ||
| 169 | // Numeric types | 169 | // Numeric types |
| 170 | const num_gen_exe = b.addExecutable(.{ | 170 | const num_gen_exe = b.addExecutable(.{ |
| @@ -518,6 +518,8 @@ pub fn build(b: *std.Build) void { | |||
| 518 | test_unicode_step.dependOn(&run_unicode_tests.step); | 518 | test_unicode_step.dependOn(&run_unicode_tests.step); |
| 519 | test_unicode_step.dependOn(&display_width_tr.step); | 519 | test_unicode_step.dependOn(&display_width_tr.step); |
| 520 | test_unicode_step.dependOn(&words_tr.step); | 520 | test_unicode_step.dependOn(&words_tr.step); |
| 521 | test_unicode_step.dependOn(&norm_tr.step); | ||
| 522 | test_unicode_step.dependOn(&case_fold_tr.step); | ||
| 521 | 523 | ||
| 522 | const test_step = b.step("test", "Run all module tests"); | 524 | const test_step = b.step("test", "Run all module tests"); |
| 523 | test_step.dependOn(&run_unicode_tests.step); | 525 | test_step.dependOn(&run_unicode_tests.step); |
diff --git a/codegen/ccc.zig b/codegen/ccc.zig index 4e470ae..e76222f 100644 --- a/codegen/ccc.zig +++ b/codegen/ccc.zig | |||
| @@ -112,12 +112,24 @@ pub fn main() anyerror!void { | |||
| 112 | defer out_file.close(); | 112 | defer out_file.close(); |
| 113 | var writer = out_file.writer(&write_buf); | 113 | var writer = out_file.writer(&write_buf); |
| 114 | 114 | ||
| 115 | const endian = builtin.cpu.arch.endian(); | 115 | try writer.interface.print( |
| 116 | try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); | 116 | \\//! This file is auto-generated. Do not edit. |
| 117 | for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); | 117 | \\ |
| 118 | 118 | \\pub const s1: [{}]u16 = .{{ | |
| 119 | try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); | 119 | , .{stage1.items.len}); |
| 120 | try writer.interface.writeAll(stage2.items); | 120 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); |
| 121 | |||
| 122 | try writer.interface.print( | ||
| 123 | \\ | ||
| 124 | \\}}; | ||
| 125 | \\ | ||
| 126 | \\pub const s2: [{}]u8 = .{{ | ||
| 127 | , .{stage2.items.len}); | ||
| 128 | for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 129 | |||
| 130 | try writer.interface.writeAll( | ||
| 131 | \\}; | ||
| 132 | ); | ||
| 121 | 133 | ||
| 122 | try writer.interface.flush(); | 134 | try writer.interface.flush(); |
| 123 | } | 135 | } |
diff --git a/codegen/compat.zig b/codegen/compat.zig index debb83d..a9d1f92 100644 --- a/codegen/compat.zig +++ b/codegen/compat.zig | |||
| @@ -1,58 +1,82 @@ | |||
| 1 | const std = @import("std"); | 1 | const std = @import("std"); |
| 2 | const builtin = @import("builtin"); | 2 | const builtin = @import("builtin"); |
| 3 | 3 | ||
| 4 | const block_size = 256; | ||
| 5 | const Block = [block_size][]const u21; | ||
| 6 | |||
| 7 | const BlockMap = std.HashMap( | ||
| 8 | Block, | ||
| 9 | u16, | ||
| 10 | struct { | ||
| 11 | pub fn hash(_: @This(), k: Block) u64 { | ||
| 12 | var hasher = std.hash.Wyhash.init(0); | ||
| 13 | std.hash.autoHashStrat(&hasher, k, .DeepRecursive); | ||
| 14 | return hasher.final(); | ||
| 15 | } | ||
| 16 | |||
| 17 | pub fn eql(_: @This(), aBlock: Block, bBlock: Block) bool { | ||
| 18 | return for (aBlock, bBlock) |a, b| { | ||
| 19 | if (a.len != b.len) return false; | ||
| 20 | for (a, b) |a_cp, b_cp| { | ||
| 21 | if (a_cp != b_cp) return false; | ||
| 22 | } | ||
| 23 | } else true; | ||
| 24 | } | ||
| 25 | }, | ||
| 26 | std.hash_map.default_max_load_percentage, | ||
| 27 | ); | ||
| 28 | |||
| 4 | pub fn main() anyerror!void { | 29 | pub fn main() anyerror!void { |
| 5 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | 30 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); |
| 6 | defer arena.deinit(); | 31 | defer arena.deinit(); |
| 7 | const allocator = arena.allocator(); | 32 | const allocator = arena.allocator(); |
| 8 | 33 | ||
| 9 | // Process UnicodeData.txt | 34 | // Process UnicodeData.txt |
| 10 | var write_buf: [4096]u8 = undefined; | ||
| 11 | |||
| 12 | var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); | 35 | var in_reader = std.io.Reader.fixed(@embedFile("UnicodeData.txt")); |
| 13 | var args_iter = try std.process.argsWithAllocator(allocator); | 36 | var args_iter = try std.process.argsWithAllocator(allocator); |
| 14 | defer args_iter.deinit(); | 37 | defer args_iter.deinit(); |
| 15 | _ = args_iter.skip(); | 38 | _ = args_iter.skip(); |
| 16 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | 39 | const output_path = args_iter.next() orelse @panic("No output file arg!"); |
| 17 | 40 | ||
| 18 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | 41 | var compat_map = std.AutoHashMap(u21, []u21).init(allocator); |
| 19 | defer out_file.close(); | 42 | defer compat_map.deinit(); |
| 20 | var writer = out_file.writer(&write_buf); | ||
| 21 | 43 | ||
| 22 | const endian = builtin.cpu.arch.endian(); | 44 | while (in_reader.takeDelimiterInclusive('\n')) |line| { |
| 23 | |||
| 24 | lines: while (in_reader.takeDelimiterInclusive('\n')) |took| { | ||
| 25 | const line = std.mem.trimRight(u8, took, "\n"); | ||
| 26 | if (line.len == 0) continue; | 45 | if (line.len == 0) continue; |
| 27 | 46 | ||
| 28 | var field_iter = std.mem.splitScalar(u8, line, ';'); | 47 | var field_iter = std.mem.splitScalar(u8, line, ';'); |
| 29 | var cps: [19]u24 = undefined; | 48 | var cp: u21 = undefined; |
| 30 | var len: u8 = 1; | ||
| 31 | 49 | ||
| 32 | var i: usize = 0; | 50 | var i: usize = 0; |
| 33 | while (field_iter.next()) |field| : (i += 1) { | 51 | while (field_iter.next()) |field| : (i += 1) { |
| 52 | if (field.len == 0) continue; | ||
| 53 | |||
| 34 | switch (i) { | 54 | switch (i) { |
| 35 | 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), | 55 | 0 => { |
| 56 | cp = try std.fmt.parseInt(u21, field, 16); | ||
| 57 | }, | ||
| 36 | 58 | ||
| 37 | 5 => { | 59 | 5 => { |
| 38 | // Not compatibility. | 60 | // Not compatibility. |
| 39 | if (field.len == 0 or field[0] != '<') continue :lines; | 61 | if (field[0] != '<') continue; |
| 62 | |||
| 40 | var cp_iter = std.mem.tokenizeScalar(u8, field, ' '); | 63 | var cp_iter = std.mem.tokenizeScalar(u8, field, ' '); |
| 41 | _ = cp_iter.next(); // <compat type> | 64 | _ = cp_iter.next(); // <compat type> |
| 42 | 65 | ||
| 66 | var cps: [18]u21 = undefined; | ||
| 67 | var len: u8 = 0; | ||
| 68 | |||
| 43 | while (cp_iter.next()) |cp_str| : (len += 1) { | 69 | while (cp_iter.next()) |cp_str| : (len += 1) { |
| 44 | cps[len] = try std.fmt.parseInt(u24, cp_str, 16); | 70 | cps[len] = try std.fmt.parseInt(u21, cp_str, 16); |
| 45 | } | 71 | } |
| 46 | }, | ||
| 47 | 72 | ||
| 48 | 2 => if (line[0] == '<') continue :lines, | 73 | const slice = try allocator.dupe(u21, cps[0..len]); |
| 74 | try compat_map.put(cp, slice); | ||
| 75 | }, | ||
| 49 | 76 | ||
| 50 | else => {}, | 77 | else => {}, |
| 51 | } | 78 | } |
| 52 | } | 79 | } |
| 53 | |||
| 54 | try writer.interface.writeInt(u8, @intCast(len), endian); | ||
| 55 | for (cps[0..len]) |cp| try writer.interface.writeInt(u24, cp, endian); | ||
| 56 | } else |err| switch (err) { | 80 | } else |err| switch (err) { |
| 57 | error.EndOfStream => {}, | 81 | error.EndOfStream => {}, |
| 58 | else => { | 82 | else => { |
| @@ -60,6 +84,63 @@ pub fn main() anyerror!void { | |||
| 60 | }, | 84 | }, |
| 61 | } | 85 | } |
| 62 | 86 | ||
| 63 | try writer.interface.writeInt(u16, 0, endian); | 87 | // Build multi-tiered lookup tables for compatibility decompositions |
| 88 | var blocks_map = BlockMap.init(allocator); | ||
| 89 | defer blocks_map.deinit(); | ||
| 90 | |||
| 91 | var stage1 = std.array_list.Managed(u16).init(allocator); | ||
| 92 | defer stage1.deinit(); | ||
| 93 | |||
| 94 | var stage2 = std.array_list.Managed([]const u21).init(allocator); | ||
| 95 | defer stage2.deinit(); | ||
| 96 | |||
| 97 | var block: Block = [_][]const u21{&[_]u21{}} ** block_size; | ||
| 98 | var block_len: u16 = 0; | ||
| 99 | |||
| 100 | for (0..0x110000) |i| { | ||
| 101 | const cp: u21 = @intCast(i); | ||
| 102 | const compat: []const u21 = compat_map.get(cp) orelse &[_]u21{}; | ||
| 103 | |||
| 104 | block[block_len] = compat; | ||
| 105 | block_len += 1; | ||
| 106 | |||
| 107 | if (block_len < block_size and cp != 0x10ffff) continue; | ||
| 108 | |||
| 109 | const gop = try blocks_map.getOrPut(block); | ||
| 110 | if (!gop.found_existing) { | ||
| 111 | gop.value_ptr.* = @intCast(stage2.items.len); | ||
| 112 | try stage2.appendSlice(&block); | ||
| 113 | } | ||
| 114 | |||
| 115 | try stage1.append(gop.value_ptr.*); | ||
| 116 | block_len = 0; | ||
| 117 | } | ||
| 118 | // Write out | ||
| 119 | var write_buf: [4096]u8 = undefined; | ||
| 120 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | ||
| 121 | defer out_file.close(); | ||
| 122 | var writer = out_file.writer(&write_buf); | ||
| 123 | |||
| 124 | try writer.interface.print( | ||
| 125 | \\//! This file is auto-generated. Do not edit. | ||
| 126 | \\ | ||
| 127 | \\pub const s1: [{}]u16 = .{{ | ||
| 128 | , .{stage1.items.len}); | ||
| 129 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 130 | |||
| 131 | try writer.interface.print( | ||
| 132 | \\ | ||
| 133 | \\}}; | ||
| 134 | \\ | ||
| 135 | \\pub const s2: [{}][]const u21 = .{{ | ||
| 136 | , .{stage2.items.len}); | ||
| 137 | for (stage2.items) |entry| { | ||
| 138 | try writer.interface.print("&.{any}, ", .{entry}); | ||
| 139 | } | ||
| 140 | |||
| 141 | try writer.interface.writeAll( | ||
| 142 | \\}; | ||
| 143 | ); | ||
| 144 | |||
| 64 | try writer.interface.flush(); | 145 | try writer.interface.flush(); |
| 65 | } | 146 | } |
diff --git a/codegen/fold.zig b/codegen/fold.zig index 366ed79..c5f54eb 100644 --- a/codegen/fold.zig +++ b/codegen/fold.zig | |||
| @@ -228,26 +228,45 @@ pub fn main() anyerror!void { | |||
| 228 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | 228 | var out_file = try std.fs.cwd().createFile(output_path, .{}); |
| 229 | defer out_file.close(); | 229 | defer out_file.close(); |
| 230 | var writer = out_file.writer(&write_buf); | 230 | var writer = out_file.writer(&write_buf); |
| 231 | |||
| 232 | const endian = builtin.cpu.arch.endian(); | ||
| 233 | // Table metadata. | 231 | // Table metadata. |
| 234 | try writer.interface.writeInt(u24, @intCast(codepoint_cutoff), endian); | 232 | try writer.interface.print( |
| 235 | try writer.interface.writeInt(u24, @intCast(multiple_codepoint_start), endian); | 233 | \\//! This file is auto-generated. Do not edit. |
| 236 | // Stage 1 | 234 | \\ |
| 237 | try writer.interface.writeInt(u16, @intCast(meaningful_stage1.len), endian); | 235 | \\pub const cutoff: u21 = {}; |
| 238 | try writer.interface.writeAll(meaningful_stage1); | 236 | \\pub const cwcf_exceptions_min: u21 = {}; |
| 239 | // Stage 2 | 237 | \\pub const cwcf_exceptions_max: u21 = {}; |
| 240 | try writer.interface.writeInt(u16, @intCast(stage2.len), endian); | 238 | \\pub const cwcf_exceptions: [{}]u21 = .{{ |
| 241 | try writer.interface.writeAll(stage2); | 239 | , .{ codepoint_cutoff, std.mem.min(u21, changes_when_casefolded_exceptions.items), std.mem.max(u21, changes_when_casefolded_exceptions.items), changes_when_casefolded_exceptions.items.len }); |
| 242 | // Stage 3 | 240 | for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.print("{}, ", .{cp}); |
| 243 | try writer.interface.writeInt(u16, @intCast(stage3.len), endian); | 241 | |
| 244 | for (stage3) |offset| try writer.interface.writeInt(i24, offset, endian); | 242 | try writer.interface.print( |
| 245 | // Changes when case folded | 243 | \\ |
| 246 | // Min and max | 244 | \\}}; |
| 247 | try writer.interface.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian); | 245 | \\ |
| 248 | try writer.interface.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian); | 246 | \\pub const multiple_start: u21 = {}; |
| 249 | try writer.interface.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian); | 247 | \\pub const stage1: [{}]u8 = .{{ |
| 250 | for (changes_when_casefolded_exceptions.items) |cp| try writer.interface.writeInt(u24, cp, endian); | 248 | , .{ multiple_codepoint_start, meaningful_stage1.len }); |
| 249 | for (meaningful_stage1) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 250 | |||
| 251 | try writer.interface.print( | ||
| 252 | \\ | ||
| 253 | \\}}; | ||
| 254 | \\ | ||
| 255 | \\pub const stage2: [{}]u8 = .{{ | ||
| 256 | , .{stage2.len}); | ||
| 257 | for (stage2) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 258 | |||
| 259 | try writer.interface.print( | ||
| 260 | \\ | ||
| 261 | \\}}; | ||
| 262 | \\ | ||
| 263 | \\pub const stage3: [{}]i24 = .{{ | ||
| 264 | , .{stage3.len}); | ||
| 265 | for (stage3) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 266 | |||
| 267 | try writer.interface.writeAll( | ||
| 268 | \\}; | ||
| 269 | ); | ||
| 251 | 270 | ||
| 252 | try writer.interface.flush(); | 271 | try writer.interface.flush(); |
| 253 | } | 272 | } |
diff --git a/codegen/hangul.zig b/codegen/hangul.zig index 2e4c175..d7504a9 100644 --- a/codegen/hangul.zig +++ b/codegen/hangul.zig | |||
| @@ -120,12 +120,24 @@ pub fn main() anyerror!void { | |||
| 120 | defer out_file.close(); | 120 | defer out_file.close(); |
| 121 | var writer = out_file.writer(&write_buf); | 121 | var writer = out_file.writer(&write_buf); |
| 122 | 122 | ||
| 123 | const endian = builtin.cpu.arch.endian(); | 123 | try writer.interface.print( |
| 124 | try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); | 124 | \\//! This file is auto-generated. Do not edit. |
| 125 | for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); | 125 | \\ |
| 126 | 126 | \\pub const s1: [{}]u16 = .{{ | |
| 127 | try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); | 127 | , .{stage1.items.len}); |
| 128 | for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); | 128 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); |
| 129 | |||
| 130 | try writer.interface.print( | ||
| 131 | \\ | ||
| 132 | \\}}; | ||
| 133 | \\ | ||
| 134 | \\pub const s2: [{}]u3 = .{{ | ||
| 135 | , .{stage2.items.len}); | ||
| 136 | for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 137 | |||
| 138 | try writer.interface.writeAll( | ||
| 139 | \\}; | ||
| 140 | ); | ||
| 129 | 141 | ||
| 130 | try writer.interface.flush(); | 142 | try writer.interface.flush(); |
| 131 | } | 143 | } |
diff --git a/codegen/normp.zig b/codegen/normp.zig index eaf6989..343f03e 100644 --- a/codegen/normp.zig +++ b/codegen/normp.zig | |||
| @@ -121,12 +121,24 @@ pub fn main() anyerror!void { | |||
| 121 | defer out_file.close(); | 121 | defer out_file.close(); |
| 122 | var writer = out_file.writer(&write_buf); | 122 | var writer = out_file.writer(&write_buf); |
| 123 | 123 | ||
| 124 | const endian = builtin.cpu.arch.endian(); | 124 | try writer.interface.print( |
| 125 | try writer.interface.writeInt(u16, @intCast(stage1.items.len), endian); | 125 | \\//! This file is auto-generated. Do not edit. |
| 126 | for (stage1.items) |i| try writer.interface.writeInt(u16, i, endian); | 126 | \\ |
| 127 | 127 | \\pub const s1: [{}]u16 = .{{ | |
| 128 | try writer.interface.writeInt(u16, @intCast(stage2.items.len), endian); | 128 | , .{stage1.items.len}); |
| 129 | for (stage2.items) |i| try writer.interface.writeInt(u8, i, endian); | 129 | for (stage1.items) |entry| try writer.interface.print("{}, ", .{entry}); |
| 130 | |||
| 131 | try writer.interface.print( | ||
| 132 | \\ | ||
| 133 | \\}}; | ||
| 134 | \\ | ||
| 135 | \\pub const s2: [{}]u3 = .{{ | ||
| 136 | , .{stage2.items.len}); | ||
| 137 | for (stage2.items) |entry| try writer.interface.print("{}, ", .{entry}); | ||
| 138 | |||
| 139 | try writer.interface.writeAll( | ||
| 140 | \\}; | ||
| 141 | ); | ||
| 130 | 142 | ||
| 131 | try writer.interface.flush(); | 143 | try writer.interface.flush(); |
| 132 | } | 144 | } |
diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index df86b92..88f047c 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig | |||
| @@ -1,113 +1,53 @@ | |||
| 1 | cutoff: u21 = undefined, | ||
| 2 | cwcf_exceptions_min: u21 = undefined, | ||
| 3 | cwcf_exceptions_max: u21 = undefined, | ||
| 4 | cwcf_exceptions: []u21 = undefined, | ||
| 5 | multiple_start: u21 = undefined, | ||
| 6 | stage1: []u8 = undefined, | ||
| 7 | stage2: []u8 = undefined, | ||
| 8 | stage3: []i24 = undefined, | ||
| 9 | normalize: Normalize, | ||
| 10 | owns_normalize: bool, | ||
| 11 | |||
| 12 | const CaseFolding = @This(); | 1 | const CaseFolding = @This(); |
| 13 | 2 | ||
| 14 | pub fn init(allocator: Allocator) Allocator.Error!CaseFolding { | 3 | const Data = struct { |
| 15 | var case_fold: CaseFolding = undefined; | 4 | cutoff: u21 = undefined, |
| 16 | try case_fold.setup(allocator); | 5 | cwcf_exceptions_min: u21 = undefined, |
| 17 | return case_fold; | 6 | cwcf_exceptions_max: u21 = undefined, |
| 18 | } | 7 | cwcf_exceptions: []const u21 = undefined, |
| 19 | 8 | multiple_start: u21 = undefined, | |
| 20 | pub fn initWithNormalize(allocator: Allocator, norm: Normalize) Allocator.Error!CaseFolding { | 9 | stage1: []const u8 = undefined, |
| 21 | var casefold: CaseFolding = undefined; | 10 | stage2: []const u8 = undefined, |
| 22 | try casefold.setupWithNormalize(allocator, norm); | 11 | stage3: []const i24 = undefined, |
| 23 | return casefold; | 12 | }; |
| 24 | } | 13 | |
| 25 | 14 | const casefold = casefold: { | |
| 26 | pub fn setup(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void { | 15 | const data = @import("fold"); |
| 27 | try casefold.setupImpl(allocator); | 16 | break :casefold Data{ |
| 28 | // Handle normalize memory separately during setup: | 17 | .cutoff = data.cutoff, |
| 29 | casefold.owns_normalize = false; | 18 | .multiple_start = data.multiple_start, |
| 30 | errdefer casefold.deinit(allocator); | 19 | .stage1 = &data.stage1, |
| 31 | try casefold.normalize.setup(allocator); | 20 | .stage2 = &data.stage2, |
| 32 | casefold.owns_normalize = true; | 21 | .stage3 = &data.stage3, |
| 33 | } | 22 | .cwcf_exceptions_min = data.cwcf_exceptions_min, |
| 34 | 23 | .cwcf_exceptions_max = data.cwcf_exceptions_max, | |
| 35 | pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { | 24 | .cwcf_exceptions = &data.cwcf_exceptions, |
| 36 | try casefold.setupImpl(allocator); | ||
| 37 | casefold.normalize = norm; | ||
| 38 | casefold.owns_normalize = false; | ||
| 39 | } | ||
| 40 | |||
| 41 | fn setupImpl(casefold: *CaseFolding, allocator: Allocator) Allocator.Error!void { | ||
| 42 | casefold.setupImplInner(allocator) catch |err| { | ||
| 43 | switch (err) { | ||
| 44 | error.OutOfMemory => |e| return e, | ||
| 45 | else => unreachable, | ||
| 46 | } | ||
| 47 | }; | 25 | }; |
| 48 | } | 26 | }; |
| 49 | |||
| 50 | inline fn setupImplInner(casefold: *CaseFolding, allocator: Allocator) !void { | ||
| 51 | const in_bytes = @embedFile("fold"); | ||
| 52 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 53 | var reader = in_fbs.reader(); | ||
| 54 | |||
| 55 | const endian = builtin.cpu.arch.endian(); | ||
| 56 | |||
| 57 | casefold.cutoff = @intCast(try reader.readInt(u24, endian)); | ||
| 58 | casefold.multiple_start = @intCast(try reader.readInt(u24, endian)); | ||
| 59 | |||
| 60 | var len = try reader.readInt(u16, endian); | ||
| 61 | casefold.stage1 = try allocator.alloc(u8, len); | ||
| 62 | errdefer allocator.free(casefold.stage1); | ||
| 63 | for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian); | ||
| 64 | |||
| 65 | len = try reader.readInt(u16, endian); | ||
| 66 | casefold.stage2 = try allocator.alloc(u8, len); | ||
| 67 | errdefer allocator.free(casefold.stage2); | ||
| 68 | for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian); | ||
| 69 | |||
| 70 | len = try reader.readInt(u16, endian); | ||
| 71 | casefold.stage3 = try allocator.alloc(i24, len); | ||
| 72 | errdefer allocator.free(casefold.stage3); | ||
| 73 | for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian); | ||
| 74 | |||
| 75 | casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | ||
| 76 | casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | ||
| 77 | len = try reader.readInt(u16, endian); | ||
| 78 | casefold.cwcf_exceptions = try allocator.alloc(u21, len); | ||
| 79 | errdefer allocator.free(casefold.cwcf_exceptions); | ||
| 80 | for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 81 | } | ||
| 82 | |||
| 83 | pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void { | ||
| 84 | allocator.free(fdata.stage1); | ||
| 85 | allocator.free(fdata.stage2); | ||
| 86 | allocator.free(fdata.stage3); | ||
| 87 | allocator.free(fdata.cwcf_exceptions); | ||
| 88 | if (fdata.owns_normalize) fdata.normalize.deinit(allocator); | ||
| 89 | } | ||
| 90 | 27 | ||
| 91 | /// Returns the case fold for `cp`. | 28 | /// Returns the case fold for `cp`. |
| 92 | pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { | 29 | pub fn caseFold(cp: u21, buf: []u21) []const u21 { |
| 93 | if (cp >= fdata.cutoff) return &.{}; | 30 | // Unmatched code points fold to themselves, so we default to this. |
| 31 | buf[0] = cp; | ||
| 94 | 32 | ||
| 95 | const stage1_val = fdata.stage1[cp >> 8]; | 33 | if (cp >= casefold.cutoff) return buf[0..1]; |
| 96 | if (stage1_val == 0) return &.{}; | 34 | |
| 35 | const stage1_val = casefold.stage1[cp >> 8]; | ||
| 36 | if (stage1_val == 0) return buf[0..1]; | ||
| 97 | 37 | ||
| 98 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); | 38 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); |
| 99 | const stage3_index = fdata.stage2[stage2_index]; | 39 | const stage3_index = casefold.stage2[stage2_index]; |
| 100 | 40 | ||
| 101 | if (stage3_index & 0x80 != 0) { | 41 | if (stage3_index & 0x80 != 0) { |
| 102 | const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; | 42 | const real_index = @as(usize, casefold.multiple_start) + (stage3_index ^ 0x80) * 3; |
| 103 | const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); | 43 | const mapping = mem.sliceTo(casefold.stage3[real_index..][0..3], 0); |
| 104 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); | 44 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); |
| 105 | 45 | ||
| 106 | return buf[0..mapping.len]; | 46 | return buf[0..mapping.len]; |
| 107 | } | 47 | } |
| 108 | 48 | ||
| 109 | const offset = fdata.stage3[stage3_index]; | 49 | const offset = casefold.stage3[stage3_index]; |
| 110 | if (offset == 0) return &.{}; | 50 | if (offset == 0) return buf[0..1]; |
| 111 | 51 | ||
| 112 | buf[0] = @intCast(@as(i32, cp) + offset); | 52 | buf[0] = @intCast(@as(i32, cp) + offset); |
| 113 | 53 | ||
| @@ -117,7 +57,6 @@ pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { | |||
| 117 | /// Produces the case folded code points for `cps`. Caller must free returned | 57 | /// Produces the case folded code points for `cps`. Caller must free returned |
| 118 | /// slice with `allocator`. | 58 | /// slice with `allocator`. |
| 119 | pub fn caseFoldAlloc( | 59 | pub fn caseFoldAlloc( |
| 120 | casefold: *const CaseFolding, | ||
| 121 | allocator: Allocator, | 60 | allocator: Allocator, |
| 122 | cps: []const u21, | 61 | cps: []const u21, |
| 123 | ) Allocator.Error![]const u21 { | 62 | ) Allocator.Error![]const u21 { |
| @@ -126,7 +65,7 @@ pub fn caseFoldAlloc( | |||
| 126 | var buf: [3]u21 = undefined; | 65 | var buf: [3]u21 = undefined; |
| 127 | 66 | ||
| 128 | for (cps) |cp| { | 67 | for (cps) |cp| { |
| 129 | const cf = casefold.caseFold(cp, &buf); | 68 | const cf = CaseFolding.caseFold(cp, &buf); |
| 130 | 69 | ||
| 131 | if (cf.len == 0) { | 70 | if (cf.len == 0) { |
| 132 | try cfcps.append(cp); | 71 | try cfcps.append(cp); |
| @@ -139,19 +78,19 @@ pub fn caseFoldAlloc( | |||
| 139 | } | 78 | } |
| 140 | 79 | ||
| 141 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | 80 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). |
| 142 | pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { | 81 | pub fn cpChangesWhenCaseFolded(cp: u21) bool { |
| 143 | var buf: [3]u21 = undefined; | 82 | var buf: [3]u21 = undefined; |
| 144 | const has_mapping = casefold.caseFold(cp, &buf).len != 0; | 83 | const has_mapping = CaseFolding.caseFold(cp, &buf).len != 0; |
| 145 | return has_mapping and !casefold.isCwcfException(cp); | 84 | return has_mapping and !CaseFolding.isCwcfException(cp); |
| 146 | } | 85 | } |
| 147 | 86 | ||
| 148 | pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { | 87 | pub fn changesWhenCaseFolded(cps: []const u21) bool { |
| 149 | return for (cps) |cp| { | 88 | return for (cps) |cp| { |
| 150 | if (casefold.cpChangesWhenCaseFolded(cp)) break true; | 89 | if (CaseFolding.cpChangesWhenCaseFolded(cp)) break true; |
| 151 | } else false; | 90 | } else false; |
| 152 | } | 91 | } |
| 153 | 92 | ||
| 154 | fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { | 93 | fn isCwcfException(cp: u21) bool { |
| 155 | return cp >= casefold.cwcf_exceptions_min and | 94 | return cp >= casefold.cwcf_exceptions_min and |
| 156 | cp <= casefold.cwcf_exceptions_max and | 95 | cp <= casefold.cwcf_exceptions_max and |
| 157 | std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; | 96 | std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; |
| @@ -160,88 +99,114 @@ fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { | |||
| 160 | /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most | 99 | /// Caseless compare `a` and `b` by decomposing to NFKD. This is the most |
| 161 | /// comprehensive comparison possible, but slower than `canonCaselessMatch`. | 100 | /// comprehensive comparison possible, but slower than `canonCaselessMatch`. |
| 162 | pub fn compatCaselessMatch( | 101 | pub fn compatCaselessMatch( |
| 163 | casefold: *const CaseFolding, | ||
| 164 | allocator: Allocator, | 102 | allocator: Allocator, |
| 103 | normalize: Normalize, | ||
| 165 | a: []const u8, | 104 | a: []const u8, |
| 166 | b: []const u8, | 105 | b: []const u8, |
| 167 | ) Allocator.Error!bool { | 106 | ) Allocator.Error!bool { |
| 168 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 107 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); |
| 169 | 108 | ||
| 170 | // Process a | 109 | // Process a |
| 171 | const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); | 110 | const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd); |
| 172 | defer allocator.free(nfd_a); | 111 | defer allocator.free(nfd_a); |
| 173 | 112 | ||
| 174 | var need_free_cf_nfd_a = false; | 113 | var need_free_cf_nfd_a = false; |
| 175 | var cf_nfd_a: []const u21 = nfd_a; | 114 | var cf_nfd_a: []const u21 = nfd_a; |
| 176 | if (casefold.changesWhenCaseFolded(nfd_a)) { | 115 | if (CaseFolding.changesWhenCaseFolded(nfd_a)) { |
| 177 | cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); | 116 | cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a); |
| 178 | need_free_cf_nfd_a = true; | 117 | need_free_cf_nfd_a = true; |
| 179 | } | 118 | } |
| 180 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); | 119 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); |
| 181 | 120 | ||
| 182 | const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); | 121 | const nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfd_a); |
| 183 | defer allocator.free(nfkd_cf_nfd_a); | 122 | defer allocator.free(nfkd_cf_nfd_a); |
| 184 | const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); | 123 | const cf_nfkd_cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_a); |
| 185 | defer allocator.free(cf_nfkd_cf_nfd_a); | 124 | defer allocator.free(cf_nfkd_cf_nfd_a); |
| 186 | const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); | 125 | const nfkd_cf_nfkd_cf_nfd_a = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); |
| 187 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); | 126 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); |
| 188 | 127 | ||
| 189 | // Process b | 128 | // Process b |
| 190 | const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); | 129 | const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd); |
| 191 | defer allocator.free(nfd_b); | 130 | defer allocator.free(nfd_b); |
| 192 | 131 | ||
| 193 | var need_free_cf_nfd_b = false; | 132 | var need_free_cf_nfd_b = false; |
| 194 | var cf_nfd_b: []const u21 = nfd_b; | 133 | var cf_nfd_b: []const u21 = nfd_b; |
| 195 | if (casefold.changesWhenCaseFolded(nfd_b)) { | 134 | if (CaseFolding.changesWhenCaseFolded(nfd_b)) { |
| 196 | cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); | 135 | cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b); |
| 197 | need_free_cf_nfd_b = true; | 136 | need_free_cf_nfd_b = true; |
| 198 | } | 137 | } |
| 199 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); | 138 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); |
| 200 | 139 | ||
| 201 | const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); | 140 | const nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfd_b); |
| 202 | defer allocator.free(nfkd_cf_nfd_b); | 141 | defer allocator.free(nfkd_cf_nfd_b); |
| 203 | const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); | 142 | const cf_nfkd_cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfkd_cf_nfd_b); |
| 204 | defer allocator.free(cf_nfkd_cf_nfd_b); | 143 | defer allocator.free(cf_nfkd_cf_nfd_b); |
| 205 | const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); | 144 | const nfkd_cf_nfkd_cf_nfd_b = try normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); |
| 206 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); | 145 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); |
| 207 | 146 | ||
| 208 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); | 147 | return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); |
| 209 | } | 148 | } |
| 210 | 149 | ||
| 150 | test "caseFold" { | ||
| 151 | var buf: [3]u21 = undefined; | ||
| 152 | |||
| 153 | // Folds '1' to '1' | ||
| 154 | try testing.expectEqual(1, caseFold('1', &buf).len); | ||
| 155 | try testing.expectEqual('1', caseFold('1', &buf)[0]); | ||
| 156 | |||
| 157 | // Folds '2' to '2' | ||
| 158 | try testing.expectEqual(1, caseFold('2', &buf).len); | ||
| 159 | try testing.expectEqual('2', caseFold('2', &buf)[0]); | ||
| 160 | |||
| 161 | // Folds Armenian capital letter 'Zhe' (U+053A) | ||
| 162 | try testing.expectEqual(1, caseFold('Ժ', &buf).len); | ||
| 163 | // Armenian small letter 'Zhe' (U+056A) | ||
| 164 | try testing.expectEqual('ժ', caseFold('Ժ', &buf)[0]); | ||
| 165 | |||
| 166 | // Folds Greek small letter Upsilon with Dialytika and Perispomeni (U+1FE7) | ||
| 167 | try testing.expectEqual(3, caseFold('ῧ', &buf).len); | ||
| 168 | // Greek small letter Upsilon (U+03C5) | ||
| 169 | try testing.expectEqual('υ', caseFold('ῧ', &buf)[0]); | ||
| 170 | // Combining Diaeresis | ||
| 171 | try testing.expectEqual('\u{0308}', caseFold('ῧ', &buf)[1]); | ||
| 172 | // Combining Greek Perispomeni | ||
| 173 | try testing.expectEqual('\u{0342}', caseFold('ῧ', &buf)[2]); | ||
| 174 | } | ||
| 175 | |||
| 211 | test "compatCaselessMatch" { | 176 | test "compatCaselessMatch" { |
| 212 | const allocator = testing.allocator; | 177 | const allocator = testing.allocator; |
| 213 | 178 | ||
| 214 | const caser = try CaseFolding.init(allocator); | 179 | var normalize = try Normalize.init(allocator); |
| 215 | defer caser.deinit(allocator); | 180 | defer normalize.deinit(allocator); |
| 216 | 181 | ||
| 217 | try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); | 182 | try testing.expect(try compatCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!")); |
| 218 | 183 | ||
| 219 | const a = "Héllo World! \u{3d3}"; | 184 | const a = "Héllo World! \u{3d3}"; |
| 220 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | 185 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; |
| 221 | try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); | 186 | try testing.expect(try compatCaselessMatch(allocator, normalize, a, b)); |
| 222 | 187 | ||
| 223 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | 188 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; |
| 224 | try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); | 189 | try testing.expect(try compatCaselessMatch(allocator, normalize, a, c)); |
| 225 | } | 190 | } |
| 226 | 191 | ||
| 227 | /// Performs canonical caseless string matching by decomposing to NFD. This is | 192 | /// Performs canonical caseless string matching by decomposing to NFD. This is |
| 228 | /// faster than `compatCaselessMatch`, but less comprehensive. | 193 | /// faster than `compatCaselessMatch`, but less comprehensive. |
| 229 | pub fn canonCaselessMatch( | 194 | pub fn canonCaselessMatch( |
| 230 | casefold: *const CaseFolding, | ||
| 231 | allocator: Allocator, | 195 | allocator: Allocator, |
| 196 | normalize: Normalize, | ||
| 232 | a: []const u8, | 197 | a: []const u8, |
| 233 | b: []const u8, | 198 | b: []const u8, |
| 234 | ) Allocator.Error!bool { | 199 | ) Allocator.Error!bool { |
| 235 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 200 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); |
| 236 | 201 | ||
| 237 | // Process a | 202 | // Process a |
| 238 | const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); | 203 | const nfd_a = try normalize.nfxdCodePoints(allocator, a, .nfd); |
| 239 | defer allocator.free(nfd_a); | 204 | defer allocator.free(nfd_a); |
| 240 | 205 | ||
| 241 | var need_free_cf_nfd_a = false; | 206 | var need_free_cf_nfd_a = false; |
| 242 | var cf_nfd_a: []const u21 = nfd_a; | 207 | var cf_nfd_a: []const u21 = nfd_a; |
| 243 | if (casefold.changesWhenCaseFolded(nfd_a)) { | 208 | if (CaseFolding.changesWhenCaseFolded(nfd_a)) { |
| 244 | cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); | 209 | cf_nfd_a = try CaseFolding.caseFoldAlloc(allocator, nfd_a); |
| 245 | need_free_cf_nfd_a = true; | 210 | need_free_cf_nfd_a = true; |
| 246 | } | 211 | } |
| 247 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); | 212 | defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); |
| @@ -249,19 +214,19 @@ pub fn canonCaselessMatch( | |||
| 249 | var need_free_nfd_cf_nfd_a = false; | 214 | var need_free_nfd_cf_nfd_a = false; |
| 250 | var nfd_cf_nfd_a = cf_nfd_a; | 215 | var nfd_cf_nfd_a = cf_nfd_a; |
| 251 | if (!need_free_cf_nfd_a) { | 216 | if (!need_free_cf_nfd_a) { |
| 252 | nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); | 217 | nfd_cf_nfd_a = try normalize.nfdCodePoints(allocator, cf_nfd_a); |
| 253 | need_free_nfd_cf_nfd_a = true; | 218 | need_free_nfd_cf_nfd_a = true; |
| 254 | } | 219 | } |
| 255 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); | 220 | defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); |
| 256 | 221 | ||
| 257 | // Process b | 222 | // Process b |
| 258 | const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); | 223 | const nfd_b = try normalize.nfxdCodePoints(allocator, b, .nfd); |
| 259 | defer allocator.free(nfd_b); | 224 | defer allocator.free(nfd_b); |
| 260 | 225 | ||
| 261 | var need_free_cf_nfd_b = false; | 226 | var need_free_cf_nfd_b = false; |
| 262 | var cf_nfd_b: []const u21 = nfd_b; | 227 | var cf_nfd_b: []const u21 = nfd_b; |
| 263 | if (casefold.changesWhenCaseFolded(nfd_b)) { | 228 | if (CaseFolding.changesWhenCaseFolded(nfd_b)) { |
| 264 | cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); | 229 | cf_nfd_b = try CaseFolding.caseFoldAlloc(allocator, nfd_b); |
| 265 | need_free_cf_nfd_b = true; | 230 | need_free_cf_nfd_b = true; |
| 266 | } | 231 | } |
| 267 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); | 232 | defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); |
| @@ -269,7 +234,7 @@ pub fn canonCaselessMatch( | |||
| 269 | var need_free_nfd_cf_nfd_b = false; | 234 | var need_free_nfd_cf_nfd_b = false; |
| 270 | var nfd_cf_nfd_b = cf_nfd_b; | 235 | var nfd_cf_nfd_b = cf_nfd_b; |
| 271 | if (!need_free_cf_nfd_b) { | 236 | if (!need_free_cf_nfd_b) { |
| 272 | nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); | 237 | nfd_cf_nfd_b = try normalize.nfdCodePoints(allocator, cf_nfd_b); |
| 273 | need_free_nfd_cf_nfd_b = true; | 238 | need_free_nfd_cf_nfd_b = true; |
| 274 | } | 239 | } |
| 275 | defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); | 240 | defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); |
| @@ -280,40 +245,17 @@ pub fn canonCaselessMatch( | |||
| 280 | test "canonCaselessMatch" { | 245 | test "canonCaselessMatch" { |
| 281 | const allocator = testing.allocator; | 246 | const allocator = testing.allocator; |
| 282 | 247 | ||
| 283 | const caser = try CaseFolding.init(allocator); | 248 | var normalize = try Normalize.init(allocator); |
| 284 | defer caser.deinit(allocator); | 249 | defer normalize.deinit(allocator); |
| 285 | 250 | ||
| 286 | try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); | 251 | try testing.expect(try canonCaselessMatch(allocator, normalize, "ascii only!", "ASCII Only!")); |
| 287 | 252 | ||
| 288 | const a = "Héllo World! \u{3d3}"; | 253 | const a = "Héllo World! \u{3d3}"; |
| 289 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; | 254 | const b = "He\u{301}llo World! \u{3a5}\u{301}"; |
| 290 | try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); | 255 | try testing.expect(!try canonCaselessMatch(allocator, normalize, a, b)); |
| 291 | 256 | ||
| 292 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; | 257 | const c = "He\u{301}llo World! \u{3d2}\u{301}"; |
| 293 | try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); | 258 | try testing.expect(try canonCaselessMatch(allocator, normalize, a, c)); |
| 294 | } | ||
| 295 | |||
| 296 | fn testAllocations(allocator: Allocator) !void { | ||
| 297 | // With normalize provided | ||
| 298 | { | ||
| 299 | const normalize = try Normalize.init(allocator); | ||
| 300 | defer normalize.deinit(allocator); | ||
| 301 | const caser = try CaseFolding.initWithNormalize(allocator, normalize); | ||
| 302 | defer caser.deinit(allocator); | ||
| 303 | } | ||
| 304 | // With normalize owned | ||
| 305 | { | ||
| 306 | const caser = try CaseFolding.init(allocator); | ||
| 307 | defer caser.deinit(allocator); | ||
| 308 | } | ||
| 309 | } | ||
| 310 | |||
| 311 | test "Allocation Failures" { | ||
| 312 | try testing.checkAllAllocationFailures( | ||
| 313 | testing.allocator, | ||
| 314 | testAllocations, | ||
| 315 | .{}, | ||
| 316 | ); | ||
| 317 | } | 259 | } |
| 318 | 260 | ||
| 319 | const std = @import("std"); | 261 | const std = @import("std"); |
diff --git a/src/CombiningData.zig b/src/CombiningData.zig index f58e0de..660dc14 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig | |||
| @@ -1,45 +1,28 @@ | |||
| 1 | //! Combining Class Data | 1 | //! Combining Class Data |
| 2 | 2 | ||
| 3 | s1: []u16 = undefined, | 3 | const Data = struct { |
| 4 | s2: []u8 = undefined, | 4 | s1: []const u16 = undefined, |
| 5 | s2: []const u8 = undefined, | ||
| 6 | }; | ||
| 7 | |||
| 8 | const combining_data = combining_data: { | ||
| 9 | const data = @import("ccc"); | ||
| 10 | break :combining_data Data{ | ||
| 11 | .s1 = &data.s1, | ||
| 12 | .s2 = &data.s2, | ||
| 13 | }; | ||
| 14 | }; | ||
| 5 | 15 | ||
| 6 | const CombiningData = @This(); | 16 | const CombiningData = @This(); |
| 7 | 17 | ||
| 8 | pub fn init(allocator: mem.Allocator) !CombiningData { | ||
| 9 | const in_bytes = @embedFile("ccc"); | ||
| 10 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 11 | var reader = in_fbs.reader(); | ||
| 12 | |||
| 13 | const endian = builtin.cpu.arch.endian(); | ||
| 14 | |||
| 15 | var cbdata = CombiningData{}; | ||
| 16 | |||
| 17 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 18 | cbdata.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 19 | errdefer allocator.free(cbdata.s1); | ||
| 20 | for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian); | ||
| 21 | |||
| 22 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 23 | cbdata.s2 = try allocator.alloc(u8, stage_2_len); | ||
| 24 | errdefer allocator.free(cbdata.s2); | ||
| 25 | _ = try reader.readAll(cbdata.s2); | ||
| 26 | |||
| 27 | return cbdata; | ||
| 28 | } | ||
| 29 | |||
| 30 | pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void { | ||
| 31 | allocator.free(cbdata.s1); | ||
| 32 | allocator.free(cbdata.s2); | ||
| 33 | } | ||
| 34 | |||
| 35 | /// Returns the canonical combining class for a code point. | 18 | /// Returns the canonical combining class for a code point. |
| 36 | pub fn ccc(cbdata: CombiningData, cp: u21) u8 { | 19 | pub fn ccc(cp: u21) u8 { |
| 37 | return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)]; | 20 | return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)]; |
| 38 | } | 21 | } |
| 39 | 22 | ||
| 40 | /// True if `cp` is a starter code point, not a combining character. | 23 | /// True if `cp` is a starter code point, not a combining character. |
| 41 | pub fn isStarter(cbdata: CombiningData, cp: u21) bool { | 24 | pub fn isStarter(cp: u21) bool { |
| 42 | return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0; | 25 | return combining_data.s2[combining_data.s1[cp >> 8] + (cp & 0xff)] == 0; |
| 43 | } | 26 | } |
| 44 | 27 | ||
| 45 | const std = @import("std"); | 28 | const std = @import("std"); |
diff --git a/src/CompatData.zig b/src/CompatData.zig index 40ecd12..68d47f2 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig | |||
| @@ -1,57 +1,19 @@ | |||
| 1 | //! Compatibility Data | 1 | //! Compatibility Data |
| 2 | 2 | ||
| 3 | nfkd: [][]u21 = undefined, | 3 | const Data = struct { |
| 4 | cps: []u21 = undefined, | 4 | s1: []const u16 = undefined, |
| 5 | 5 | s2: []const []const u21 = undefined, | |
| 6 | const CompatData = @This(); | 6 | }; |
| 7 | 7 | ||
| 8 | pub fn init(allocator: mem.Allocator) !CompatData { | 8 | const compat_data = compat_data: { |
| 9 | const in_bytes = @embedFile("compat"); | 9 | const data = @import("compat"); |
| 10 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 10 | break :compat_data Data{ |
| 11 | var reader = in_fbs.reader(); | 11 | .s1 = &data.s1, |
| 12 | 12 | .s2 = &data.s2, | |
| 13 | const endian = builtin.cpu.arch.endian(); | ||
| 14 | var cpdata = CompatData{ | ||
| 15 | .nfkd = try allocator.alloc([]u21, 0x110000), | ||
| 16 | }; | 13 | }; |
| 17 | { | 14 | }; |
| 18 | errdefer allocator.free(cpdata.nfkd); | ||
| 19 | cpdata.cps = try allocator.alloc(u21, magic.compat_size); | ||
| 20 | } | ||
| 21 | errdefer cpdata.deinit(allocator); | ||
| 22 | |||
| 23 | @memset(cpdata.nfkd, &.{}); | ||
| 24 | |||
| 25 | var total_len: usize = 0; | ||
| 26 | |||
| 27 | while (true) { | ||
| 28 | const len: u8 = try reader.readInt(u8, endian); | ||
| 29 | if (len == 0) break; | ||
| 30 | const cp = try reader.readInt(u24, endian); | ||
| 31 | const nk_s = cpdata.cps[total_len..][0 .. len - 1]; | ||
| 32 | for (0..len - 1) |i| { | ||
| 33 | nk_s[i] = @intCast(try reader.readInt(u24, endian)); | ||
| 34 | } | ||
| 35 | cpdata.nfkd[cp] = nk_s; | ||
| 36 | total_len += len - 1; | ||
| 37 | } | ||
| 38 | |||
| 39 | if (comptime magic.print) std.debug.print("CompatData magic number: {d}", .{total_len}); | ||
| 40 | |||
| 41 | return cpdata; | ||
| 42 | } | ||
| 43 | |||
| 44 | pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void { | ||
| 45 | allocator.free(cpdata.cps); | ||
| 46 | allocator.free(cpdata.nfkd); | ||
| 47 | } | ||
| 48 | 15 | ||
| 49 | /// Returns compatibility decomposition for `cp`. | 16 | /// Returns compatibility decomposition for `cp`. |
| 50 | pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 { | 17 | pub fn toNfkd(cp: u21) []const u21 { |
| 51 | return cpdata.nfkd[cp]; | 18 | return compat_data.s2[compat_data.s1[cp >> 8] + (cp & 0xff)]; |
| 52 | } | 19 | } |
| 53 | |||
| 54 | const std = @import("std"); | ||
| 55 | const builtin = @import("builtin"); | ||
| 56 | const mem = std.mem; | ||
| 57 | const magic = @import("magic"); | ||
diff --git a/src/HangulData.zig b/src/HangulData.zig index cae8b97..9c17421 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig | |||
| @@ -9,39 +9,23 @@ pub const Syllable = enum { | |||
| 9 | T, | 9 | T, |
| 10 | }; | 10 | }; |
| 11 | 11 | ||
| 12 | s1: []u16 = undefined, | 12 | const Data = struct { |
| 13 | s2: []u3 = undefined, | 13 | s1: []const u16 = undefined, |
| 14 | 14 | s2: []const u3 = undefined, | |
| 15 | const Hangul = @This(); | 15 | }; |
| 16 | |||
| 17 | pub fn init(allocator: mem.Allocator) !Hangul { | ||
| 18 | const in_bytes = @embedFile("hangul"); | ||
| 19 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 20 | var reader = in_fbs.reader(); | ||
| 21 | |||
| 22 | const endian = builtin.cpu.arch.endian(); | ||
| 23 | var hangul = Hangul{}; | ||
| 24 | |||
| 25 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 26 | hangul.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 27 | errdefer allocator.free(hangul.s1); | ||
| 28 | for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian); | ||
| 29 | |||
| 30 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 31 | hangul.s2 = try allocator.alloc(u3, stage_2_len); | ||
| 32 | errdefer allocator.free(hangul.s2); | ||
| 33 | for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 34 | 16 | ||
| 35 | return hangul; | 17 | const hangul = hangul_data: { |
| 36 | } | 18 | const data = @import("hangul"); |
| 19 | break :hangul_data Data{ | ||
| 20 | .s1 = &data.s1, | ||
| 21 | .s2 = &data.s2, | ||
| 22 | }; | ||
| 23 | }; | ||
| 37 | 24 | ||
| 38 | pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void { | 25 | const Hangul = @This(); |
| 39 | allocator.free(hangul.s1); | ||
| 40 | allocator.free(hangul.s2); | ||
| 41 | } | ||
| 42 | 26 | ||
| 43 | /// Returns the Hangul syllable type for `cp`. | 27 | /// Returns the Hangul syllable type for `cp`. |
| 44 | pub fn syllable(hangul: *const Hangul, cp: u21) Syllable { | 28 | pub fn syllable(cp: u21) Syllable { |
| 45 | return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]); | 29 | return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]); |
| 46 | } | 30 | } |
| 47 | 31 | ||
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index 7b53542..cca3556 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig | |||
| @@ -1,48 +1,32 @@ | |||
| 1 | //! Normalization Properties Data | 1 | //! Normalization Properties Data |
| 2 | 2 | ||
| 3 | s1: []u16 = undefined, | 3 | const Data = struct { |
| 4 | s2: []u4 = undefined, | 4 | s1: []const u16 = undefined, |
| 5 | s2: []const u3 = undefined, | ||
| 6 | }; | ||
| 7 | |||
| 8 | const norms = norm_props_data: { | ||
| 9 | const data = @import("normp"); | ||
| 10 | break :norm_props_data Data{ | ||
| 11 | .s1 = &data.s1, | ||
| 12 | .s2 = &data.s2, | ||
| 13 | }; | ||
| 14 | }; | ||
| 5 | 15 | ||
| 6 | const NormProps = @This(); | 16 | const NormProps = @This(); |
| 7 | 17 | ||
| 8 | pub fn init(allocator: mem.Allocator) !NormProps { | ||
| 9 | const in_bytes = @embedFile("normp"); | ||
| 10 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 11 | var reader = in_fbs.reader(); | ||
| 12 | |||
| 13 | const endian = builtin.cpu.arch.endian(); | ||
| 14 | var norms = NormProps{}; | ||
| 15 | |||
| 16 | const stage_1_len: u16 = try reader.readInt(u16, endian); | ||
| 17 | norms.s1 = try allocator.alloc(u16, stage_1_len); | ||
| 18 | errdefer allocator.free(norms.s1); | ||
| 19 | for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian); | ||
| 20 | |||
| 21 | const stage_2_len: u16 = try reader.readInt(u16, endian); | ||
| 22 | norms.s2 = try allocator.alloc(u4, stage_2_len); | ||
| 23 | errdefer allocator.free(norms.s2); | ||
| 24 | for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian)); | ||
| 25 | |||
| 26 | return norms; | ||
| 27 | } | ||
| 28 | |||
| 29 | pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void { | ||
| 30 | allocator.free(norms.s1); | ||
| 31 | allocator.free(norms.s2); | ||
| 32 | } | ||
| 33 | |||
| 34 | /// Returns true if `cp` is already in NFD form. | 18 | /// Returns true if `cp` is already in NFD form. |
| 35 | pub fn isNfd(norms: *const NormProps, cp: u21) bool { | 19 | pub fn isNfd(cp: u21) bool { |
| 36 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; | 20 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; |
| 37 | } | 21 | } |
| 38 | 22 | ||
| 39 | /// Returns true if `cp` is already in NFKD form. | 23 | /// Returns true if `cp` is already in NFKD form. |
| 40 | pub fn isNfkd(norms: *const NormProps, cp: u21) bool { | 24 | pub fn isNfkd(cp: u21) bool { |
| 41 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; | 25 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; |
| 42 | } | 26 | } |
| 43 | 27 | ||
| 44 | /// Returns true if `cp` is not allowed in any normalized form. | 28 | /// Returns true if `cp` is not allowed in any normalized form. |
| 45 | pub fn isFcx(norms: *const NormProps, cp: u21) bool { | 29 | pub fn isFcx(cp: u21) bool { |
| 46 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 30 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; |
| 47 | } | 31 | } |
| 48 | 32 | ||
diff --git a/src/Normalize.zig b/src/Normalize.zig index 4a1bae8..3191a8c 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -3,64 +3,22 @@ | |||
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | canon_data: CanonData = undefined, | 5 | canon_data: CanonData = undefined, |
| 6 | ccc_data: CccData = undefined, | ||
| 7 | compat_data: CompatData = undefined, | ||
| 8 | hangul_data: HangulData = undefined, | ||
| 9 | normp_data: NormPropsData = undefined, | ||
| 10 | 6 | ||
| 11 | const Normalize = @This(); | 7 | const Normalize = @This(); |
| 12 | 8 | ||
| 13 | pub fn init(allocator: Allocator) Allocator.Error!Normalize { | 9 | pub fn init(allocator: Allocator) !Normalize { |
| 14 | var norm: Normalize = undefined; | 10 | var norm: Normalize = undefined; |
| 15 | try norm.setup(allocator); | 11 | try norm.setup(allocator); |
| 16 | return norm; | 12 | return norm; |
| 17 | } | 13 | } |
| 18 | 14 | ||
| 19 | pub fn setup(self: *Normalize, allocator: Allocator) Allocator.Error!void { | 15 | pub fn setup(self: *Normalize, allocator: Allocator) !void { |
| 20 | self.canon_data = CanonData.init(allocator) catch |err| { | 16 | self.canon_data = try CanonData.init(allocator); |
| 21 | switch (err) { | ||
| 22 | error.OutOfMemory => |e| return e, | ||
| 23 | else => unreachable, | ||
| 24 | } | ||
| 25 | }; | ||
| 26 | errdefer self.canon_data.deinit(allocator); | ||
| 27 | self.ccc_data = CccData.init(allocator) catch |err| { | ||
| 28 | switch (err) { | ||
| 29 | error.OutOfMemory => |e| return e, | ||
| 30 | else => unreachable, | ||
| 31 | } | ||
| 32 | }; | ||
| 33 | errdefer self.ccc_data.deinit(allocator); | ||
| 34 | self.compat_data = CompatData.init(allocator) catch |err| { | ||
| 35 | switch (err) { | ||
| 36 | error.OutOfMemory => |e| return e, | ||
| 37 | else => unreachable, | ||
| 38 | } | ||
| 39 | }; | ||
| 40 | errdefer self.compat_data.deinit(allocator); | ||
| 41 | self.hangul_data = HangulData.init(allocator) catch |err| { | ||
| 42 | switch (err) { | ||
| 43 | error.OutOfMemory => |e| return e, | ||
| 44 | else => unreachable, | ||
| 45 | } | ||
| 46 | }; | ||
| 47 | errdefer self.hangul_data.deinit(allocator); | ||
| 48 | self.normp_data = NormPropsData.init(allocator) catch |err| { | ||
| 49 | switch (err) { | ||
| 50 | error.OutOfMemory => |e| return e, | ||
| 51 | else => unreachable, | ||
| 52 | } | ||
| 53 | }; | ||
| 54 | } | 17 | } |
| 55 | 18 | ||
| 56 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { | 19 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { |
| 57 | // Reasonably safe (?) | 20 | const mut_norm = @constCast(norm); |
| 58 | var mut_norm = @constCast(norm); | ||
| 59 | mut_norm.canon_data.deinit(allocator); | 21 | mut_norm.canon_data.deinit(allocator); |
| 60 | mut_norm.ccc_data.deinit(allocator); | ||
| 61 | mut_norm.compat_data.deinit(allocator); | ||
| 62 | mut_norm.hangul_data.deinit(allocator); | ||
| 63 | mut_norm.normp_data.deinit(allocator); | ||
| 64 | } | 22 | } |
| 65 | 23 | ||
| 66 | const SBase: u21 = 0xAC00; | 24 | const SBase: u21 = 0xAC00; |
| @@ -73,8 +31,8 @@ const TCount: u21 = 28; | |||
| 73 | const NCount: u21 = 588; // VCount * TCount | 31 | const NCount: u21 = 588; // VCount * TCount |
| 74 | const SCount: u21 = 11172; // LCount * NCount | 32 | const SCount: u21 = 11172; // LCount * NCount |
| 75 | 33 | ||
| 76 | fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { | 34 | fn decomposeHangul(cp: u21, buf: []u21) ?Decomp { |
| 77 | const kind = self.hangul_data.syllable(cp); | 35 | const kind = HangulData.syllable(cp); |
| 78 | if (kind != .LV and kind != .LVT) return null; | 36 | if (kind != .LV and kind != .LVT) return null; |
| 79 | 37 | ||
| 80 | const SIndex: u21 = cp - SBase; | 38 | const SIndex: u21 = cp - SBase; |
| @@ -143,7 +101,7 @@ fn mapping(self: Normalize, cp: u21, form: Form) Decomp { | |||
| 143 | }, | 101 | }, |
| 144 | 102 | ||
| 145 | .nfkd => { | 103 | .nfkd => { |
| 146 | dc.cps = self.compat_data.toNfkd(cp); | 104 | dc.cps = CompatData.toNfkd(cp); |
| 147 | if (dc.cps.len != 0) { | 105 | if (dc.cps.len != 0) { |
| 148 | dc.form = .nfkd; | 106 | dc.form = .nfkd; |
| 149 | } else { | 107 | } else { |
| @@ -170,13 +128,13 @@ fn decompose( | |||
| 170 | 128 | ||
| 171 | // NFD / NFKD quick checks. | 129 | // NFD / NFKD quick checks. |
| 172 | switch (form) { | 130 | switch (form) { |
| 173 | .nfd => if (self.normp_data.isNfd(cp)) return .{}, | 131 | .nfd => if (NormPropsData.isNfd(cp)) return .{}, |
| 174 | .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, | 132 | .nfkd => if (NormPropsData.isNfkd(cp)) return .{}, |
| 175 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), | 133 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), |
| 176 | } | 134 | } |
| 177 | 135 | ||
| 178 | // Hangul precomposed syllable full decomposition. | 136 | // Hangul precomposed syllable full decomposition. |
| 179 | if (self.decomposeHangul(cp, buf)) |dc| return dc; | 137 | if (decomposeHangul(cp, buf)) |dc| return dc; |
| 180 | 138 | ||
| 181 | // Full decomposition. | 139 | // Full decomposition. |
| 182 | var dc = Decomp{ .form = form }; | 140 | var dc = Decomp{ .form = form }; |
| @@ -218,9 +176,8 @@ fn decompose( | |||
| 218 | 176 | ||
| 219 | test "decompose" { | 177 | test "decompose" { |
| 220 | const allocator = testing.allocator; | 178 | const allocator = testing.allocator; |
| 221 | const n = try Normalize.init(allocator); | 179 | var n = try Normalize.init(allocator); |
| 222 | defer n.deinit(allocator); | 180 | defer n.deinit(allocator); |
| 223 | |||
| 224 | var buf: [18]u21 = undefined; | 181 | var buf: [18]u21 = undefined; |
| 225 | 182 | ||
| 226 | var dc = n.decompose('é', .nfd, &buf); | 183 | var dc = n.decompose('é', .nfd, &buf); |
| @@ -280,17 +237,17 @@ pub const Result = struct { | |||
| 280 | }; | 237 | }; |
| 281 | 238 | ||
| 282 | // Compares code points by Canonical Combining Class order. | 239 | // Compares code points by Canonical Combining Class order. |
| 283 | fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { | 240 | fn cccLess(_: void, lhs: u21, rhs: u21) bool { |
| 284 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); | 241 | return CombiningData.ccc(lhs) < CombiningData.ccc(rhs); |
| 285 | } | 242 | } |
| 286 | 243 | ||
| 287 | // Applies the Canonical Sorting Algorithm. | 244 | // Applies the Canonical Sorting Algorithm. |
| 288 | fn canonicalSort(self: Normalize, cps: []u21) void { | 245 | fn canonicalSort(cps: []u21) void { |
| 289 | var i: usize = 0; | 246 | var i: usize = 0; |
| 290 | while (i < cps.len) : (i += 1) { | 247 | while (i < cps.len) : (i += 1) { |
| 291 | const start: usize = i; | 248 | const start: usize = i; |
| 292 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 249 | while (i < cps.len and CombiningData.ccc(cps[i]) != 0) : (i += 1) {} |
| 293 | mem.sort(u21, cps[start..i], self, cccLess); | 250 | mem.sort(u21, cps[start..i], {}, cccLess); |
| 294 | } | 251 | } |
| 295 | } | 252 | } |
| 296 | 253 | ||
| @@ -320,7 +277,7 @@ pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, fo | |||
| 320 | } | 277 | } |
| 321 | } | 278 | } |
| 322 | 279 | ||
| 323 | self.canonicalSort(dcp_list.items); | 280 | canonicalSort(dcp_list.items); |
| 324 | 281 | ||
| 325 | return try dcp_list.toOwnedSlice(); | 282 | return try dcp_list.toOwnedSlice(); |
| 326 | } | 283 | } |
| @@ -346,7 +303,7 @@ fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 346 | 303 | ||
| 347 | test "nfd ASCII / no-alloc" { | 304 | test "nfd ASCII / no-alloc" { |
| 348 | const allocator = testing.allocator; | 305 | const allocator = testing.allocator; |
| 349 | const n = try Normalize.init(allocator); | 306 | var n = try Normalize.init(allocator); |
| 350 | defer n.deinit(allocator); | 307 | defer n.deinit(allocator); |
| 351 | 308 | ||
| 352 | const result = try n.nfd(allocator, "Hello World!"); | 309 | const result = try n.nfd(allocator, "Hello World!"); |
| @@ -357,7 +314,7 @@ test "nfd ASCII / no-alloc" { | |||
| 357 | 314 | ||
| 358 | test "nfd !ASCII / alloc" { | 315 | test "nfd !ASCII / alloc" { |
| 359 | const allocator = testing.allocator; | 316 | const allocator = testing.allocator; |
| 360 | const n = try Normalize.init(allocator); | 317 | var n = try Normalize.init(allocator); |
| 361 | defer n.deinit(allocator); | 318 | defer n.deinit(allocator); |
| 362 | 319 | ||
| 363 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 320 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| @@ -368,7 +325,7 @@ test "nfd !ASCII / alloc" { | |||
| 368 | 325 | ||
| 369 | test "nfkd ASCII / no-alloc" { | 326 | test "nfkd ASCII / no-alloc" { |
| 370 | const allocator = testing.allocator; | 327 | const allocator = testing.allocator; |
| 371 | const n = try Normalize.init(allocator); | 328 | var n = try Normalize.init(allocator); |
| 372 | defer n.deinit(allocator); | 329 | defer n.deinit(allocator); |
| 373 | 330 | ||
| 374 | const result = try n.nfkd(allocator, "Hello World!"); | 331 | const result = try n.nfkd(allocator, "Hello World!"); |
| @@ -379,7 +336,7 @@ test "nfkd ASCII / no-alloc" { | |||
| 379 | 336 | ||
| 380 | test "nfkd !ASCII / alloc" { | 337 | test "nfkd !ASCII / alloc" { |
| 381 | const allocator = testing.allocator; | 338 | const allocator = testing.allocator; |
| 382 | const n = try Normalize.init(allocator); | 339 | var n = try Normalize.init(allocator); |
| 383 | defer n.deinit(allocator); | 340 | defer n.deinit(allocator); |
| 384 | 341 | ||
| 385 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 342 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| @@ -408,7 +365,7 @@ pub fn nfdCodePoints( | |||
| 408 | } | 365 | } |
| 409 | } | 366 | } |
| 410 | 367 | ||
| 411 | self.canonicalSort(dcp_list.items); | 368 | canonicalSort(dcp_list.items); |
| 412 | 369 | ||
| 413 | return try dcp_list.toOwnedSlice(); | 370 | return try dcp_list.toOwnedSlice(); |
| 414 | } | 371 | } |
| @@ -433,15 +390,15 @@ pub fn nfkdCodePoints( | |||
| 433 | } | 390 | } |
| 434 | } | 391 | } |
| 435 | 392 | ||
| 436 | self.canonicalSort(dcp_list.items); | 393 | canonicalSort(dcp_list.items); |
| 437 | 394 | ||
| 438 | return try dcp_list.toOwnedSlice(); | 395 | return try dcp_list.toOwnedSlice(); |
| 439 | } | 396 | } |
| 440 | 397 | ||
| 441 | // Composition (NFC, NFKC) | 398 | // Composition (NFC, NFKC) |
| 442 | 399 | ||
| 443 | fn isHangul(self: Normalize, cp: u21) bool { | 400 | fn isHangul(cp: u21) bool { |
| 444 | return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; | 401 | return cp >= 0x1100 and HangulData.syllable(cp) != .none; |
| 445 | } | 402 | } |
| 446 | 403 | ||
| 447 | /// Normalizes `str` to NFC. | 404 | /// Normalizes `str` to NFC. |
| @@ -479,7 +436,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 479 | block_check: while (i < dcps.len) : (i += 1) { | 436 | block_check: while (i < dcps.len) : (i += 1) { |
| 480 | const C = dcps[i]; | 437 | const C = dcps[i]; |
| 481 | if (C == tombstone) continue :block_check; | 438 | if (C == tombstone) continue :block_check; |
| 482 | const cc_C = self.ccc_data.ccc(C); | 439 | const cc_C = CombiningData.ccc(C); |
| 483 | var starter_index: ?usize = null; | 440 | var starter_index: ?usize = null; |
| 484 | var j: usize = i; | 441 | var j: usize = i; |
| 485 | 442 | ||
| @@ -489,12 +446,12 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 489 | if (dcps[j] == tombstone) continue; | 446 | if (dcps[j] == tombstone) continue; |
| 490 | 447 | ||
| 491 | // Check for starter. | 448 | // Check for starter. |
| 492 | if (self.ccc_data.isStarter(dcps[j])) { | 449 | if (CombiningData.isStarter(dcps[j])) { |
| 493 | // Check for blocking conditions. | 450 | // Check for blocking conditions. |
| 494 | for (dcps[(j + 1)..i]) |B| { | 451 | for (dcps[(j + 1)..i]) |B| { |
| 495 | if (B == tombstone) continue; | 452 | if (B == tombstone) continue; |
| 496 | const cc_B = self.ccc_data.ccc(B); | 453 | const cc_B = CombiningData.ccc(B); |
| 497 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; | 454 | if (cc_B != 0 and isHangul(C)) continue :block_check; |
| 498 | if (cc_B >= cc_C) continue :block_check; | 455 | if (cc_B >= cc_C) continue :block_check; |
| 499 | } | 456 | } |
| 500 | 457 | ||
| @@ -515,10 +472,10 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 515 | 472 | ||
| 516 | // If L and C are Hangul syllables, we can compose | 473 | // If L and C are Hangul syllables, we can compose |
| 517 | // them algorithmically if possible. | 474 | // them algorithmically if possible. |
| 518 | if (self.isHangul(L) and self.isHangul(C)) { | 475 | if (isHangul(L) and isHangul(C)) { |
| 519 | // Get Hangul syllable types. | 476 | // Get Hangul syllable types. |
| 520 | const l_stype = self.hangul_data.syllable(L); | 477 | const l_stype = HangulData.syllable(L); |
| 521 | const c_stype = self.hangul_data.syllable(C); | 478 | const c_stype = HangulData.syllable(C); |
| 522 | 479 | ||
| 523 | if (l_stype == .LV and c_stype == .T) { | 480 | if (l_stype == .LV and c_stype == .T) { |
| 524 | // LV, T canonical composition. | 481 | // LV, T canonical composition. |
| @@ -547,7 +504,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 547 | // Composition Exclusions (FCX) list, | 504 | // Composition Exclusions (FCX) list, |
| 548 | // preventing it from appearing in any | 505 | // preventing it from appearing in any |
| 549 | // composed form (NFC, NFKC). | 506 | // composed form (NFC, NFKC). |
| 550 | if (!self.normp_data.isFcx(P)) { | 507 | if (!NormPropsData.isFcx(P)) { |
| 551 | dcps[sidx] = P; | 508 | dcps[sidx] = P; |
| 552 | dcps[i] = tombstone; // Mark for deletion. | 509 | dcps[i] = tombstone; // Mark for deletion. |
| 553 | deleted += 1; | 510 | deleted += 1; |
| @@ -577,7 +534,7 @@ fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allo | |||
| 577 | 534 | ||
| 578 | test "nfc" { | 535 | test "nfc" { |
| 579 | const allocator = testing.allocator; | 536 | const allocator = testing.allocator; |
| 580 | const n = try Normalize.init(allocator); | 537 | var n = try Normalize.init(allocator); |
| 581 | defer n.deinit(allocator); | 538 | defer n.deinit(allocator); |
| 582 | 539 | ||
| 583 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 540 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| @@ -588,7 +545,7 @@ test "nfc" { | |||
| 588 | 545 | ||
| 589 | test "nfkc" { | 546 | test "nfkc" { |
| 590 | const allocator = testing.allocator; | 547 | const allocator = testing.allocator; |
| 591 | const n = try Normalize.init(allocator); | 548 | var n = try Normalize.init(allocator); |
| 592 | defer n.deinit(allocator); | 549 | defer n.deinit(allocator); |
| 593 | 550 | ||
| 594 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 551 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| @@ -609,7 +566,7 @@ pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) | |||
| 609 | 566 | ||
| 610 | test "eql" { | 567 | test "eql" { |
| 611 | const allocator = testing.allocator; | 568 | const allocator = testing.allocator; |
| 612 | const n = try Normalize.init(allocator); | 569 | var n = try Normalize.init(allocator); |
| 613 | defer n.deinit(allocator); | 570 | defer n.deinit(allocator); |
| 614 | 571 | ||
| 615 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 572 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| @@ -666,13 +623,13 @@ const mem = std.mem; | |||
| 666 | const simd = std.simd; | 623 | const simd = std.simd; |
| 667 | const testing = std.testing; | 624 | const testing = std.testing; |
| 668 | const unicode = std.unicode; | 625 | const unicode = std.unicode; |
| 669 | const Allocator = std.mem.Allocator; | 626 | const Allocator = mem.Allocator; |
| 670 | 627 | ||
| 671 | const ascii = @import("ascii"); | 628 | const ascii = @import("ascii"); |
| 672 | const CodePointIterator = @import("code_point").Iterator; | 629 | const CodePointIterator = @import("code_point").Iterator; |
| 673 | 630 | ||
| 674 | const CanonData = @import("CanonData"); | 631 | const CanonData = @import("CanonData"); |
| 675 | const CccData = @import("CombiningData"); | 632 | const CombiningData = @import("CombiningData"); |
| 676 | const CompatData = @import("CompatData"); | 633 | const CompatData = @import("CompatData"); |
| 677 | const HangulData = @import("HangulData"); | 634 | const HangulData = @import("HangulData"); |
| 678 | const NormPropsData = @import("NormPropsData"); | 635 | const NormPropsData = @import("NormPropsData"); |
diff --git a/src/comptime_map.zig b/src/comptime_map.zig new file mode 100644 index 0000000..b2c4d11 --- /dev/null +++ b/src/comptime_map.zig | |||
| @@ -0,0 +1,176 @@ | |||
| 1 | //! A copypasta of Vexu's comptime hashmap: | ||
| 2 | //! | ||
| 3 | //! https://github.com/Vexu/comptime_hash_map | ||
| 4 | //! | ||
| 5 | //! (License at base of file, thanks Vexu!) | ||
| 6 | //! | ||
| 7 | |||
| 8 | /// A comptime hashmap constructed with automatically selected hash and eql functions. | ||
| 9 | pub fn AutoComptimeHashMap(comptime K: type, comptime V: type, comptime values: anytype) type { | ||
| 10 | return ComptimeHashMap(K, V, hash_map.AutoContext(K), values); | ||
| 11 | } | ||
| 12 | |||
| 13 | /// Builtin hashmap for strings as keys. | ||
| 14 | pub fn ComptimeStringHashMap(comptime V: type, comptime values: anytype) type { | ||
| 15 | return ComptimeHashMap([]const u8, V, hash_map.StringContext, values); | ||
| 16 | } | ||
| 17 | |||
| 18 | /// A hashmap which is constructed at compile time from constant values. | ||
| 19 | /// Intended to be used as a faster lookup table. | ||
| 20 | pub fn ComptimeHashMap(comptime K: type, comptime V: type, comptime ctx: type, comptime values: anytype) type { | ||
| 21 | std.debug.assert(values.len != 0); | ||
| 22 | @setEvalBranchQuota(1000 * values.len); | ||
| 23 | |||
| 24 | const Entry = struct { | ||
| 25 | key: K = undefined, | ||
| 26 | val: V = undefined, | ||
| 27 | used: bool = false, | ||
| 28 | }; | ||
| 29 | |||
| 30 | // ensure that the hash map will be at most 60% full | ||
| 31 | const size = if (@import("builtin").is_test) values.len else math.ceilPowerOfTwo(usize, values.len * 5 / 3) catch unreachable; | ||
| 32 | comptime var slots = [1]Entry{.{}} ** size; | ||
| 33 | comptime var distance: [size]usize = .{0} ** size; | ||
| 34 | |||
| 35 | comptime var max_distance_from_start_index = 0; | ||
| 36 | |||
| 37 | slot_loop: for (values) |kv| { | ||
| 38 | var key: K = kv.@"0"; | ||
| 39 | var value: V = kv.@"1"; | ||
| 40 | |||
| 41 | const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1); | ||
| 42 | |||
| 43 | var roll_over = 0; | ||
| 44 | var distance_from_start_index = 0; | ||
| 45 | while (roll_over < size) : ({ | ||
| 46 | roll_over += 1; | ||
| 47 | distance_from_start_index += 1; | ||
| 48 | }) { | ||
| 49 | const index = (start_index + roll_over) & (size - 1); | ||
| 50 | const entry = &slots[index]; | ||
| 51 | |||
| 52 | if (entry.used and !ctx.eql(undefined, entry.key, key)) { | ||
| 53 | if (distance[index] < distance_from_start_index) { | ||
| 54 | // robin hood to the rescue | ||
| 55 | const tmp = slots[index]; | ||
| 56 | max_distance_from_start_index = @max(max_distance_from_start_index, distance_from_start_index); | ||
| 57 | entry.* = .{ | ||
| 58 | .used = true, | ||
| 59 | .key = key, | ||
| 60 | .val = value, | ||
| 61 | }; | ||
| 62 | const tmp_distance = distance[index]; | ||
| 63 | distance[index] = distance_from_start_index; | ||
| 64 | key = tmp.key; | ||
| 65 | value = tmp.val; | ||
| 66 | distance_from_start_index = tmp_distance; | ||
| 67 | } | ||
| 68 | continue; | ||
| 69 | } | ||
| 70 | |||
| 71 | max_distance_from_start_index = @max(distance_from_start_index, max_distance_from_start_index); | ||
| 72 | entry.* = .{ | ||
| 73 | .used = true, | ||
| 74 | .key = key, | ||
| 75 | .val = value, | ||
| 76 | }; | ||
| 77 | distance[index] = distance_from_start_index; | ||
| 78 | continue :slot_loop; | ||
| 79 | } | ||
| 80 | unreachable; // put into a full map | ||
| 81 | } | ||
| 82 | |||
| 83 | return struct { | ||
| 84 | const entries = slots; | ||
| 85 | |||
| 86 | pub fn has(key: K) bool { | ||
| 87 | return get(key) != null; | ||
| 88 | } | ||
| 89 | |||
| 90 | pub fn get(key: K) ?*const V { | ||
| 91 | const start_index = @as(usize, ctx.hash(undefined, key)) & (size - 1); | ||
| 92 | { | ||
| 93 | var roll_over: usize = 0; | ||
| 94 | while (roll_over <= max_distance_from_start_index) : (roll_over += 1) { | ||
| 95 | const index = (start_index + roll_over) & (size - 1); | ||
| 96 | const entry = &entries[index]; | ||
| 97 | |||
| 98 | if (!entry.used) return null; | ||
| 99 | if (ctx.eql(undefined, entry.key, key)) return &entry.val; | ||
| 100 | } | ||
| 101 | } | ||
| 102 | return null; | ||
| 103 | } | ||
| 104 | }; | ||
| 105 | } | ||
| 106 | |||
| 107 | test "basic usage" { | ||
| 108 | const map = ComptimeStringHashMap(usize, .{ | ||
| 109 | .{ "foo", 1 }, | ||
| 110 | .{ "bar", 2 }, | ||
| 111 | .{ "baz", 3 }, | ||
| 112 | .{ "quux", 4 }, | ||
| 113 | .{ "Foo", 1 }, | ||
| 114 | .{ "Bar", 2 }, | ||
| 115 | .{ "Baz", 3 }, | ||
| 116 | .{ "Quux", 4 }, | ||
| 117 | }); | ||
| 118 | |||
| 119 | try testing.expect(map.has("foo")); | ||
| 120 | try testing.expect(map.has("bar")); | ||
| 121 | try testing.expect(map.has("Foo")); | ||
| 122 | try testing.expect(map.has("Bar")); | ||
| 123 | try testing.expect(!map.has("zig")); | ||
| 124 | try testing.expect(!map.has("ziguana")); | ||
| 125 | |||
| 126 | try testing.expect(map.get("baz").?.* == 3); | ||
| 127 | try testing.expect(map.get("quux").?.* == 4); | ||
| 128 | try testing.expect(map.get("nah") == null); | ||
| 129 | try testing.expect(map.get("...") == null); | ||
| 130 | } | ||
| 131 | |||
| 132 | test "auto comptime hash map" { | ||
| 133 | const map = AutoComptimeHashMap(usize, []const u8, .{ | ||
| 134 | .{ 1, "foo" }, | ||
| 135 | .{ 2, "bar" }, | ||
| 136 | .{ 3, "baz" }, | ||
| 137 | .{ 45, "quux" }, | ||
| 138 | }); | ||
| 139 | |||
| 140 | try testing.expect(map.has(1)); | ||
| 141 | try testing.expect(map.has(2)); | ||
| 142 | try testing.expect(!map.has(4)); | ||
| 143 | try testing.expect(!map.has(1_000_000)); | ||
| 144 | |||
| 145 | try testing.expectEqualStrings("foo", map.get(1).?.*); | ||
| 146 | try testing.expectEqualStrings("bar", map.get(2).?.*); | ||
| 147 | try testing.expect(map.get(4) == null); | ||
| 148 | try testing.expect(map.get(4_000_000) == null); | ||
| 149 | } | ||
| 150 | |||
| 151 | const std = @import("std"); | ||
| 152 | const hash_map = std.hash_map; | ||
| 153 | const testing = std.testing; | ||
| 154 | const math = std.math; | ||
| 155 | |||
| 156 | // MIT License | ||
| 157 | // | ||
| 158 | // Copyright (c) 2020 Veikka Tuominen | ||
| 159 | // | ||
| 160 | // Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 161 | // of this software and associated documentation files (the "Software"), to deal | ||
| 162 | // in the Software without restriction, including without limitation the rights | ||
| 163 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 164 | // copies of the Software, and to permit persons to whom the Software is | ||
| 165 | // furnished to do so, subject to the following conditions: | ||
| 166 | // | ||
| 167 | // The above copyright notice and this permission notice shall be included in all | ||
| 168 | // copies or substantial portions of the Software. | ||
| 169 | // | ||
| 170 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 171 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 172 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 173 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 174 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 175 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 176 | // SOFTWARE. | ||