diff options
| author | 2026-02-05 07:07:40 -0500 | |
|---|---|---|
| committer | 2026-02-05 07:07:40 -0500 | |
| commit | 95f9487f6a7bde2d7266399bdf6843b97cc1b301 (patch) | |
| tree | 122cd20fa574861e807844974b49eb2f91285d3c /src/CanonData.zig | |
| parent | Teasing out canonicalization (diff) | |
| download | zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.gz zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.tar.xz zg-95f9487f6a7bde2d7266399bdf6843b97cc1b301.zip | |
Base units do not allocate
CanonData included. I may still sort out caseless matching without
allocation, but that's a stretch goal.
Closes #86
Closes #85
Diffstat (limited to 'src/CanonData.zig')
| -rw-r--r-- | src/CanonData.zig | 57 |
1 files changed, 17 insertions, 40 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig index c972534..5c1ffa6 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig | |||
| @@ -1,13 +1,23 @@ | |||
| 1 | //! Canonicalization Data | 1 | //! Canonicalization Data |
| 2 | 2 | ||
| 3 | s1: []const u16 = undefined, | 3 | const Data = struct { |
| 4 | s2: []const @import("canon").Canonicalization = undefined, | 4 | s1: []const u16 = undefined, |
| 5 | nfc: std.AutoHashMapUnmanaged([2]u21, u21), | 5 | s2: []const @import("canon").Canonicalization = undefined, |
| 6 | }; | ||
| 7 | |||
| 8 | const canon_data = canon_data: { | ||
| 9 | const canon_ = @import("canon"); | ||
| 10 | break :canon_data Data{ | ||
| 11 | .s1 = &canon_.s1, | ||
| 12 | .s2 = &canon_.s2, | ||
| 13 | }; | ||
| 14 | }; | ||
| 6 | 15 | ||
| 7 | const CanonData = @This(); | 16 | const CanonData = @This(); |
| 8 | 17 | ||
| 9 | // There's a bug here, which is down to how static u21 vs. runtime are handled, | 18 | // There's a bug here, which is down to how static u21 vs. runtime are handled, |
| 10 | // the "unique representation" claim is not working out. So we do this: | 19 | // the "unique representation" claim is not working out. AutoHash casts to bytes, |
| 20 | // and that won't fly. So we do this: | ||
| 11 | 21 | ||
| 12 | const Context = struct { | 22 | const Context = struct { |
| 13 | pub fn hash(_: Context, cps: [2]u21) u64 { | 23 | pub fn hash(_: Context, cps: [2]u21) u64 { |
| @@ -22,47 +32,14 @@ const Context = struct { | |||
| 22 | 32 | ||
| 23 | const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map); | 33 | const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map); |
| 24 | 34 | ||
| 25 | pub fn init(allocator: mem.Allocator) !CanonData { | ||
| 26 | var cdata = CanonData{ | ||
| 27 | .nfc = .empty, | ||
| 28 | }; | ||
| 29 | errdefer cdata.deinit(allocator); | ||
| 30 | |||
| 31 | const data = @import("canon"); | ||
| 32 | cdata.s1 = &data.s1; | ||
| 33 | cdata.s2 = &data.s2; | ||
| 34 | var count: usize = 0; | ||
| 35 | for (data.composite) |cp| { | ||
| 36 | count += 1; | ||
| 37 | const cps = cdata.toNfd(cp); | ||
| 38 | std.debug.assert(cps.len == 2); | ||
| 39 | try cdata.nfc.put(allocator, cps[0..2].*, cp); | ||
| 40 | } | ||
| 41 | |||
| 42 | // var keys = cdata.nfc.keyIterator(); | ||
| 43 | // while (keys.next()) |key| { | ||
| 44 | // const c32: [2]u32 = .{ key[0], key[1] }; | ||
| 45 | // if (c_map.get(c32)) |_| { | ||
| 46 | // std.debug.print("got", .{}); | ||
| 47 | // } | ||
| 48 | // } | ||
| 49 | |||
| 50 | return cdata; | ||
| 51 | } | ||
| 52 | |||
| 53 | pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { | ||
| 54 | cdata.nfc.deinit(allocator); | ||
| 55 | } | ||
| 56 | |||
| 57 | /// Returns canonical decomposition for `cp`. | 35 | /// Returns canonical decomposition for `cp`. |
| 58 | pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { | 36 | pub fn toNfd(cp: u21) []const u21 { |
| 59 | const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)]; | 37 | const canon = &canon_data.s2[canon_data.s1[cp >> 8] + (cp & 0xff)]; |
| 60 | return canon.cps[0..canon.len]; | 38 | return canon.cps[0..canon.len]; |
| 61 | } | 39 | } |
| 62 | 40 | ||
| 63 | // Returns the primary composite for the codepoints in `cp`. | 41 | // Returns the primary composite for the codepoints in `cp`. |
| 64 | pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { | 42 | pub fn toNfc(cps: [2]u21) ?u21 { |
| 65 | _ = cdata; | ||
| 66 | if (c_map.get(cps)) |cpp| { | 43 | if (c_map.get(cps)) |cpp| { |
| 67 | return cpp.*; | 44 | return cpp.*; |
| 68 | } else { | 45 | } else { |