summaryrefslogtreecommitdiff
path: root/src/CanonData.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/CanonData.zig')
-rw-r--r--src/CanonData.zig87
1 files changed, 43 insertions, 44 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig
index cf9dc8a..c972534 100644
--- a/src/CanonData.zig
+++ b/src/CanonData.zig
@@ -1,78 +1,77 @@
1//! Canonicalization Data 1//! Canonicalization Data
2 2
3s1: []const u16 = undefined,
4s2: []const @import("canon").Canonicalization = undefined,
3nfc: std.AutoHashMapUnmanaged([2]u21, u21), 5nfc: std.AutoHashMapUnmanaged([2]u21, u21),
4nfd: [][]u21 = undefined,
5cps: []u21 = undefined,
6 6
7const CanonData = @This(); 7const CanonData = @This();
8 8
9pub fn init(allocator: mem.Allocator) !CanonData { 9// There's a bug here, which is down to how static u21 vs. runtime are handled,
10 const in_bytes = @embedFile("canon"); 10// the "unique representation" claim is not working out. So we do this:
11 var in_fbs = std.io.fixedBufferStream(in_bytes);
12 var reader = in_fbs.reader();
13 11
14 const endian = builtin.cpu.arch.endian(); 12const Context = struct {
15 var cdata = CanonData{ 13 pub fn hash(_: Context, cps: [2]u21) u64 {
16 .nfc = .empty, 14 const cp_44: u64 = (@as(u64, cps[0]) << 22) + cps[1];
17 .nfd = try allocator.alloc([]u21, 0x110000), 15 return std.hash.int(cp_44);
18 };
19 {
20 errdefer allocator.free(cdata.nfd);
21 cdata.cps = try allocator.alloc(u21, magic.canon_size);
22 } 16 }
23 17
24 var total_cp: u24 = undefined; 18 pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool {
25 19 return cps1[0] == cps2[0] and cps1[1] == cps2[1];
26 errdefer {
27 cdata.nfc.deinit(allocator);
28 allocator.free(cdata.cps);
29 allocator.free(cdata.nfd);
30 } 20 }
21};
31 22
32 @memset(cdata.nfd, &.{}); 23const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);
33 24
34 var total_len: usize = 0; 25pub fn init(allocator: mem.Allocator) !CanonData {
26 var cdata = CanonData{
27 .nfc = .empty,
28 };
29 errdefer cdata.deinit(allocator);
35 30
36 while (true) { 31 const data = @import("canon");
37 const len: u8 = try reader.readInt(u8, endian); 32 cdata.s1 = &data.s1;
38 if (len == 0) break; 33 cdata.s2 = &data.s2;
39 const cp = try reader.readInt(u24, endian); 34 var count: usize = 0;
40 total_cp = cp; 35 for (data.composite) |cp| {
41 const nfd_cp = cdata.cps[total_len..][0 .. len - 1]; 36 count += 1;
42 for (0..len - 1) |i| { 37 const cps = cdata.toNfd(cp);
43 nfd_cp[i] = @intCast(try reader.readInt(u24, endian)); 38 std.debug.assert(cps.len == 2);
44 } 39 try cdata.nfc.put(allocator, cps[0..2].*, cp);
45 if (len == 3) {
46 try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp));
47 }
48 cdata.nfd[cp] = nfd_cp;
49 total_len += len - 1;
50 } 40 }
51 41
52 if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len}); 42 // var keys = cdata.nfc.keyIterator();
43 // while (keys.next()) |key| {
44 // const c32: [2]u32 = .{ key[0], key[1] };
45 // if (c_map.get(c32)) |_| {
46 // std.debug.print("got", .{});
47 // }
48 // }
53 49
54 return cdata; 50 return cdata;
55} 51}
56 52
57pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { 53pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
58 cdata.nfc.deinit(allocator); 54 cdata.nfc.deinit(allocator);
59 allocator.free(cdata.cps);
60 allocator.free(cdata.nfd);
61} 55}
62 56
63/// Returns canonical decomposition for `cp`. 57/// Returns canonical decomposition for `cp`.
64pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { 58pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
65 return cdata.nfd[cp]; 59 const canon = &cdata.s2[cdata.s1[cp >> 8] + (cp & 0xff)];
60 return canon.cps[0..canon.len];
66} 61}
67 62
68// Returns the primary composite for the codepoints in `cp`. 63// Returns the primary composite for the codepoints in `cp`.
69pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { 64pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
70 return cdata.nfc.get(cps); 65 _ = cdata;
66 if (c_map.get(cps)) |cpp| {
67 return cpp.*;
68 } else {
69 return null;
70 }
71 unreachable;
71} 72}
72 73
73const std = @import("std"); 74const std = @import("std");
74const builtin = @import("builtin"); 75const builtin = @import("builtin");
75const compress = std.compress;
76const mem = std.mem; 76const mem = std.mem;
77const magic = @import("magic"); 77const comptime_map = @import("comptime_map.zig");
78const options = @import("options");