summaryrefslogtreecommitdiff
path: root/src/CanonData.zig
blob: c9725349926e9ded44ff077f4683139850987a5c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
//! Canonicalization Data

/// First-stage lookup table; `toNfd` indexes it with `cp >> 8`. Set by `init`.
s1: []const u16 = undefined,
/// Second-stage table of decomposition entries; `toNfd` indexes it with the
/// `s1` value plus the low 8 bits of the code point. Set by `init`.
s2: []const @import("canon").Canonicalization = undefined,
/// Map from a two-code-point decomposition to its composite, populated by `init`.
/// Note that `toNfc` looks pairs up in the file-level `c_map`, not in this map.
nfc: std.AutoHashMapUnmanaged([2]u21, u21),

const CanonData = @This();

// There's a bug here, which is down to how static vs. runtime u21 values are
// handled: the "unique representation" claim is not working out.  So we use a
// custom hash context instead:

/// Hash-map context for keys made of two `u21` code points.
const Context = struct {
    /// Hashes the pair by packing both code points into one `u64`
    /// (first code point shifted left by 22 bits, second in the low bits)
    /// and running the result through `std.hash.int`.
    pub fn hash(_: Context, cps: [2]u21) u64 {
        const high: u64 = @as(u64, cps[0]) << 22;
        // The low 22 bits of `high` are zero and a u21 fits in them, so OR-ing
        // yields the same value as adding.
        const packed_pair: u64 = high | cps[1];
        return std.hash.int(packed_pair);
    }

    /// Returns true when both pairs hold the same code points in the same order.
    pub fn eql(_: Context, cps1: [2]u21, cps2: [2]u21) bool {
        if (cps1[0] != cps2[0]) return false;
        return cps1[1] == cps2[1];
    }
};

// Lookup table from a pair of code points to a single code point. It is
// instantiated from the project's `comptime_map.ComptimeHashMap` with `Context`
// supplying hash/equality and the `c_map` data exported by the `canon` module.
// `toNfc` queries this table.
// NOTE(review): presumably built at compile time, as the name suggests — confirm
// in comptime_map.zig.
const c_map = comptime_map.ComptimeHashMap([2]u21, u21, Context, @import("canon").c_map);

/// Builds a `CanonData` backed by the tables exported from the `canon` module.
///
/// `s1` and `s2` are pointed at the `canon` tables, and `nfc` is filled with one
/// entry per code point in `canon.composite`, keyed by that code point's
/// two-element decomposition as returned by `toNfd`.
///
/// Returns an error if `allocator` cannot provide memory for the `nfc` map.
/// On success the caller releases the result with `deinit`, passing the same
/// allocator.
pub fn init(allocator: mem.Allocator) !CanonData {
    const data = @import("canon");

    var cdata = CanonData{
        .s1 = &data.s1,
        .s2 = &data.s2,
        .nfc = .empty,
    };
    errdefer cdata.deinit(allocator);

    // The number of entries is known up front, so reserve once; the loop below
    // then neither rehashes nor fails.
    try cdata.nfc.ensureTotalCapacity(allocator, @intCast(data.composite.len));
    for (data.composite) |cp| {
        const cps = cdata.toNfd(cp);
        // Every listed composite is expected to decompose into exactly two code points.
        std.debug.assert(cps.len == 2);
        cdata.nfc.putAssumeCapacity(cps[0..2].*, cp);
    }

    return cdata;
}

/// Frees the storage owned by the `nfc` map.
/// `allocator` should be the allocator the map was populated with (the one given to `init`).
pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void {
    const composites = &cdata.nfc;
    composites.deinit(allocator);
}

/// Looks up the canonical decomposition of `cp`.
///
/// The lookup is two-stage: `s1` is indexed by `cp >> 8`, and the value found
/// there plus the low 8 bits of `cp` selects the entry in `s2`. The returned
/// slice covers the first `len` code points of that entry and points into the
/// storage behind `s2`.
pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 {
    const block_start = cdata.s1[cp >> 8];
    const low_byte = cp & 0xff;
    // Take the entry by pointer so the returned slice refers to the table, not a local copy.
    const entry = &cdata.s2[block_start + low_byte];
    return entry.cps[0..entry.len];
}

/// Returns the primary composite for the code point pair `cps`, or `null`
/// when `c_map` has no entry for that pair.
///
/// The lookup goes through the file-level `c_map`; `cdata` is not consulted.
pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 {
    _ = cdata;
    return if (c_map.get(cps)) |composite| composite.* else null;
}

const std = @import("std");
const builtin = @import("builtin");
const mem = std.mem;
const comptime_map = @import("comptime_map.zig");