From 958c13ba442e7077a50d7163fdeb9bba378f95c2 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 30 Apr 2025 15:32:34 -0400 Subject: Rest of the Renamings These get different names, but don't otherwise change. --- src/CaseData.zig | 200 ----------------------------- src/CaseFold.zig | 321 ---------------------------------------------- src/CaseFolding.zig | 321 ++++++++++++++++++++++++++++++++++++++++++++++ src/GenCatData.zig | 170 ------------------------ src/GeneralCategories.zig | 170 ++++++++++++++++++++++++ src/LetterCasing.zig | 200 +++++++++++++++++++++++++++++ src/Properties.zig | 163 +++++++++++++++++++++++ src/PropsData.zig | 163 ----------------------- src/Scripts.zig | 227 ++++++++++++++++++++++++++++++++ src/ScriptsData.zig | 227 -------------------------------- 10 files changed, 1081 insertions(+), 1081 deletions(-) delete mode 100644 src/CaseData.zig delete mode 100644 src/CaseFold.zig create mode 100644 src/CaseFolding.zig delete mode 100644 src/GenCatData.zig create mode 100644 src/GeneralCategories.zig create mode 100644 src/LetterCasing.zig create mode 100644 src/Properties.zig delete mode 100644 src/PropsData.zig create mode 100644 src/Scripts.zig delete mode 100644 src/ScriptsData.zig (limited to 'src') diff --git a/src/CaseData.zig b/src/CaseData.zig deleted file mode 100644 index 0a0acb1..0000000 --- a/src/CaseData.zig +++ /dev/null @@ -1,200 +0,0 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; -const unicode = std.unicode; - -const CodePointIterator = @import("code_point").Iterator; - -case_map: [][2]u21, -prop_s1: []u16 = undefined, -prop_s2: []u8 = undefined, - -const Self = @This(); - -pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.flate.inflate.decompressor; - const endian = builtin.cpu.arch.endian(); - - var self = Self{ - .case_map = try allocator.alloc([2]u21, 0x110000), - }; - errdefer allocator.free(self.case_map); - - for (0..0x110000) |i| { - const cp: u21 = @intCast(i); - self.case_map[cp] = .{ cp, cp }; - } - - // Uppercase - const upper_bytes = @embedFile("upper"); - var upper_fbs = std.io.fixedBufferStream(upper_bytes); - var upper_decomp = decompressor(.raw, upper_fbs.reader()); - var upper_reader = upper_decomp.reader(); - - while (true) { - const cp = try upper_reader.readInt(i24, endian); - if (cp == 0) break; - const diff = try upper_reader.readInt(i24, endian); - self.case_map[@intCast(cp)][0] = @intCast(cp + diff); - } - - // Lowercase - const lower_bytes = @embedFile("lower"); - var lower_fbs = std.io.fixedBufferStream(lower_bytes); - var lower_decomp = decompressor(.raw, lower_fbs.reader()); - var lower_reader = lower_decomp.reader(); - - while (true) { - const cp = try lower_reader.readInt(i24, endian); - if (cp == 0) break; - const diff = try lower_reader.readInt(i24, endian); - self.case_map[@intCast(cp)][1] = @intCast(cp + diff); - } - - // Case properties - const cp_bytes = @embedFile("case_prop"); - var cp_fbs = std.io.fixedBufferStream(cp_bytes); - var cp_decomp = decompressor(.raw, cp_fbs.reader()); - var cp_reader = cp_decomp.reader(); - - const stage_1_len: u16 = try cp_reader.readInt(u16, endian); - self.prop_s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(self.prop_s1); - for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian); - - const stage_2_len: u16 = try cp_reader.readInt(u16, endian); - self.prop_s2 = try allocator.alloc(u8, stage_2_len); - errdefer allocator.free(self.prop_s2); - _ = try cp_reader.readAll(self.prop_s2); - - return self; -} - -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.case_map); - allocator.free(self.prop_s1); - allocator.free(self.prop_s2); -} - -// Returns true if `cp` is either upper, lower, or title case. -pub fn isCased(self: Self, cp: u21) bool { - return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; -} - -// Returns true if `cp` is uppercase. -pub fn isUpper(self: Self, cp: u21) bool { - return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; -} - -/// Returns true if `str` is all uppercase. -pub fn isUpperStr(self: Self, str: []const u8) bool { - var iter = CodePointIterator{ .bytes = str }; - - return while (iter.next()) |cp| { - if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; - } else true; -} - -test "isUpperStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); - try testing.expect(!cd.isUpperStr("hello, world 2112!")); - try testing.expect(!cd.isUpperStr("Hello, World 2112!")); -} - -/// Returns uppercase mapping for `cp`. -pub fn toUpper(self: Self, cp: u21) u21 { - return self.case_map[cp][0]; -} - -/// Returns a new string with all letters in uppercase. -/// Caller must free returned bytes with `allocator`. -pub fn toUpperStr( - self: Self, - allocator: mem.Allocator, - str: []const u8, -) ![]u8 { - var bytes = std.ArrayList(u8).init(allocator); - defer bytes.deinit(); - - var iter = CodePointIterator{ .bytes = str }; - var buf: [4]u8 = undefined; - - while (iter.next()) |cp| { - const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); - try bytes.appendSlice(buf[0..len]); - } - - return try bytes.toOwnedSlice(); -} - -test "toUpperStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); - defer testing.allocator.free(uppered); - try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); -} - -// Returns true if `cp` is lowercase. -pub fn isLower(self: Self, cp: u21) bool { - return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; -} - -/// Returns true if `str` is all lowercase. -pub fn isLowerStr(self: Self, str: []const u8) bool { - var iter = CodePointIterator{ .bytes = str }; - - return while (iter.next()) |cp| { - if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; - } else true; -} - -test "isLowerStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - try testing.expect(cd.isLowerStr("hello, world 2112!")); - try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!")); - try testing.expect(!cd.isLowerStr("Hello, World 2112!")); -} - -/// Returns lowercase mapping for `cp`. -pub fn toLower(self: Self, cp: u21) u21 { - return self.case_map[cp][1]; -} - -/// Returns a new string with all letters in lowercase. -/// Caller must free returned bytes with `allocator`. -pub fn toLowerStr( - self: Self, - allocator: mem.Allocator, - str: []const u8, -) ![]u8 { - var bytes = std.ArrayList(u8).init(allocator); - defer bytes.deinit(); - - var iter = CodePointIterator{ .bytes = str }; - var buf: [4]u8 = undefined; - - while (iter.next()) |cp| { - const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); - try bytes.appendSlice(buf[0..len]); - } - - return try bytes.toOwnedSlice(); -} - -test "toLowerStr" { - const cd = try init(testing.allocator); - defer cd.deinit(testing.allocator); - - const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); - defer testing.allocator.free(lowered); - try testing.expectEqualStrings("hello, world 2112!", lowered); -} diff --git a/src/CaseFold.zig b/src/CaseFold.zig deleted file mode 100644 index 162e82f..0000000 --- a/src/CaseFold.zig +++ /dev/null @@ -1,321 +0,0 @@ -cutoff: u21 = undefined, -cwcf_exceptions_min: u21 = undefined, -cwcf_exceptions_max: u21 = undefined, -cwcf_exceptions: []u21 = undefined, -multiple_start: u21 = undefined, -stage1: []u8 = undefined, -stage2: []u8 = undefined, -stage3: []i24 = undefined, -normalize: Normalize, -owns_normalize: bool, - -const CaseFolding = @This(); - -pub fn init(allocator: Allocator) !CaseFolding { - var case_fold: CaseFolding = undefined; - try case_fold.setup(allocator); - return case_fold; -} - -pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding { - var casefold: CaseFolding = undefined; - try casefold.setupWithNormalize(allocator, norm); - return casefold; -} - -pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void { - try casefold.setupImpl(allocator); - casefold.owns_normalize = false; - errdefer casefold.deinit(allocator); - try casefold.normalize.setup(allocator); - casefold.owns_normalize = true; -} - -pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { - try casefold.setupImpl(allocator); - casefold.normalize = norm; - casefold.owns_normalize = false; -} - -fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("fold"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - casefold.cutoff = @intCast(try reader.readInt(u24, endian)); - casefold.multiple_start = @intCast(try reader.readInt(u24, endian)); - - var len = try reader.readInt(u16, endian); - casefold.stage1 = try allocator.alloc(u8, len); - errdefer allocator.free(casefold.stage1); - for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian); - - len = try reader.readInt(u16, endian); - casefold.stage2 = try allocator.alloc(u8, len); - errdefer allocator.free(casefold.stage2); - for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian); - - len = try reader.readInt(u16, endian); - casefold.stage3 = try allocator.alloc(i24, len); - errdefer allocator.free(casefold.stage3); - for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian); - - casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); - casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); - len = try reader.readInt(u16, endian); - casefold.cwcf_exceptions = try allocator.alloc(u21, len); - errdefer allocator.free(casefold.cwcf_exceptions); - for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); -} - -pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void { - allocator.free(fdata.stage1); - allocator.free(fdata.stage2); - allocator.free(fdata.stage3); - allocator.free(fdata.cwcf_exceptions); - if (fdata.owns_normalize) fdata.normalize.deinit(allocator); -} - -/// Returns the case fold for `cp`. -pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { - if (cp >= fdata.cutoff) return &.{}; - - const stage1_val = fdata.stage1[cp >> 8]; - if (stage1_val == 0) return &.{}; - - const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); - const stage3_index = fdata.stage2[stage2_index]; - - if (stage3_index & 0x80 != 0) { - const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; - const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); - for (mapping, 0..) |c, i| buf[i] = @intCast(c); - - return buf[0..mapping.len]; - } - - const offset = fdata.stage3[stage3_index]; - if (offset == 0) return &.{}; - - buf[0] = @intCast(@as(i32, cp) + offset); - - return buf[0..1]; -} - -/// Produces the case folded code points for `cps`. Caller must free returned -/// slice with `allocator`. -pub fn caseFoldAlloc( - casefold: *const CaseFolding, - allocator: Allocator, - cps: []const u21, -) Allocator.Error![]const u21 { - var cfcps = std.ArrayList(u21).init(allocator); - defer cfcps.deinit(); - var buf: [3]u21 = undefined; - - for (cps) |cp| { - const cf = casefold.caseFold(cp, &buf); - - if (cf.len == 0) { - try cfcps.append(cp); - } else { - try cfcps.appendSlice(cf); - } - } - - return try cfcps.toOwnedSlice(); -} - -/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). -pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { - var buf: [3]u21 = undefined; - const has_mapping = casefold.caseFold(cp, &buf).len != 0; - return has_mapping and !casefold.isCwcfException(cp); -} - -pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { - return for (cps) |cp| { - if (casefold.cpChangesWhenCaseFolded(cp)) break true; - } else false; -} - -fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { - return cp >= casefold.cwcf_exceptions_min and - cp <= casefold.cwcf_exceptions_max and - std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; -} - -/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most -/// comprehensive comparison possible, but slower than `canonCaselessMatch`. -pub fn compatCaselessMatch( - casefold: *const CaseFolding, - allocator: Allocator, - a: []const u8, - b: []const u8, -) Allocator.Error!bool { - if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); - - // Process a - const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); - defer allocator.free(nfd_a); - - var need_free_cf_nfd_a = false; - var cf_nfd_a: []const u21 = nfd_a; - if (casefold.changesWhenCaseFolded(nfd_a)) { - cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); - need_free_cf_nfd_a = true; - } - defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); - - const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); - defer allocator.free(nfkd_cf_nfd_a); - const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); - defer allocator.free(cf_nfkd_cf_nfd_a); - const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); - defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); - - // Process b - const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); - defer allocator.free(nfd_b); - - var need_free_cf_nfd_b = false; - var cf_nfd_b: []const u21 = nfd_b; - if (casefold.changesWhenCaseFolded(nfd_b)) { - cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); - need_free_cf_nfd_b = true; - } - defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); - - const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); - defer allocator.free(nfkd_cf_nfd_b); - const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); - defer allocator.free(cf_nfkd_cf_nfd_b); - const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); - defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); - - return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); -} - -test "compatCaselessMatch" { - const allocator = testing.allocator; - - const caser = try CaseFolding.init(allocator); - defer caser.deinit(allocator); - - try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); - - const a = "Héllo World! \u{3d3}"; - const b = "He\u{301}llo World! \u{3a5}\u{301}"; - try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); - - const c = "He\u{301}llo World! \u{3d2}\u{301}"; - try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); -} - -/// Performs canonical caseless string matching by decomposing to NFD. This is -/// faster than `compatCaselessMatch`, but less comprehensive. -pub fn canonCaselessMatch( - casefold: *const CaseFolding, - allocator: Allocator, - a: []const u8, - b: []const u8, -) Allocator.Error!bool { - if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); - - // Process a - const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); - defer allocator.free(nfd_a); - - var need_free_cf_nfd_a = false; - var cf_nfd_a: []const u21 = nfd_a; - if (casefold.changesWhenCaseFolded(nfd_a)) { - cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); - need_free_cf_nfd_a = true; - } - defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); - - var need_free_nfd_cf_nfd_a = false; - var nfd_cf_nfd_a = cf_nfd_a; - if (!need_free_cf_nfd_a) { - nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); - need_free_nfd_cf_nfd_a = true; - } - defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); - - // Process b - const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); - defer allocator.free(nfd_b); - - var need_free_cf_nfd_b = false; - var cf_nfd_b: []const u21 = nfd_b; - if (casefold.changesWhenCaseFolded(nfd_b)) { - cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); - need_free_cf_nfd_b = true; - } - defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); - - var need_free_nfd_cf_nfd_b = false; - var nfd_cf_nfd_b = cf_nfd_b; - if (!need_free_cf_nfd_b) { - nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); - need_free_nfd_cf_nfd_b = true; - } - defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); - - return mem.eql(u21, nfd_cf_nfd_a, nfd_cf_nfd_b); -} - -test "canonCaselessMatch" { - const allocator = testing.allocator; - - const caser = try CaseFolding.init(allocator); - defer caser.deinit(allocator); - - try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); - - const a = "Héllo World! \u{3d3}"; - const b = "He\u{301}llo World! \u{3a5}\u{301}"; - try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); - - const c = "He\u{301}llo World! \u{3d2}\u{301}"; - try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); -} - -fn testAllocations(allocator: Allocator) !void { - // With normalize provided - { - const normalize = try Normalize.init(allocator); - defer normalize.deinit(allocator); - const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); - defer caser1.deinit(allocator); - } - // With normalize owned - { - const caser2 = try CaseFolding.init(allocator); - defer caser2.deinit(allocator); - } -} - -// test "Allocation Failures" { -// if (true) return error.SkipZigTest; // XXX: remove -// try testing.checkAllAllocationFailures( -// testing.allocator, -// testAllocations, -// .{}, -// ); -// } - -const std = @import("std"); -const builtin = @import("builtin"); -const mem = std.mem; -const testing = std.testing; -const Allocator = mem.Allocator; - -const ascii = @import("ascii"); -const Normalize = @import("Normalize"); - -const compress = std.compress; diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig new file mode 100644 index 0000000..162e82f --- /dev/null +++ b/src/CaseFolding.zig @@ -0,0 +1,321 @@ +cutoff: u21 = undefined, +cwcf_exceptions_min: u21 = undefined, +cwcf_exceptions_max: u21 = undefined, +cwcf_exceptions: []u21 = undefined, +multiple_start: u21 = undefined, +stage1: []u8 = undefined, +stage2: []u8 = undefined, +stage3: []i24 = undefined, +normalize: Normalize, +owns_normalize: bool, + +const CaseFolding = @This(); + +pub fn init(allocator: Allocator) !CaseFolding { + var case_fold: CaseFolding = undefined; + try case_fold.setup(allocator); + return case_fold; +} + +pub fn initWithNormalize(allocator: Allocator, norm: Normalize) !CaseFolding { + var casefold: CaseFolding = undefined; + try casefold.setupWithNormalize(allocator, norm); + return casefold; +} + +pub fn setup(casefold: *CaseFolding, allocator: Allocator) !void { + try casefold.setupImpl(allocator); + casefold.owns_normalize = false; + errdefer casefold.deinit(allocator); + try casefold.normalize.setup(allocator); + casefold.owns_normalize = true; +} + +pub fn setupWithNormalize(casefold: *CaseFolding, allocator: Allocator, norm: Normalize) !void { + try casefold.setupImpl(allocator); + casefold.normalize = norm; + casefold.owns_normalize = false; +} + +fn setupImpl(casefold: *CaseFolding, allocator: Allocator) !void { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("fold"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + casefold.cutoff = @intCast(try reader.readInt(u24, endian)); + casefold.multiple_start = @intCast(try reader.readInt(u24, endian)); + + var len = try reader.readInt(u16, endian); + casefold.stage1 = try allocator.alloc(u8, len); + errdefer allocator.free(casefold.stage1); + for (0..len) |i| casefold.stage1[i] = try reader.readInt(u8, endian); + + len = try reader.readInt(u16, endian); + casefold.stage2 = try allocator.alloc(u8, len); + errdefer allocator.free(casefold.stage2); + for (0..len) |i| casefold.stage2[i] = try reader.readInt(u8, endian); + + len = try reader.readInt(u16, endian); + casefold.stage3 = try allocator.alloc(i24, len); + errdefer allocator.free(casefold.stage3); + for (0..len) |i| casefold.stage3[i] = try reader.readInt(i24, endian); + + casefold.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); + casefold.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); + len = try reader.readInt(u16, endian); + casefold.cwcf_exceptions = try allocator.alloc(u21, len); + errdefer allocator.free(casefold.cwcf_exceptions); + for (0..len) |i| casefold.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); +} + +pub fn deinit(fdata: *const CaseFolding, allocator: mem.Allocator) void { + allocator.free(fdata.stage1); + allocator.free(fdata.stage2); + allocator.free(fdata.stage3); + allocator.free(fdata.cwcf_exceptions); + if (fdata.owns_normalize) fdata.normalize.deinit(allocator); +} + +/// Returns the case fold for `cp`. +pub fn caseFold(fdata: *const CaseFolding, cp: u21, buf: []u21) []const u21 { + if (cp >= fdata.cutoff) return &.{}; + + const stage1_val = fdata.stage1[cp >> 8]; + if (stage1_val == 0) return &.{}; + + const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); + const stage3_index = fdata.stage2[stage2_index]; + + if (stage3_index & 0x80 != 0) { + const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; + const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); + for (mapping, 0..) |c, i| buf[i] = @intCast(c); + + return buf[0..mapping.len]; + } + + const offset = fdata.stage3[stage3_index]; + if (offset == 0) return &.{}; + + buf[0] = @intCast(@as(i32, cp) + offset); + + return buf[0..1]; +} + +/// Produces the case folded code points for `cps`. Caller must free returned +/// slice with `allocator`. +pub fn caseFoldAlloc( + casefold: *const CaseFolding, + allocator: Allocator, + cps: []const u21, +) Allocator.Error![]const u21 { + var cfcps = std.ArrayList(u21).init(allocator); + defer cfcps.deinit(); + var buf: [3]u21 = undefined; + + for (cps) |cp| { + const cf = casefold.caseFold(cp, &buf); + + if (cf.len == 0) { + try cfcps.append(cp); + } else { + try cfcps.appendSlice(cf); + } + } + + return try cfcps.toOwnedSlice(); +} + +/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). +pub fn cpChangesWhenCaseFolded(casefold: *const CaseFolding, cp: u21) bool { + var buf: [3]u21 = undefined; + const has_mapping = casefold.caseFold(cp, &buf).len != 0; + return has_mapping and !casefold.isCwcfException(cp); +} + +pub fn changesWhenCaseFolded(casefold: *const CaseFolding, cps: []const u21) bool { + return for (cps) |cp| { + if (casefold.cpChangesWhenCaseFolded(cp)) break true; + } else false; +} + +fn isCwcfException(casefold: *const CaseFolding, cp: u21) bool { + return cp >= casefold.cwcf_exceptions_min and + cp <= casefold.cwcf_exceptions_max and + std.mem.indexOfScalar(u21, casefold.cwcf_exceptions, cp) != null; +} + +/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most +/// comprehensive comparison possible, but slower than `canonCaselessMatch`. +pub fn compatCaselessMatch( + casefold: *const CaseFolding, + allocator: Allocator, + a: []const u8, + b: []const u8, +) Allocator.Error!bool { + if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); + + // Process a + const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); + defer allocator.free(nfd_a); + + var need_free_cf_nfd_a = false; + var cf_nfd_a: []const u21 = nfd_a; + if (casefold.changesWhenCaseFolded(nfd_a)) { + cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); + need_free_cf_nfd_a = true; + } + defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); + + const nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_a); + defer allocator.free(nfkd_cf_nfd_a); + const cf_nfkd_cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_a); + defer allocator.free(cf_nfkd_cf_nfd_a); + const nfkd_cf_nfkd_cf_nfd_a = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); + defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); + + // Process b + const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); + defer allocator.free(nfd_b); + + var need_free_cf_nfd_b = false; + var cf_nfd_b: []const u21 = nfd_b; + if (casefold.changesWhenCaseFolded(nfd_b)) { + cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); + need_free_cf_nfd_b = true; + } + defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); + + const nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfd_b); + defer allocator.free(nfkd_cf_nfd_b); + const cf_nfkd_cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfkd_cf_nfd_b); + defer allocator.free(cf_nfkd_cf_nfd_b); + const nfkd_cf_nfkd_cf_nfd_b = try casefold.normalize.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b); + defer allocator.free(nfkd_cf_nfkd_cf_nfd_b); + + return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b); +} + +test "compatCaselessMatch" { + const allocator = testing.allocator; + + const caser = try CaseFolding.init(allocator); + defer caser.deinit(allocator); + + try testing.expect(try caser.compatCaselessMatch(allocator, "ascii only!", "ASCII Only!")); + + const a = "Héllo World! \u{3d3}"; + const b = "He\u{301}llo World! \u{3a5}\u{301}"; + try testing.expect(try caser.compatCaselessMatch(allocator, a, b)); + + const c = "He\u{301}llo World! \u{3d2}\u{301}"; + try testing.expect(try caser.compatCaselessMatch(allocator, a, c)); +} + +/// Performs canonical caseless string matching by decomposing to NFD. This is +/// faster than `compatCaselessMatch`, but less comprehensive. +pub fn canonCaselessMatch( + casefold: *const CaseFolding, + allocator: Allocator, + a: []const u8, + b: []const u8, +) Allocator.Error!bool { + if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); + + // Process a + const nfd_a = try casefold.normalize.nfxdCodePoints(allocator, a, .nfd); + defer allocator.free(nfd_a); + + var need_free_cf_nfd_a = false; + var cf_nfd_a: []const u21 = nfd_a; + if (casefold.changesWhenCaseFolded(nfd_a)) { + cf_nfd_a = try casefold.caseFoldAlloc(allocator, nfd_a); + need_free_cf_nfd_a = true; + } + defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a); + + var need_free_nfd_cf_nfd_a = false; + var nfd_cf_nfd_a = cf_nfd_a; + if (!need_free_cf_nfd_a) { + nfd_cf_nfd_a = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_a); + need_free_nfd_cf_nfd_a = true; + } + defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a); + + // Process b + const nfd_b = try casefold.normalize.nfxdCodePoints(allocator, b, .nfd); + defer allocator.free(nfd_b); + + var need_free_cf_nfd_b = false; + var cf_nfd_b: []const u21 = nfd_b; + if (casefold.changesWhenCaseFolded(nfd_b)) { + cf_nfd_b = try casefold.caseFoldAlloc(allocator, nfd_b); + need_free_cf_nfd_b = true; + } + defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b); + + var need_free_nfd_cf_nfd_b = false; + var nfd_cf_nfd_b = cf_nfd_b; + if (!need_free_cf_nfd_b) { + nfd_cf_nfd_b = try casefold.normalize.nfdCodePoints(allocator, cf_nfd_b); + need_free_nfd_cf_nfd_b = true; + } + defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b); + + return mem.eql(u21, nfd_cf_nfd_a, nfd_cf_nfd_b); +} + +test "canonCaselessMatch" { + const allocator = testing.allocator; + + const caser = try CaseFolding.init(allocator); + defer caser.deinit(allocator); + + try testing.expect(try caser.canonCaselessMatch(allocator, "ascii only!", "ASCII Only!")); + + const a = "Héllo World! \u{3d3}"; + const b = "He\u{301}llo World! \u{3a5}\u{301}"; + try testing.expect(!try caser.canonCaselessMatch(allocator, a, b)); + + const c = "He\u{301}llo World! \u{3d2}\u{301}"; + try testing.expect(try caser.canonCaselessMatch(allocator, a, c)); +} + +fn testAllocations(allocator: Allocator) !void { + // With normalize provided + { + const normalize = try Normalize.init(allocator); + defer normalize.deinit(allocator); + const caser1 = try CaseFolding.initWithNormalize(allocator, normalize); + defer caser1.deinit(allocator); + } + // With normalize owned + { + const caser2 = try CaseFolding.init(allocator); + defer caser2.deinit(allocator); + } +} + +// test "Allocation Failures" { +// if (true) return error.SkipZigTest; // XXX: remove +// try testing.checkAllAllocationFailures( +// testing.allocator, +// testAllocations, +// .{}, +// ); +// } + +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; +const testing = std.testing; +const Allocator = mem.Allocator; + +const ascii = @import("ascii"); +const Normalize = @import("Normalize"); + +const compress = std.compress; diff --git a/src/GenCatData.zig b/src/GenCatData.zig deleted file mode 100644 index a69f7a2..0000000 --- a/src/GenCatData.zig +++ /dev/null @@ -1,170 +0,0 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; - -/// General Category -pub const Gc = enum { - Cc, // Other, Control - Cf, // Other, Format - Cn, // Other, Unassigned - Co, // Other, Private Use - Cs, // Other, Surrogate - Ll, // Letter, Lowercase - Lm, // Letter, Modifier - Lo, // Letter, Other - Lu, // Letter, Uppercase - Lt, // Letter, Titlecase - Mc, // Mark, Spacing Combining - Me, // Mark, Enclosing - Mn, // Mark, Non-Spacing - Nd, // Number, Decimal Digit - Nl, // Number, Letter - No, // Number, Other - Pc, // Punctuation, Connector - Pd, // Punctuation, Dash - Pe, // Punctuation, Close - Pf, // Punctuation, Final quote (may behave like Ps or Pe depending on usage) - Pi, // Punctuation, Initial quote (may behave like Ps or Pe depending on usage) - Po, // Punctuation, Other - Ps, // Punctuation, Open - Sc, // Symbol, Currency - Sk, // Symbol, Modifier - Sm, // Symbol, Math - So, // Symbol, Other - Zl, // Separator, Line - Zp, // Separator, Paragraph - Zs, // Separator, Space -}; - -s1: []u16 = undefined, -s2: []u5 = undefined, -s3: []u5 = undefined, - -const Self = @This(); - -pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("gencat"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - var self = Self{}; - - const s1_len: u16 = try reader.readInt(u16, endian); - self.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(self.s1); - for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); - - const s2_len: u16 = try reader.readInt(u16, endian); - self.s2 = try allocator.alloc(u5, s2_len); - errdefer allocator.free(self.s2); - for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); - - const s3_len: u16 = try reader.readInt(u8, endian); - self.s3 = try allocator.alloc(u5, s3_len); - errdefer allocator.free(self.s3); - for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian)); - - return self; -} - -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); - allocator.free(self.s3); -} - -/// Lookup the General Category for `cp`. -pub fn gc(self: Self, cp: u21) Gc { - return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]); -} - -/// True if `cp` has an C general category. -pub fn isControl(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Cc, - .Cf, - .Cn, - .Co, - .Cs, - => true, - else => false, - }; -} - -/// True if `cp` has an L general category. -pub fn isLetter(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Ll, - .Lm, - .Lo, - .Lu, - .Lt, - => true, - else => false, - }; -} - -/// True if `cp` has an M general category. -pub fn isMark(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Mc, - .Me, - .Mn, - => true, - else => false, - }; -} - -/// True if `cp` has an N general category. -pub fn isNumber(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Nd, - .Nl, - .No, - => true, - else => false, - }; -} - -/// True if `cp` has an P general category. -pub fn isPunctuation(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Pc, - .Pd, - .Pe, - .Pf, - .Pi, - .Po, - .Ps, - => true, - else => false, - }; -} - -/// True if `cp` has an S general category. -pub fn isSymbol(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Sc, - .Sk, - .Sm, - .So, - => true, - else => false, - }; -} - -/// True if `cp` has an Z general category. -pub fn isSeparator(self: Self, cp: u21) bool { - return switch (self.gc(cp)) { - .Zl, - .Zp, - .Zs, - => true, - else => false, - }; -} diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig new file mode 100644 index 0000000..a69f7a2 --- /dev/null +++ b/src/GeneralCategories.zig @@ -0,0 +1,170 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; + +/// General Category +pub const Gc = enum { + Cc, // Other, Control + Cf, // Other, Format + Cn, // Other, Unassigned + Co, // Other, Private Use + Cs, // Other, Surrogate + Ll, // Letter, Lowercase + Lm, // Letter, Modifier + Lo, // Letter, Other + Lu, // Letter, Uppercase + Lt, // Letter, Titlecase + Mc, // Mark, Spacing Combining + Me, // Mark, Enclosing + Mn, // Mark, Non-Spacing + Nd, // Number, Decimal Digit + Nl, // Number, Letter + No, // Number, Other + Pc, // Punctuation, Connector + Pd, // Punctuation, Dash + Pe, // Punctuation, Close + Pf, // Punctuation, Final quote (may behave like Ps or Pe depending on usage) + Pi, // Punctuation, Initial quote (may behave like Ps or Pe depending on usage) + Po, // Punctuation, Other + Ps, // Punctuation, Open + Sc, // Symbol, Currency + Sk, // Symbol, Modifier + Sm, // Symbol, Math + So, // Symbol, Other + Zl, // Separator, Line + Zp, // Separator, Paragraph + Zs, // Separator, Space +}; + +s1: []u16 = undefined, +s2: []u5 = undefined, +s3: []u5 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("gencat"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{}; + + const s1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(self.s1); + for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const s2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(u5, s2_len); + errdefer allocator.free(self.s2); + for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); + + const s3_len: u16 = try reader.readInt(u8, endian); + self.s3 = try allocator.alloc(u5, s3_len); + errdefer allocator.free(self.s3); + for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian)); + + return self; +} + +pub fn deinit(self: *const Self, allocator: mem.Allocator) void { + allocator.free(self.s1); + allocator.free(self.s2); + allocator.free(self.s3); +} + +/// Lookup the General Category for `cp`. +pub fn gc(self: Self, cp: u21) Gc { + return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]); +} + +/// True if `cp` has an C general category. +pub fn isControl(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Cc, + .Cf, + .Cn, + .Co, + .Cs, + => true, + else => false, + }; +} + +/// True if `cp` has an L general category. +pub fn isLetter(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Ll, + .Lm, + .Lo, + .Lu, + .Lt, + => true, + else => false, + }; +} + +/// True if `cp` has an M general category. +pub fn isMark(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Mc, + .Me, + .Mn, + => true, + else => false, + }; +} + +/// True if `cp` has an N general category. +pub fn isNumber(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Nd, + .Nl, + .No, + => true, + else => false, + }; +} + +/// True if `cp` has an P general category. +pub fn isPunctuation(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Pc, + .Pd, + .Pe, + .Pf, + .Pi, + .Po, + .Ps, + => true, + else => false, + }; +} + +/// True if `cp` has an S general category. +pub fn isSymbol(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Sc, + .Sk, + .Sm, + .So, + => true, + else => false, + }; +} + +/// True if `cp` has an Z general category. +pub fn isSeparator(self: Self, cp: u21) bool { + return switch (self.gc(cp)) { + .Zl, + .Zp, + .Zs, + => true, + else => false, + }; +} diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig new file mode 100644 index 0000000..0a0acb1 --- /dev/null +++ b/src/LetterCasing.zig @@ -0,0 +1,200 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; +const unicode = std.unicode; + +const CodePointIterator = @import("code_point").Iterator; + +case_map: [][2]u21, +prop_s1: []u16 = undefined, +prop_s2: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.flate.inflate.decompressor; + const endian = builtin.cpu.arch.endian(); + + var self = Self{ + .case_map = try allocator.alloc([2]u21, 0x110000), + }; + errdefer allocator.free(self.case_map); + + for (0..0x110000) |i| { + const cp: u21 = @intCast(i); + self.case_map[cp] = .{ cp, cp }; + } + + // Uppercase + const upper_bytes = @embedFile("upper"); + var upper_fbs = std.io.fixedBufferStream(upper_bytes); + var upper_decomp = decompressor(.raw, upper_fbs.reader()); + var upper_reader = upper_decomp.reader(); + + while (true) { + const cp = try upper_reader.readInt(i24, endian); + if (cp == 0) break; + const diff = try upper_reader.readInt(i24, endian); + self.case_map[@intCast(cp)][0] = @intCast(cp + diff); + } + + // Lowercase + const lower_bytes = @embedFile("lower"); + var lower_fbs = std.io.fixedBufferStream(lower_bytes); + var lower_decomp = decompressor(.raw, lower_fbs.reader()); + var lower_reader = lower_decomp.reader(); + + while (true) { + const cp = try lower_reader.readInt(i24, endian); + if (cp == 0) break; + const diff = try lower_reader.readInt(i24, endian); + self.case_map[@intCast(cp)][1] = @intCast(cp + diff); + } + + // Case properties + const cp_bytes = @embedFile("case_prop"); + var cp_fbs = std.io.fixedBufferStream(cp_bytes); + var cp_decomp = decompressor(.raw, cp_fbs.reader()); + var cp_reader = cp_decomp.reader(); + + const stage_1_len: u16 = try cp_reader.readInt(u16, endian); + self.prop_s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.prop_s1); + for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian); + + const stage_2_len: u16 = try cp_reader.readInt(u16, endian); + self.prop_s2 = try allocator.alloc(u8, stage_2_len); + errdefer allocator.free(self.prop_s2); + _ = try cp_reader.readAll(self.prop_s2); + + return self; +} + +pub fn deinit(self: *const Self, allocator: mem.Allocator) void { + allocator.free(self.case_map); + allocator.free(self.prop_s1); + allocator.free(self.prop_s2); +} + +// Returns true if `cp` is either upper, lower, or title case. +pub fn isCased(self: Self, cp: u21) bool { + return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +} + +// Returns true if `cp` is uppercase. +pub fn isUpper(self: Self, cp: u21) bool { + return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +} + +/// Returns true if `str` is all uppercase. +pub fn isUpperStr(self: Self, str: []const u8) bool { + var iter = CodePointIterator{ .bytes = str }; + + return while (iter.next()) |cp| { + if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; + } else true; +} + +test "isUpperStr" { + const cd = try init(testing.allocator); + defer cd.deinit(testing.allocator); + + try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); + try testing.expect(!cd.isUpperStr("hello, world 2112!")); + try testing.expect(!cd.isUpperStr("Hello, World 2112!")); +} + +/// Returns uppercase mapping for `cp`. +pub fn toUpper(self: Self, cp: u21) u21 { + return self.case_map[cp][0]; +} + +/// Returns a new string with all letters in uppercase. +/// Caller must free returned bytes with `allocator`. +pub fn toUpperStr( + self: Self, + allocator: mem.Allocator, + str: []const u8, +) ![]u8 { + var bytes = std.ArrayList(u8).init(allocator); + defer bytes.deinit(); + + var iter = CodePointIterator{ .bytes = str }; + var buf: [4]u8 = undefined; + + while (iter.next()) |cp| { + const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); + try bytes.appendSlice(buf[0..len]); + } + + return try bytes.toOwnedSlice(); +} + +test "toUpperStr" { + const cd = try init(testing.allocator); + defer cd.deinit(testing.allocator); + + const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); + defer testing.allocator.free(uppered); + try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); +} + +// Returns true if `cp` is lowercase. +pub fn isLower(self: Self, cp: u21) bool { + return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +} + +/// Returns true if `str` is all lowercase. +pub fn isLowerStr(self: Self, str: []const u8) bool { + var iter = CodePointIterator{ .bytes = str }; + + return while (iter.next()) |cp| { + if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; + } else true; +} + +test "isLowerStr" { + const cd = try init(testing.allocator); + defer cd.deinit(testing.allocator); + + try testing.expect(cd.isLowerStr("hello, world 2112!")); + try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!")); + try testing.expect(!cd.isLowerStr("Hello, World 2112!")); +} + +/// Returns lowercase mapping for `cp`. +pub fn toLower(self: Self, cp: u21) u21 { + return self.case_map[cp][1]; +} + +/// Returns a new string with all letters in lowercase. +/// Caller must free returned bytes with `allocator`. +pub fn toLowerStr( + self: Self, + allocator: mem.Allocator, + str: []const u8, +) ![]u8 { + var bytes = std.ArrayList(u8).init(allocator); + defer bytes.deinit(); + + var iter = CodePointIterator{ .bytes = str }; + var buf: [4]u8 = undefined; + + while (iter.next()) |cp| { + const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); + try bytes.appendSlice(buf[0..len]); + } + + return try bytes.toOwnedSlice(); +} + +test "toLowerStr" { + const cd = try init(testing.allocator); + defer cd.deinit(testing.allocator); + + const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); + defer testing.allocator.free(lowered); + try testing.expectEqualStrings("hello, world 2112!", lowered); +} diff --git a/src/Properties.zig b/src/Properties.zig new file mode 100644 index 0000000..46920be --- /dev/null +++ b/src/Properties.zig @@ -0,0 +1,163 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; + +core_s1: []u16 = undefined, +core_s2: []u8 = undefined, +props_s1: []u16 = undefined, +props_s2: []u8 = undefined, +num_s1: []u16 = undefined, +num_s2: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.flate.inflate.decompressor; + const endian = builtin.cpu.arch.endian(); + + // Process DerivedCoreProperties.txt + const core_bytes = @embedFile("core_props"); + var core_fbs = std.io.fixedBufferStream(core_bytes); + var core_decomp = decompressor(.raw, core_fbs.reader()); + var core_reader = core_decomp.reader(); + + var self = Self{}; + + const core_stage_1_len: u16 = try core_reader.readInt(u16, endian); + self.core_s1 = try allocator.alloc(u16, core_stage_1_len); + errdefer allocator.free(self.core_s1); + for (0..core_stage_1_len) |i| self.core_s1[i] = try core_reader.readInt(u16, endian); + + const core_stage_2_len: u16 = try core_reader.readInt(u16, endian); + self.core_s2 = try allocator.alloc(u8, core_stage_2_len); + errdefer allocator.free(self.core_s2); + _ = try core_reader.readAll(self.core_s2); + + // Process PropList.txt + const props_bytes = @embedFile("props"); + var props_fbs = std.io.fixedBufferStream(props_bytes); + var props_decomp = decompressor(.raw, props_fbs.reader()); + var props_reader = props_decomp.reader(); + + const stage_1_len: u16 = try props_reader.readInt(u16, endian); + self.props_s1 = try allocator.alloc(u16, stage_1_len); + errdefer allocator.free(self.props_s1); + for (0..stage_1_len) |i| self.props_s1[i] = try props_reader.readInt(u16, endian); + + const stage_2_len: u16 = try props_reader.readInt(u16, endian); + self.props_s2 = try allocator.alloc(u8, stage_2_len); + errdefer allocator.free(self.props_s2); + _ = try props_reader.readAll(self.props_s2); + + // Process DerivedNumericType.txt + const num_bytes = @embedFile("numeric"); + var num_fbs = std.io.fixedBufferStream(num_bytes); + var num_decomp = decompressor(.raw, num_fbs.reader()); + var num_reader = num_decomp.reader(); + + const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); + self.num_s1 = try allocator.alloc(u16, num_stage_1_len); + errdefer allocator.free(self.num_s1); + for (0..num_stage_1_len) |i| self.num_s1[i] = try num_reader.readInt(u16, endian); + + const num_stage_2_len: u16 = try num_reader.readInt(u16, endian); + self.num_s2 = try allocator.alloc(u8, num_stage_2_len); + errdefer allocator.free(self.num_s2); + _ = try num_reader.readAll(self.num_s2); + + return self; +} + +pub fn deinit(self: *const Self, allocator: mem.Allocator) void { + allocator.free(self.core_s1); + allocator.free(self.core_s2); + allocator.free(self.props_s1); + allocator.free(self.props_s2); + allocator.free(self.num_s1); + allocator.free(self.num_s2); +} + +/// True if `cp` is a mathematical symbol. +pub fn isMath(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +} + +/// True if `cp` is an alphabetic character. +pub fn isAlphabetic(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +} + +/// True if `cp` is a valid identifier start character. +pub fn isIdStart(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +} + +/// True if `cp` is a valid identifier continuation character. +pub fn isIdContinue(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; +} + +/// True if `cp` is a valid extended identifier start character. +pub fn isXidStart(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; +} + +/// True if `cp` is a valid extended identifier continuation character. +pub fn isXidContinue(self: Self, cp: u21) bool { + return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; +} + +/// True if `cp` is a whitespace character. +pub fn isWhitespace(self: Self, cp: u21) bool { + return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +} + +/// True if `cp` is a hexadecimal digit. +pub fn isHexDigit(self: Self, cp: u21) bool { + return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +} + +/// True if `cp` is a diacritic mark. +pub fn isDiacritic(self: Self, cp: u21) bool { + return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +} + +/// True if `cp` is numeric. +pub fn isNumeric(self: Self, cp: u21) bool { + return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; +} + +/// True if `cp` is a digit. +pub fn isDigit(self: Self, cp: u21) bool { + return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; +} + +/// True if `cp` is decimal. +pub fn isDecimal(self: Self, cp: u21) bool { + return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; +} + +test "Props" { + const self = try init(testing.allocator); + defer self.deinit(testing.allocator); + + try testing.expect(self.isHexDigit('F')); + try testing.expect(self.isHexDigit('a')); + try testing.expect(self.isHexDigit('8')); + try testing.expect(!self.isHexDigit('z')); + + try testing.expect(self.isDiacritic('\u{301}')); + try testing.expect(self.isAlphabetic('A')); + try testing.expect(!self.isAlphabetic('3')); + try testing.expect(self.isMath('+')); + + try testing.expect(self.isNumeric('\u{277f}')); + try testing.expect(self.isDigit('\u{2070}')); + try testing.expect(self.isDecimal('3')); + + try testing.expect(!self.isNumeric('1')); + try testing.expect(!self.isDigit('2')); + try testing.expect(!self.isDecimal('g')); +} diff --git a/src/PropsData.zig b/src/PropsData.zig deleted file mode 100644 index 46920be..0000000 --- a/src/PropsData.zig +++ /dev/null @@ -1,163 +0,0 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; - -core_s1: []u16 = undefined, -core_s2: []u8 = undefined, -props_s1: []u16 = undefined, -props_s2: []u8 = undefined, -num_s1: []u16 = undefined, -num_s2: []u8 = undefined, - -const Self = @This(); - -pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.flate.inflate.decompressor; - const endian = builtin.cpu.arch.endian(); - - // Process DerivedCoreProperties.txt - const core_bytes = @embedFile("core_props"); - var core_fbs = std.io.fixedBufferStream(core_bytes); - var core_decomp = decompressor(.raw, core_fbs.reader()); - var core_reader = core_decomp.reader(); - - var self = Self{}; - - const core_stage_1_len: u16 = try core_reader.readInt(u16, endian); - self.core_s1 = try allocator.alloc(u16, core_stage_1_len); - errdefer allocator.free(self.core_s1); - for (0..core_stage_1_len) |i| self.core_s1[i] = try core_reader.readInt(u16, endian); - - const core_stage_2_len: u16 = try core_reader.readInt(u16, endian); - self.core_s2 = try allocator.alloc(u8, core_stage_2_len); - errdefer allocator.free(self.core_s2); - _ = try core_reader.readAll(self.core_s2); - - // Process PropList.txt - const props_bytes = @embedFile("props"); - var props_fbs = std.io.fixedBufferStream(props_bytes); - var props_decomp = decompressor(.raw, props_fbs.reader()); - var props_reader = props_decomp.reader(); - - const stage_1_len: u16 = try props_reader.readInt(u16, endian); - self.props_s1 = try allocator.alloc(u16, stage_1_len); - errdefer allocator.free(self.props_s1); - for (0..stage_1_len) |i| self.props_s1[i] = try props_reader.readInt(u16, endian); - - const stage_2_len: u16 = try props_reader.readInt(u16, endian); - self.props_s2 = try allocator.alloc(u8, stage_2_len); - errdefer allocator.free(self.props_s2); - _ = try props_reader.readAll(self.props_s2); - - // Process DerivedNumericType.txt - const num_bytes = @embedFile("numeric"); - var num_fbs = std.io.fixedBufferStream(num_bytes); - var num_decomp = decompressor(.raw, num_fbs.reader()); - var num_reader = num_decomp.reader(); - - const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); - self.num_s1 = try allocator.alloc(u16, num_stage_1_len); - errdefer allocator.free(self.num_s1); - for (0..num_stage_1_len) |i| self.num_s1[i] = try num_reader.readInt(u16, endian); - - const num_stage_2_len: u16 = try num_reader.readInt(u16, endian); - self.num_s2 = try allocator.alloc(u8, num_stage_2_len); - errdefer allocator.free(self.num_s2); - _ = try num_reader.readAll(self.num_s2); - - return self; -} - -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.core_s1); - allocator.free(self.core_s2); - allocator.free(self.props_s1); - allocator.free(self.props_s2); - allocator.free(self.num_s1); - allocator.free(self.num_s2); -} - -/// True if `cp` is a mathematical symbol. -pub fn isMath(self: Self, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; -} - -/// True if `cp` is an alphabetic character. -pub fn isAlphabetic(self: Self, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; -} - -/// True if `cp` is a valid identifier start character. -pub fn isIdStart(self: Self, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; -} - -/// True if `cp` is a valid identifier continuation character. -pub fn isIdContinue(self: Self, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; -} - -/// True if `cp` is a valid extended identifier start character. -pub fn isXidStart(self: Self, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; -} - -/// True if `cp` is a valid extended identifier continuation character. -pub fn isXidContinue(self: Self, cp: u21) bool { - return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; -} - -/// True if `cp` is a whitespace character. -pub fn isWhitespace(self: Self, cp: u21) bool { - return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; -} - -/// True if `cp` is a hexadecimal digit. -pub fn isHexDigit(self: Self, cp: u21) bool { - return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; -} - -/// True if `cp` is a diacritic mark. -pub fn isDiacritic(self: Self, cp: u21) bool { - return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; -} - -/// True if `cp` is numeric. -pub fn isNumeric(self: Self, cp: u21) bool { - return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; -} - -/// True if `cp` is a digit. -pub fn isDigit(self: Self, cp: u21) bool { - return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; -} - -/// True if `cp` is decimal. -pub fn isDecimal(self: Self, cp: u21) bool { - return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; -} - -test "Props" { - const self = try init(testing.allocator); - defer self.deinit(testing.allocator); - - try testing.expect(self.isHexDigit('F')); - try testing.expect(self.isHexDigit('a')); - try testing.expect(self.isHexDigit('8')); - try testing.expect(!self.isHexDigit('z')); - - try testing.expect(self.isDiacritic('\u{301}')); - try testing.expect(self.isAlphabetic('A')); - try testing.expect(!self.isAlphabetic('3')); - try testing.expect(self.isMath('+')); - - try testing.expect(self.isNumeric('\u{277f}')); - try testing.expect(self.isDigit('\u{2070}')); - try testing.expect(self.isDecimal('3')); - - try testing.expect(!self.isNumeric('1')); - try testing.expect(!self.isDigit('2')); - try testing.expect(!self.isDecimal('g')); -} diff --git a/src/Scripts.zig b/src/Scripts.zig new file mode 100644 index 0000000..4ad8549 --- /dev/null +++ b/src/Scripts.zig @@ -0,0 +1,227 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const compress = std.compress; +const mem = std.mem; +const testing = std.testing; + +/// Scripts +pub const Script = enum { + none, + Adlam, + Ahom, + Anatolian_Hieroglyphs, + Arabic, + Armenian, + Avestan, + Balinese, + Bamum, + Bassa_Vah, + Batak, + Bengali, + Bhaiksuki, + Bopomofo, + Brahmi, + Braille, + Buginese, + Buhid, + Canadian_Aboriginal, + Carian, + Caucasian_Albanian, + Chakma, + Cham, + Cherokee, + Chorasmian, + Common, + Coptic, + Cuneiform, + Cypriot, + Cypro_Minoan, + Cyrillic, + Deseret, + Devanagari, + Dives_Akuru, + Dogra, + Duployan, + Egyptian_Hieroglyphs, + Elbasan, + Elymaic, + Ethiopic, + Georgian, + Glagolitic, + Gothic, + Grantha, + Greek, + Gujarati, + Gunjala_Gondi, + Gurmukhi, + Han, + Hangul, + Hanifi_Rohingya, + Hanunoo, + Hatran, + Hebrew, + Hiragana, + Imperial_Aramaic, + Inherited, + Inscriptional_Pahlavi, + Inscriptional_Parthian, + Javanese, + Kaithi, + Kannada, + Katakana, + Kawi, + Kayah_Li, + Kharoshthi, + Khitan_Small_Script, + Khmer, + Khojki, + Khudawadi, + Lao, + Latin, + Lepcha, + Limbu, + Linear_A, + Linear_B, + Lisu, + Lycian, + Lydian, + Mahajani, + Makasar, + Malayalam, + Mandaic, + Manichaean, + Marchen, + Masaram_Gondi, + Medefaidrin, + Meetei_Mayek, + Mende_Kikakui, + Meroitic_Cursive, + Meroitic_Hieroglyphs, + Miao, + Modi, + Mongolian, + Mro, + Multani, + Myanmar, + Nabataean, + Nag_Mundari, + Nandinagari, + New_Tai_Lue, + Newa, + Nko, + Nushu, + Nyiakeng_Puachue_Hmong, + Ogham, + Ol_Chiki, + Old_Hungarian, + Old_Italic, + Old_North_Arabian, + Old_Permic, + Old_Persian, + Old_Sogdian, + Old_South_Arabian, + Old_Turkic, + Old_Uyghur, + Oriya, + Osage, + Osmanya, + Pahawh_Hmong, + Palmyrene, + Pau_Cin_Hau, + Phags_Pa, + Phoenician, + Psalter_Pahlavi, + Rejang, + Runic, + Samaritan, + Saurashtra, + Sharada, + Shavian, + Siddham, + SignWriting, + Sinhala, + Sogdian, + Sora_Sompeng, + Soyombo, + Sundanese, + Syloti_Nagri, + Syriac, + Tagalog, + Tagbanwa, + Tai_Le, + Tai_Tham, + Tai_Viet, + Takri, + Tamil, + Tangsa, + Tangut, + Telugu, + Thaana, + Thai, + Tibetan, + Tifinagh, + Tirhuta, + Toto, + Ugaritic, + Vai, + Vithkuqi, + Wancho, + Warang_Citi, + Yezidi, + Yi, + Zanabazar_Square, +}; + +s1: []u16 = undefined, +s2: []u8 = undefined, +s3: []u8 = undefined, + +const Self = @This(); + +pub fn init(allocator: mem.Allocator) !Self { + const decompressor = compress.flate.inflate.decompressor; + const in_bytes = @embedFile("scripts"); + var in_fbs = std.io.fixedBufferStream(in_bytes); + var in_decomp = decompressor(.raw, in_fbs.reader()); + var reader = in_decomp.reader(); + + const endian = builtin.cpu.arch.endian(); + + var self = Self{}; + + const s1_len: u16 = try reader.readInt(u16, endian); + self.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(self.s1); + for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + + const s2_len: u16 = try reader.readInt(u16, endian); + self.s2 = try allocator.alloc(u8, s2_len); + errdefer allocator.free(self.s2); + _ = try reader.readAll(self.s2); + + const s3_len: u16 = try reader.readInt(u8, endian); + self.s3 = try allocator.alloc(u8, s3_len); + errdefer allocator.free(self.s3); + _ = try reader.readAll(self.s3); + + return self; +} + +pub fn deinit(self: *const Self, allocator: mem.Allocator) void { + allocator.free(self.s1); + allocator.free(self.s2); + allocator.free(self.s3); +} + +/// Lookup the Script type for `cp`. +pub fn script(self: Self, cp: u21) ?Script { + const byte = self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]; + if (byte == 0) return null; + return @enumFromInt(byte); +} + +test "script" { + const self = try init(std.testing.allocator); + defer self.deinit(std.testing.allocator); + try testing.expectEqual(Script.Latin, self.script('A').?); +} diff --git a/src/ScriptsData.zig b/src/ScriptsData.zig deleted file mode 100644 index 4ad8549..0000000 --- a/src/ScriptsData.zig +++ /dev/null @@ -1,227 +0,0 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; - -/// Scripts -pub const Script = enum { - none, - Adlam, - Ahom, - Anatolian_Hieroglyphs, - Arabic, - Armenian, - Avestan, - Balinese, - Bamum, - Bassa_Vah, - Batak, - Bengali, - Bhaiksuki, - Bopomofo, - Brahmi, - Braille, - Buginese, - Buhid, - Canadian_Aboriginal, - Carian, - Caucasian_Albanian, - Chakma, - Cham, - Cherokee, - Chorasmian, - Common, - Coptic, - Cuneiform, - Cypriot, - Cypro_Minoan, - Cyrillic, - Deseret, - Devanagari, - Dives_Akuru, - Dogra, - Duployan, - Egyptian_Hieroglyphs, - Elbasan, - Elymaic, - Ethiopic, - Georgian, - Glagolitic, - Gothic, - Grantha, - Greek, - Gujarati, - Gunjala_Gondi, - Gurmukhi, - Han, - Hangul, - Hanifi_Rohingya, - Hanunoo, - Hatran, - Hebrew, - Hiragana, - Imperial_Aramaic, - Inherited, - Inscriptional_Pahlavi, - Inscriptional_Parthian, - Javanese, - Kaithi, - Kannada, - Katakana, - Kawi, - Kayah_Li, - Kharoshthi, - Khitan_Small_Script, - Khmer, - Khojki, - Khudawadi, - Lao, - Latin, - Lepcha, - Limbu, - Linear_A, - Linear_B, - Lisu, - Lycian, - Lydian, - Mahajani, - Makasar, - Malayalam, - Mandaic, - Manichaean, - Marchen, - Masaram_Gondi, - Medefaidrin, - Meetei_Mayek, - Mende_Kikakui, - Meroitic_Cursive, - Meroitic_Hieroglyphs, - Miao, - Modi, - Mongolian, - Mro, - Multani, - Myanmar, - Nabataean, - Nag_Mundari, - Nandinagari, - New_Tai_Lue, - Newa, - Nko, - Nushu, - Nyiakeng_Puachue_Hmong, - Ogham, - Ol_Chiki, - Old_Hungarian, - Old_Italic, - Old_North_Arabian, - Old_Permic, - Old_Persian, - Old_Sogdian, - Old_South_Arabian, - Old_Turkic, - Old_Uyghur, - Oriya, - Osage, - Osmanya, - Pahawh_Hmong, - Palmyrene, - Pau_Cin_Hau, - Phags_Pa, - Phoenician, - Psalter_Pahlavi, - Rejang, - Runic, - Samaritan, - Saurashtra, - Sharada, - Shavian, - Siddham, - SignWriting, - Sinhala, - Sogdian, - Sora_Sompeng, - Soyombo, - Sundanese, - Syloti_Nagri, - Syriac, - Tagalog, - Tagbanwa, - Tai_Le, - Tai_Tham, - Tai_Viet, - Takri, - Tamil, - Tangsa, - Tangut, - Telugu, - Thaana, - Thai, - Tibetan, - Tifinagh, - Tirhuta, - Toto, - Ugaritic, - Vai, - Vithkuqi, - Wancho, - Warang_Citi, - Yezidi, - Yi, - Zanabazar_Square, -}; - -s1: []u16 = undefined, -s2: []u8 = undefined, -s3: []u8 = undefined, - -const Self = @This(); - -pub fn init(allocator: mem.Allocator) !Self { - const decompressor = compress.flate.inflate.decompressor; - const in_bytes = @embedFile("scripts"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var in_decomp = decompressor(.raw, in_fbs.reader()); - var reader = in_decomp.reader(); - - const endian = builtin.cpu.arch.endian(); - - var self = Self{}; - - const s1_len: u16 = try reader.readInt(u16, endian); - self.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(self.s1); - for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); - - const s2_len: u16 = try reader.readInt(u16, endian); - self.s2 = try allocator.alloc(u8, s2_len); - errdefer allocator.free(self.s2); - _ = try reader.readAll(self.s2); - - const s3_len: u16 = try reader.readInt(u8, endian); - self.s3 = try allocator.alloc(u8, s3_len); - errdefer allocator.free(self.s3); - _ = try reader.readAll(self.s3); - - return self; -} - -pub fn deinit(self: *const Self, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); - allocator.free(self.s3); -} - -/// Lookup the Script type for `cp`. -pub fn script(self: Self, cp: u21) ?Script { - const byte = self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]; - if (byte == 0) return null; - return @enumFromInt(byte); -} - -test "script" { - const self = try init(std.testing.allocator); - defer self.deinit(std.testing.allocator); - try testing.expectEqual(Script.Latin, self.script('A').?); -} -- cgit v1.2.3