diff options
| author | 2026-02-04 18:36:18 -0500 | |
|---|---|---|
| committer | 2026-02-04 18:36:18 -0500 | |
| commit | e476250ea9326b2550847b301c265115ff375a31 (patch) | |
| tree | cf627ced47cecce80020b7a1f30aa51852c0c59b /src | |
| parent | Normalization and case folding (diff) | |
| download | zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz zg-e476250ea9326b2550847b301c265115ff375a31.zip | |
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to
eliminate that last allocation using the comptime hash map, and then
see about eliminating allocations from case comparisons as well.
That should just about do it.
Diffstat (limited to 'src')
| -rw-r--r-- | src/GeneralCategories.zig | 102 | ||||
| -rw-r--r-- | src/LetterCasing.zig | 179 | ||||
| -rw-r--r-- | src/Properties.zig | 195 | ||||
| -rw-r--r-- | src/Scripts.zig | 82 |
4 files changed, 162 insertions, 396 deletions
diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig index eee7e56..9a383bf 100644 --- a/src/GeneralCategories.zig +++ b/src/GeneralCategories.zig | |||
| @@ -1,8 +1,19 @@ | |||
| 1 | //! General Categories | 1 | //! General Categories |
| 2 | 2 | ||
| 3 | s1: []u16 = undefined, | 3 | const Data = struct { |
| 4 | s2: []u5 = undefined, | 4 | s1: []const u16 = undefined, |
| 5 | s3: []u5 = undefined, | 5 | s2: []const u5 = undefined, |
| 6 | s3: []const u5 = undefined, | ||
| 7 | }; | ||
| 8 | |||
| 9 | const general_categories = general_categories: { | ||
| 10 | const data = @import("gencat"); | ||
| 11 | break :general_categories Data{ | ||
| 12 | .s1 = &data.s1, | ||
| 13 | .s2 = &data.s2, | ||
| 14 | .s3 = &data.s3, | ||
| 15 | }; | ||
| 16 | }; | ||
| 6 | 17 | ||
| 7 | /// General Category | 18 | /// General Category |
| 8 | pub const Gc = enum { | 19 | pub const Gc = enum { |
| @@ -38,51 +49,14 @@ pub const Gc = enum { | |||
| 38 | Zs, // Separator, Space | 49 | Zs, // Separator, Space |
| 39 | }; | 50 | }; |
| 40 | 51 | ||
| 41 | const GeneralCategories = @This(); | ||
| 42 | |||
| 43 | pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories { | ||
| 44 | var gencat = GeneralCategories{}; | ||
| 45 | try gencat.setup(allocator); | ||
| 46 | return gencat; | ||
| 47 | } | ||
| 48 | |||
| 49 | pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void { | ||
| 50 | const in_bytes = @embedFile("gencat"); | ||
| 51 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 52 | var reader = in_fbs.reader(); | ||
| 53 | |||
| 54 | const endian = builtin.cpu.arch.endian(); | ||
| 55 | |||
| 56 | const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 57 | gencat.s1 = try allocator.alloc(u16, s1_len); | ||
| 58 | errdefer allocator.free(gencat.s1); | ||
| 59 | for (0..s1_len) |i| gencat.s1[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 60 | |||
| 61 | const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 62 | gencat.s2 = try allocator.alloc(u5, s2_len); | ||
| 63 | errdefer allocator.free(gencat.s2); | ||
| 64 | for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable); | ||
| 65 | |||
| 66 | const s3_len: u16 = reader.readInt(u8, endian) catch unreachable; | ||
| 67 | gencat.s3 = try allocator.alloc(u5, s3_len); | ||
| 68 | errdefer allocator.free(gencat.s3); | ||
| 69 | for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable); | ||
| 70 | } | ||
| 71 | |||
| 72 | pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void { | ||
| 73 | allocator.free(gencat.s1); | ||
| 74 | allocator.free(gencat.s2); | ||
| 75 | allocator.free(gencat.s3); | ||
| 76 | } | ||
| 77 | |||
| 78 | /// Lookup the General Category for `cp`. | 52 | /// Lookup the General Category for `cp`. |
| 79 | pub fn gc(gencat: GeneralCategories, cp: u21) Gc { | 53 | pub fn gc(cp: u21) Gc { |
| 80 | return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); | 54 | return @enumFromInt(general_categories.s3[general_categories.s2[general_categories.s1[cp >> 8] + (cp & 0xff)]]); |
| 81 | } | 55 | } |
| 82 | 56 | ||
| 83 | /// True if `cp` has a C general category. | 57 | /// True if `cp` has a C general category. |
| 84 | pub fn isControl(gencat: GeneralCategories, cp: u21) bool { | 58 | pub fn isControl(cp: u21) bool { |
| 85 | return switch (gencat.gc(cp)) { | 59 | return switch (gc(cp)) { |
| 86 | .Cc, | 60 | .Cc, |
| 87 | .Cf, | 61 | .Cf, |
| 88 | .Cn, | 62 | .Cn, |
| @@ -94,8 +68,8 @@ pub fn isControl(gencat: GeneralCategories, cp: u21) bool { | |||
| 94 | } | 68 | } |
| 95 | 69 | ||
| 96 | /// True if `cp` has an L general category. | 70 | /// True if `cp` has an L general category. |
| 97 | pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { | 71 | pub fn isLetter(cp: u21) bool { |
| 98 | return switch (gencat.gc(cp)) { | 72 | return switch (gc(cp)) { |
| 99 | .Ll, | 73 | .Ll, |
| 100 | .Lm, | 74 | .Lm, |
| 101 | .Lo, | 75 | .Lo, |
| @@ -107,8 +81,8 @@ pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { | |||
| 107 | } | 81 | } |
| 108 | 82 | ||
| 109 | /// True if `cp` has an M general category. | 83 | /// True if `cp` has an M general category. |
| 110 | pub fn isMark(gencat: GeneralCategories, cp: u21) bool { | 84 | pub fn isMark(cp: u21) bool { |
| 111 | return switch (gencat.gc(cp)) { | 85 | return switch (gc(cp)) { |
| 112 | .Mc, | 86 | .Mc, |
| 113 | .Me, | 87 | .Me, |
| 114 | .Mn, | 88 | .Mn, |
| @@ -118,8 +92,8 @@ pub fn isMark(gencat: GeneralCategories, cp: u21) bool { | |||
| 118 | } | 92 | } |
| 119 | 93 | ||
| 120 | /// True if `cp` has an N general category. | 94 | /// True if `cp` has an N general category. |
| 121 | pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { | 95 | pub fn isNumber(cp: u21) bool { |
| 122 | return switch (gencat.gc(cp)) { | 96 | return switch (gc(cp)) { |
| 123 | .Nd, | 97 | .Nd, |
| 124 | .Nl, | 98 | .Nl, |
| 125 | .No, | 99 | .No, |
| @@ -129,8 +103,8 @@ pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { | |||
| 129 | } | 103 | } |
| 130 | 104 | ||
| 131 | /// True if `cp` has a P general category. | 105 | /// True if `cp` has a P general category. |
| 132 | pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { | 106 | pub fn isPunctuation(cp: u21) bool { |
| 133 | return switch (gencat.gc(cp)) { | 107 | return switch (gc(cp)) { |
| 134 | .Pc, | 108 | .Pc, |
| 135 | .Pd, | 109 | .Pd, |
| 136 | .Pe, | 110 | .Pe, |
| @@ -144,8 +118,8 @@ pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { | |||
| 144 | } | 118 | } |
| 145 | 119 | ||
| 146 | /// True if `cp` has an S general category. | 120 | /// True if `cp` has an S general category. |
| 147 | pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { | 121 | pub fn isSymbol(cp: u21) bool { |
| 148 | return switch (gencat.gc(cp)) { | 122 | return switch (gc(cp)) { |
| 149 | .Sc, | 123 | .Sc, |
| 150 | .Sk, | 124 | .Sk, |
| 151 | .Sm, | 125 | .Sm, |
| @@ -156,8 +130,8 @@ pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { | |||
| 156 | } | 130 | } |
| 157 | 131 | ||
| 158 | /// True if `cp` has a Z general category. | 132 | /// True if `cp` has a Z general category. |
| 159 | pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { | 133 | pub fn isSeparator(cp: u21) bool { |
| 160 | return switch (gencat.gc(cp)) { | 134 | return switch (gc(cp)) { |
| 161 | .Zl, | 135 | .Zl, |
| 162 | .Zp, | 136 | .Zp, |
| 163 | .Zs, | 137 | .Zs, |
| @@ -165,19 +139,3 @@ pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { | |||
| 165 | else => false, | 139 | else => false, |
| 166 | }; | 140 | }; |
| 167 | } | 141 | } |
| 168 | |||
| 169 | fn testAllocator(allocator: Allocator) !void { | ||
| 170 | var gen_cat = try GeneralCategories.init(allocator); | ||
| 171 | gen_cat.deinit(allocator); | ||
| 172 | } | ||
| 173 | |||
| 174 | test "Allocation failure" { | ||
| 175 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 176 | } | ||
| 177 | |||
| 178 | const std = @import("std"); | ||
| 179 | const builtin = @import("builtin"); | ||
| 180 | const compress = std.compress; | ||
| 181 | const mem = std.mem; | ||
| 182 | const testing = std.testing; | ||
| 183 | const Allocator = mem.Allocator; | ||
diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig index 33096fc..24b67a0 100644 --- a/src/LetterCasing.zig +++ b/src/LetterCasing.zig | |||
| @@ -1,120 +1,58 @@ | |||
| 1 | const CodePointIterator = @import("code_point").Iterator; | 1 | const CodePointIterator = @import("code_point").Iterator; |
| 2 | 2 | const GeneralCategories = @import("GeneralCategories"); | |
| 3 | case_map: [][2]u21 = undefined, | 3 | |
| 4 | prop_s1: []u16 = undefined, | 4 | const Data = struct { |
| 5 | prop_s2: []u8 = undefined, | 5 | s1: []const u16 = undefined, |
| 6 | 6 | s2: []const u44 = undefined, | |
| 7 | const LetterCasing = @This(); | 7 | }; |
| 8 | 8 | ||
| 9 | pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { | 9 | const letter_casing = letter_casing: { |
| 10 | var case = LetterCasing{}; | 10 | const data = @import("case"); |
| 11 | try case.setup(allocator); | 11 | break :letter_casing Data{ |
| 12 | return case; | 12 | .s1 = &data.s1, |
| 13 | } | 13 | .s2 = &data.s2, |
| 14 | |||
| 15 | pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void { | ||
| 16 | case.setupInner(allocator) catch |err| { | ||
| 17 | switch (err) { | ||
| 18 | error.OutOfMemory => |e| return e, | ||
| 19 | else => unreachable, | ||
| 20 | } | ||
| 21 | }; | 14 | }; |
| 22 | } | 15 | }; |
| 23 | |||
| 24 | inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void { | ||
| 25 | const endian = builtin.cpu.arch.endian(); | ||
| 26 | |||
| 27 | self.case_map = try allocator.alloc([2]u21, 0x110000); | ||
| 28 | errdefer allocator.free(self.case_map); | ||
| 29 | |||
| 30 | for (0..0x110000) |i| { | ||
| 31 | const cp: u21 = @intCast(i); | ||
| 32 | self.case_map[cp] = .{ cp, cp }; | ||
| 33 | } | ||
| 34 | |||
| 35 | // Uppercase | ||
| 36 | const upper_bytes = @embedFile("upper"); | ||
| 37 | var upper_fbs = std.io.fixedBufferStream(upper_bytes); | ||
| 38 | var upper_reader = upper_fbs.reader(); | ||
| 39 | |||
| 40 | while (true) { | ||
| 41 | const cp = try upper_reader.readInt(i24, endian); | ||
| 42 | if (cp == 0) break; | ||
| 43 | const diff = try upper_reader.readInt(i24, endian); | ||
| 44 | self.case_map[@intCast(cp)][0] = @intCast(cp + diff); | ||
| 45 | } | ||
| 46 | |||
| 47 | // Lowercase | ||
| 48 | const lower_bytes = @embedFile("lower"); | ||
| 49 | var lower_fbs = std.io.fixedBufferStream(lower_bytes); | ||
| 50 | var lower_reader = lower_fbs.reader(); | ||
| 51 | |||
| 52 | while (true) { | ||
| 53 | const cp = try lower_reader.readInt(i24, endian); | ||
| 54 | if (cp == 0) break; | ||
| 55 | const diff = try lower_reader.readInt(i24, endian); | ||
| 56 | self.case_map[@intCast(cp)][1] = @intCast(cp + diff); | ||
| 57 | } | ||
| 58 | |||
| 59 | // Case properties | ||
| 60 | const cp_bytes = @embedFile("case_prop"); | ||
| 61 | var cp_fbs = std.io.fixedBufferStream(cp_bytes); | ||
| 62 | var cp_reader = cp_fbs.reader(); | ||
| 63 | |||
| 64 | const stage_1_len: u16 = try cp_reader.readInt(u16, endian); | ||
| 65 | self.prop_s1 = try allocator.alloc(u16, stage_1_len); | ||
| 66 | errdefer allocator.free(self.prop_s1); | ||
| 67 | for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian); | ||
| 68 | |||
| 69 | const stage_2_len: u16 = try cp_reader.readInt(u16, endian); | ||
| 70 | self.prop_s2 = try allocator.alloc(u8, stage_2_len); | ||
| 71 | errdefer allocator.free(self.prop_s2); | ||
| 72 | _ = try cp_reader.readAll(self.prop_s2); | ||
| 73 | } | ||
| 74 | |||
| 75 | pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void { | ||
| 76 | allocator.free(self.case_map); | ||
| 77 | allocator.free(self.prop_s1); | ||
| 78 | allocator.free(self.prop_s2); | ||
| 79 | } | ||
| 80 | 16 | ||
| 81 | // Returns true if `cp` is either upper, lower, or title case. | 17 | // Returns true if `cp` is either upper, lower, or title case. |
| 82 | pub fn isCased(self: LetterCasing, cp: u21) bool { | 18 | pub fn isCased(cp: u21) bool { |
| 83 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 19 | return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt; |
| 84 | } | 20 | } |
| 85 | 21 | ||
| 86 | // Returns true if `cp` is uppercase. | 22 | // Returns true if `cp` is uppercase. |
| 87 | pub fn isUpper(self: LetterCasing, cp: u21) bool { | 23 | pub fn isUpper(cp: u21) bool { |
| 88 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 24 | // isUpper is true if we have a mapping to a lower character (bit 1) |
| 25 | return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | ||
| 89 | } | 26 | } |
| 90 | 27 | ||
| 91 | /// Returns true if `str` is all uppercase. | 28 | /// Returns true if `str` is all non-lowercase. |
| 92 | pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { | 29 | pub fn isUpperStr(str: []const u8) bool { |
| 93 | var iter = CodePointIterator{ .bytes = str }; | 30 | var iter = CodePointIterator{ .bytes = str }; |
| 94 | 31 | ||
| 95 | return while (iter.next()) |cp| { | 32 | return while (iter.next()) |cp| { |
| 96 | if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; | 33 | if (isLower(cp.code)) break false; |
| 97 | } else true; | 34 | } else true; |
| 98 | } | 35 | } |
| 99 | 36 | ||
| 100 | test "isUpperStr" { | 37 | test "isUpperStr" { |
| 101 | const cd = try init(testing.allocator); | 38 | try testing.expect(isUpperStr("HELLO, WORLD 2112!")); |
| 102 | defer cd.deinit(testing.allocator); | 39 | try testing.expect(!isUpperStr("hello, world 2112!")); |
| 103 | 40 | try testing.expect(!isUpperStr("Hello, World 2112!")); | |
| 104 | try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); | ||
| 105 | try testing.expect(!cd.isUpperStr("hello, world 2112!")); | ||
| 106 | try testing.expect(!cd.isUpperStr("Hello, World 2112!")); | ||
| 107 | } | 41 | } |
| 108 | 42 | ||
| 109 | /// Returns uppercase mapping for `cp`. | 43 | /// Returns uppercase mapping for `cp`. |
| 110 | pub fn toUpper(self: LetterCasing, cp: u21) u21 { | 44 | pub fn toUpper(cp: u21) u21 { |
| 111 | return self.case_map[cp][0]; | 45 | const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; |
| 46 | if (case_prop & 2 == 2) { | ||
| 47 | return @intCast(case_prop >> (21 + 2)); | ||
| 48 | } else { | ||
| 49 | return cp; | ||
| 50 | } | ||
| 112 | } | 51 | } |
| 113 | 52 | ||
| 114 | /// Returns a new string with all letters in uppercase. | 53 | /// Returns a new string with all letters in uppercase. |
| 115 | /// Caller must free returned bytes with `allocator`. | 54 | /// Caller must free returned bytes with `allocator`. |
| 116 | pub fn toUpperStr( | 55 | pub fn toUpperStr( |
| 117 | self: LetterCasing, | ||
| 118 | allocator: mem.Allocator, | 56 | allocator: mem.Allocator, |
| 119 | str: []const u8, | 57 | str: []const u8, |
| 120 | ) ![]u8 { | 58 | ) ![]u8 { |
| @@ -125,7 +63,7 @@ pub fn toUpperStr( | |||
| 125 | var buf: [4]u8 = undefined; | 63 | var buf: [4]u8 = undefined; |
| 126 | 64 | ||
| 127 | while (iter.next()) |cp| { | 65 | while (iter.next()) |cp| { |
| 128 | const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); | 66 | const len = try unicode.utf8Encode(toUpper(cp.code), &buf); |
| 129 | try bytes.appendSlice(buf[0..len]); | 67 | try bytes.appendSlice(buf[0..len]); |
| 130 | } | 68 | } |
| 131 | 69 | ||
| @@ -133,46 +71,45 @@ pub fn toUpperStr( | |||
| 133 | } | 71 | } |
| 134 | 72 | ||
| 135 | test "toUpperStr" { | 73 | test "toUpperStr" { |
| 136 | const cd = try init(testing.allocator); | 74 | const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!"); |
| 137 | defer cd.deinit(testing.allocator); | ||
| 138 | |||
| 139 | const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); | ||
| 140 | defer testing.allocator.free(uppered); | 75 | defer testing.allocator.free(uppered); |
| 141 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); | 76 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); |
| 142 | } | 77 | } |
| 143 | 78 | ||
| 144 | // Returns true if `cp` is lowercase. | 79 | // Returns true if `cp` is lowercase. |
| 145 | pub fn isLower(self: LetterCasing, cp: u21) bool { | 80 | pub fn isLower(cp: u21) bool { |
| 146 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 81 | // isLower is true if we have a mapping to an upper character (bit 2) |
| 82 | return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | ||
| 147 | } | 83 | } |
| 148 | 84 | ||
| 149 | /// Returns true if `str` is all lowercase. | 85 | /// Returns true if `str` is all non-uppercase. |
| 150 | pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { | 86 | pub fn isLowerStr(str: []const u8) bool { |
| 151 | var iter = CodePointIterator{ .bytes = str }; | 87 | var iter = CodePointIterator{ .bytes = str }; |
| 152 | 88 | ||
| 153 | return while (iter.next()) |cp| { | 89 | return while (iter.next()) |cp| { |
| 154 | if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; | 90 | if (isUpper(cp.code)) break false; |
| 155 | } else true; | 91 | } else true; |
| 156 | } | 92 | } |
| 157 | 93 | ||
| 158 | test "isLowerStr" { | 94 | test "isLowerStr" { |
| 159 | const cd = try init(testing.allocator); | 95 | try testing.expect(isLowerStr("hello, world 2112!")); |
| 160 | defer cd.deinit(testing.allocator); | 96 | try testing.expect(!isLowerStr("HELLO, WORLD 2112!")); |
| 161 | 97 | try testing.expect(!isLowerStr("Hello, World 2112!")); | |
| 162 | try testing.expect(cd.isLowerStr("hello, world 2112!")); | ||
| 163 | try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!")); | ||
| 164 | try testing.expect(!cd.isLowerStr("Hello, World 2112!")); | ||
| 165 | } | 98 | } |
| 166 | 99 | ||
| 167 | /// Returns lowercase mapping for `cp`. | 100 | /// Returns lowercase mapping for `cp`. |
| 168 | pub fn toLower(self: LetterCasing, cp: u21) u21 { | 101 | pub fn toLower(cp: u21) u21 { |
| 169 | return self.case_map[cp][1]; | 102 | const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; |
| 103 | if (case_prop & 1 == 1) { | ||
| 104 | return @intCast((case_prop >> 2) & 0x1FFFFF); | ||
| 105 | } else { | ||
| 106 | return cp; | ||
| 107 | } | ||
| 170 | } | 108 | } |
| 171 | 109 | ||
| 172 | /// Returns a new string with all letters in lowercase. | 110 | /// Returns a new string with all letters in lowercase. |
| 173 | /// Caller must free returned bytes with `allocator`. | 111 | /// Caller must free returned bytes with `allocator`. |
| 174 | pub fn toLowerStr( | 112 | pub fn toLowerStr( |
| 175 | self: LetterCasing, | ||
| 176 | allocator: mem.Allocator, | 113 | allocator: mem.Allocator, |
| 177 | str: []const u8, | 114 | str: []const u8, |
| 178 | ) ![]u8 { | 115 | ) ![]u8 { |
| @@ -183,7 +120,7 @@ pub fn toLowerStr( | |||
| 183 | var buf: [4]u8 = undefined; | 120 | var buf: [4]u8 = undefined; |
| 184 | 121 | ||
| 185 | while (iter.next()) |cp| { | 122 | while (iter.next()) |cp| { |
| 186 | const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); | 123 | const len = try unicode.utf8Encode(toLower(cp.code), &buf); |
| 187 | try bytes.appendSlice(buf[0..len]); | 124 | try bytes.appendSlice(buf[0..len]); |
| 188 | } | 125 | } |
| 189 | 126 | ||
| @@ -191,27 +128,13 @@ pub fn toLowerStr( | |||
| 191 | } | 128 | } |
| 192 | 129 | ||
| 193 | test "toLowerStr" { | 130 | test "toLowerStr" { |
| 194 | const cd = try init(testing.allocator); | 131 | const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!"); |
| 195 | defer cd.deinit(testing.allocator); | ||
| 196 | |||
| 197 | const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); | ||
| 198 | defer testing.allocator.free(lowered); | 132 | defer testing.allocator.free(lowered); |
| 199 | try testing.expectEqualStrings("hello, world 2112!", lowered); | 133 | try testing.expectEqualStrings("hello, world 2112!", lowered); |
| 200 | } | 134 | } |
| 201 | 135 | ||
| 202 | fn testAllocator(allocator: Allocator) !void { | ||
| 203 | var prop = try LetterCasing.init(allocator); | ||
| 204 | prop.deinit(allocator); | ||
| 205 | } | ||
| 206 | |||
| 207 | test "Allocation failure" { | ||
| 208 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 209 | } | ||
| 210 | |||
| 211 | const std = @import("std"); | 136 | const std = @import("std"); |
| 212 | const builtin = @import("builtin"); | 137 | const builtin = @import("builtin"); |
| 213 | const compress = std.compress; | ||
| 214 | const mem = std.mem; | 138 | const mem = std.mem; |
| 215 | const Allocator = std.mem.Allocator; | ||
| 216 | const testing = std.testing; | 139 | const testing = std.testing; |
| 217 | const unicode = std.unicode; | 140 | const unicode = std.unicode; |
diff --git a/src/Properties.zig b/src/Properties.zig index 432d176..f8c7cfc 100644 --- a/src/Properties.zig +++ b/src/Properties.zig | |||
| @@ -1,177 +1,108 @@ | |||
| 1 | //! Properties module | 1 | //! Properties module |
| 2 | 2 | ||
| 3 | core_s1: []u16 = undefined, | 3 | const Data = struct { |
| 4 | core_s2: []u8 = undefined, | 4 | core_s1: []const u16 = undefined, |
| 5 | props_s1: []u16 = undefined, | 5 | core_s2: []const u8 = undefined, |
| 6 | props_s2: []u8 = undefined, | 6 | props_s1: []const u16 = undefined, |
| 7 | num_s1: []u16 = undefined, | 7 | props_s2: []const u8 = undefined, |
| 8 | num_s2: []u8 = undefined, | 8 | num_s1: []const u16 = undefined, |
| 9 | 9 | num_s2: []const u8 = undefined, | |
| 10 | const Properties = @This(); | 10 | }; |
| 11 | 11 | ||
| 12 | pub fn init(allocator: Allocator) Allocator.Error!Properties { | 12 | const properties = properties: { |
| 13 | var props = Properties{}; | 13 | const core_props = @import("core_props"); |
| 14 | try props.setup(allocator); | 14 | const props_data = @import("props"); |
| 15 | return props; | 15 | const numeric = @import("numeric"); |
| 16 | } | 16 | break :properties Data{ |
| 17 | 17 | .core_s1 = &core_props.s1, | |
| 18 | pub fn setup(props: *Properties, allocator: Allocator) Allocator.Error!void { | 18 | .core_s2 = &core_props.s2, |
| 19 | props.setupInner(allocator) catch |err| { | 19 | .props_s1 = &props_data.s1, |
| 20 | switch (err) { | 20 | .props_s2 = &props_data.s2, |
| 21 | error.OutOfMemory => |e| return e, | 21 | .num_s1 = &numeric.s1, |
| 22 | else => unreachable, | 22 | .num_s2 = &numeric.s2, |
| 23 | } | ||
| 24 | }; | 23 | }; |
| 25 | } | 24 | }; |
| 26 | |||
| 27 | inline fn setupInner(props: *Properties, allocator: Allocator) !void { | ||
| 28 | const endian = builtin.cpu.arch.endian(); | ||
| 29 | |||
| 30 | // Process DerivedCoreProperties.txt | ||
| 31 | const core_bytes = @embedFile("core_props"); | ||
| 32 | var core_fbs = std.io.fixedBufferStream(core_bytes); | ||
| 33 | var core_reader = core_fbs.reader(); | ||
| 34 | |||
| 35 | const core_stage_1_len: u16 = try core_reader.readInt(u16, endian); | ||
| 36 | props.core_s1 = try allocator.alloc(u16, core_stage_1_len); | ||
| 37 | errdefer allocator.free(props.core_s1); | ||
| 38 | for (0..core_stage_1_len) |i| props.core_s1[i] = try core_reader.readInt(u16, endian); | ||
| 39 | |||
| 40 | const core_stage_2_len: u16 = try core_reader.readInt(u16, endian); | ||
| 41 | props.core_s2 = try allocator.alloc(u8, core_stage_2_len); | ||
| 42 | errdefer allocator.free(props.core_s2); | ||
| 43 | _ = try core_reader.readAll(props.core_s2); | ||
| 44 | |||
| 45 | // Process PropList.txt | ||
| 46 | const props_bytes = @embedFile("props"); | ||
| 47 | var props_fbs = std.io.fixedBufferStream(props_bytes); | ||
| 48 | var props_reader = props_fbs.reader(); | ||
| 49 | |||
| 50 | const stage_1_len: u16 = try props_reader.readInt(u16, endian); | ||
| 51 | props.props_s1 = try allocator.alloc(u16, stage_1_len); | ||
| 52 | errdefer allocator.free(props.props_s1); | ||
| 53 | for (0..stage_1_len) |i| props.props_s1[i] = try props_reader.readInt(u16, endian); | ||
| 54 | |||
| 55 | const stage_2_len: u16 = try props_reader.readInt(u16, endian); | ||
| 56 | props.props_s2 = try allocator.alloc(u8, stage_2_len); | ||
| 57 | errdefer allocator.free(props.props_s2); | ||
| 58 | _ = try props_reader.readAll(props.props_s2); | ||
| 59 | |||
| 60 | // Process DerivedNumericType.txt | ||
| 61 | const num_bytes = @embedFile("numeric"); | ||
| 62 | var num_fbs = std.io.fixedBufferStream(num_bytes); | ||
| 63 | var num_reader = num_fbs.reader(); | ||
| 64 | |||
| 65 | const num_stage_1_len: u16 = try num_reader.readInt(u16, endian); | ||
| 66 | props.num_s1 = try allocator.alloc(u16, num_stage_1_len); | ||
| 67 | errdefer allocator.free(props.num_s1); | ||
| 68 | for (0..num_stage_1_len) |i| props.num_s1[i] = try num_reader.readInt(u16, endian); | ||
| 69 | |||
| 70 | const num_stage_2_len: u16 = try num_reader.readInt(u16, endian); | ||
| 71 | props.num_s2 = try allocator.alloc(u8, num_stage_2_len); | ||
| 72 | errdefer allocator.free(props.num_s2); | ||
| 73 | _ = try num_reader.readAll(props.num_s2); | ||
| 74 | } | ||
| 75 | 25 | ||
| 76 | pub fn deinit(self: *const Properties, allocator: Allocator) void { | 26 | const Properties = @This(); |
| 77 | allocator.free(self.core_s1); | ||
| 78 | allocator.free(self.core_s2); | ||
| 79 | allocator.free(self.props_s1); | ||
| 80 | allocator.free(self.props_s2); | ||
| 81 | allocator.free(self.num_s1); | ||
| 82 | allocator.free(self.num_s2); | ||
| 83 | } | ||
| 84 | 27 | ||
| 85 | /// True if `cp` is a mathematical symbol. | 28 | /// True if `cp` is a mathematical symbol. |
| 86 | pub fn isMath(self: Properties, cp: u21) bool { | 29 | pub fn isMath(cp: u21) bool { |
| 87 | return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 30 | return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; |
| 88 | } | 31 | } |
| 89 | 32 | ||
| 90 | /// True if `cp` is an alphabetic character. | 33 | /// True if `cp` is an alphabetic character. |
| 91 | pub fn isAlphabetic(self: Properties, cp: u21) bool { | 34 | pub fn isAlphabetic(cp: u21) bool { |
| 92 | return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 35 | return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; |
| 93 | } | 36 | } |
| 94 | 37 | ||
| 95 | /// True if `cp` is a valid identifier start character. | 38 | /// True if `cp` is a valid identifier start character. |
| 96 | pub fn isIdStart(self: Properties, cp: u21) bool { | 39 | pub fn isIdStart(cp: u21) bool { |
| 97 | return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 40 | return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; |
| 98 | } | 41 | } |
| 99 | 42 | ||
| 100 | /// True if `cp` is a valid identifier continuation character. | 43 | /// True if `cp` is a valid identifier continuation character. |
| 101 | pub fn isIdContinue(self: Properties, cp: u21) bool { | 44 | pub fn isIdContinue(cp: u21) bool { |
| 102 | return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; | 45 | return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; |
| 103 | } | 46 | } |
| 104 | 47 | ||
| 105 | /// True if `cp` is a valid extended identifier start character. | 48 | /// True if `cp` is a valid extended identifier start character. |
| 106 | pub fn isXidStart(self: Properties, cp: u21) bool { | 49 | pub fn isXidStart(cp: u21) bool { |
| 107 | return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; | 50 | return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; |
| 108 | } | 51 | } |
| 109 | 52 | ||
| 110 | /// True if `cp` is a valid extended identifier continuation character. | 53 | /// True if `cp` is a valid extended identifier continuation character. |
| 111 | pub fn isXidContinue(self: Properties, cp: u21) bool { | 54 | pub fn isXidContinue(cp: u21) bool { |
| 112 | return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; | 55 | return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; |
| 113 | } | 56 | } |
| 114 | 57 | ||
| 115 | /// True if `cp` is a whitespace character. | 58 | /// True if `cp` is a whitespace character. |
| 116 | pub fn isWhitespace(self: Properties, cp: u21) bool { | 59 | pub fn isWhitespace(cp: u21) bool { |
| 117 | return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 60 | return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; |
| 118 | } | 61 | } |
| 119 | 62 | ||
| 120 | /// True if `cp` is a hexadecimal digit. | 63 | /// True if `cp` is a hexadecimal digit. |
| 121 | pub fn isHexDigit(self: Properties, cp: u21) bool { | 64 | pub fn isHexDigit(cp: u21) bool { |
| 122 | return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 65 | return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; |
| 123 | } | 66 | } |
| 124 | 67 | ||
| 125 | /// True if `cp` is a diacritic mark. | 68 | /// True if `cp` is a diacritic mark. |
| 126 | pub fn isDiacritic(self: Properties, cp: u21) bool { | 69 | pub fn isDiacritic(cp: u21) bool { |
| 127 | return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 70 | return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; |
| 128 | } | 71 | } |
| 129 | 72 | ||
| 130 | /// True if `cp` is numeric. | 73 | /// True if `cp` is numeric. |
| 131 | pub fn isNumeric(self: Properties, cp: u21) bool { | 74 | pub fn isNumeric(cp: u21) bool { |
| 132 | return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 75 | return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; |
| 133 | } | 76 | } |
| 134 | 77 | ||
| 135 | /// True if `cp` is a digit. | 78 | /// True if `cp` is a digit. |
| 136 | pub fn isDigit(self: Properties, cp: u21) bool { | 79 | pub fn isDigit(cp: u21) bool { |
| 137 | return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 80 | return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; |
| 138 | } | 81 | } |
| 139 | 82 | ||
| 140 | /// True if `cp` is decimal. | 83 | /// True if `cp` is decimal. |
| 141 | pub fn isDecimal(self: Properties, cp: u21) bool { | 84 | pub fn isDecimal(cp: u21) bool { |
| 142 | return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 85 | return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; |
| 143 | } | 86 | } |
| 144 | 87 | ||
| 145 | test "Props" { | 88 | test "Props" { |
| 146 | const self = try init(testing.allocator); | 89 | try testing.expect(Properties.isHexDigit('F')); |
| 147 | defer self.deinit(testing.allocator); | 90 | try testing.expect(Properties.isHexDigit('a')); |
| 148 | 91 | try testing.expect(Properties.isHexDigit('8')); | |
| 149 | try testing.expect(self.isHexDigit('F')); | 92 | try testing.expect(!Properties.isHexDigit('z')); |
| 150 | try testing.expect(self.isHexDigit('a')); | 93 | |
| 151 | try testing.expect(self.isHexDigit('8')); | 94 | try testing.expect(Properties.isDiacritic('\u{301}')); |
| 152 | try testing.expect(!self.isHexDigit('z')); | 95 | try testing.expect(Properties.isAlphabetic('A')); |
| 153 | 96 | try testing.expect(!Properties.isAlphabetic('3')); | |
| 154 | try testing.expect(self.isDiacritic('\u{301}')); | 97 | try testing.expect(Properties.isMath('+')); |
| 155 | try testing.expect(self.isAlphabetic('A')); | 98 | |
| 156 | try testing.expect(!self.isAlphabetic('3')); | 99 | try testing.expect(Properties.isNumeric('\u{277f}')); |
| 157 | try testing.expect(self.isMath('+')); | 100 | try testing.expect(Properties.isDigit('\u{2070}')); |
| 158 | 101 | try testing.expect(Properties.isDecimal('3')); | |
| 159 | try testing.expect(self.isNumeric('\u{277f}')); | 102 | |
| 160 | try testing.expect(self.isDigit('\u{2070}')); | 103 | try testing.expect(!Properties.isNumeric('1')); |
| 161 | try testing.expect(self.isDecimal('3')); | 104 | try testing.expect(!Properties.isDigit('2')); |
| 162 | 105 | try testing.expect(!Properties.isDecimal('g')); | |
| 163 | try testing.expect(!self.isNumeric('1')); | ||
| 164 | try testing.expect(!self.isDigit('2')); | ||
| 165 | try testing.expect(!self.isDecimal('g')); | ||
| 166 | } | ||
| 167 | |||
| 168 | fn testAllocator(allocator: Allocator) !void { | ||
| 169 | var prop = try Properties.init(allocator); | ||
| 170 | prop.deinit(allocator); | ||
| 171 | } | ||
| 172 | |||
| 173 | test "Allocation failure" { | ||
| 174 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 175 | } | 106 | } |
| 176 | 107 | ||
| 177 | const std = @import("std"); | 108 | const std = @import("std"); |
diff --git a/src/Scripts.zig b/src/Scripts.zig index 719b01f..4938318 100644 --- a/src/Scripts.zig +++ b/src/Scripts.zig | |||
| @@ -1,8 +1,18 @@ | |||
| 1 | //! Scripts Module | 1 | //! Scripts Module |
| 2 | const Data = struct { | ||
| 3 | s1: []const u16 = undefined, | ||
| 4 | s2: []const u8 = undefined, | ||
| 5 | s3: []const u8 = undefined, | ||
| 6 | }; | ||
| 2 | 7 | ||
| 3 | s1: []u16 = undefined, | 8 | const scripts = scripts: { |
| 4 | s2: []u8 = undefined, | 9 | const data = @import("script"); |
| 5 | s3: []u8 = undefined, | 10 | break :scripts Data{ |
| 11 | .s1 = &data.s1, | ||
| 12 | .s2 = &data.s2, | ||
| 13 | .s3 = &data.s3, | ||
| 14 | }; | ||
| 15 | }; | ||
| 6 | 16 | ||
| 7 | /// Scripts enum | 17 | /// Scripts enum |
| 8 | pub const Script = enum { | 18 | pub const Script = enum { |
| @@ -178,76 +188,20 @@ pub const Script = enum { | |||
| 178 | Yi, | 188 | Yi, |
| 179 | Zanabazar_Square, | 189 | Zanabazar_Square, |
| 180 | }; | 190 | }; |
| 181 | const Scripts = @This(); | ||
| 182 | |||
| 183 | pub fn init(allocator: Allocator) Allocator.Error!Scripts { | ||
| 184 | var scripts = Scripts{}; | ||
| 185 | try scripts.setup(allocator); | ||
| 186 | return scripts; | ||
| 187 | } | ||
| 188 | |||
| 189 | pub fn setup(scripts: *Scripts, allocator: Allocator) Allocator.Error!void { | ||
| 190 | scripts.setupInner(allocator) catch |err| { | ||
| 191 | switch (err) { | ||
| 192 | error.OutOfMemory => |e| return e, | ||
| 193 | else => unreachable, | ||
| 194 | } | ||
| 195 | }; | ||
| 196 | } | ||
| 197 | |||
| 198 | inline fn setupInner(scripts: *Scripts, allocator: mem.Allocator) !void { | ||
| 199 | const in_bytes = @embedFile("scripts"); | ||
| 200 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 201 | var reader = in_fbs.reader(); | ||
| 202 | |||
| 203 | const endian = builtin.cpu.arch.endian(); | ||
| 204 | |||
| 205 | const s1_len: u16 = try reader.readInt(u16, endian); | ||
| 206 | scripts.s1 = try allocator.alloc(u16, s1_len); | ||
| 207 | errdefer allocator.free(scripts.s1); | ||
| 208 | for (0..s1_len) |i| scripts.s1[i] = try reader.readInt(u16, endian); | ||
| 209 | |||
| 210 | const s2_len: u16 = try reader.readInt(u16, endian); | ||
| 211 | scripts.s2 = try allocator.alloc(u8, s2_len); | ||
| 212 | errdefer allocator.free(scripts.s2); | ||
| 213 | _ = try reader.readAll(scripts.s2); | ||
| 214 | |||
| 215 | const s3_len: u16 = try reader.readInt(u8, endian); | ||
| 216 | scripts.s3 = try allocator.alloc(u8, s3_len); | ||
| 217 | errdefer allocator.free(scripts.s3); | ||
| 218 | _ = try reader.readAll(scripts.s3); | ||
| 219 | } | ||
| 220 | |||
| 221 | pub fn deinit(self: *const Scripts, allocator: mem.Allocator) void { | ||
| 222 | allocator.free(self.s1); | ||
| 223 | allocator.free(self.s2); | ||
| 224 | allocator.free(self.s3); | ||
| 225 | } | ||
| 226 | 191 | ||
| 227 | /// Lookup the Script type for `cp`. | 192 | /// Lookup the Script type for `cp`. |
| 228 | pub fn script(self: Scripts, cp: u21) ?Script { | 193 | pub fn script(cp: u21) ?Script { |
| 229 | const byte = self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]; | 194 | const byte = scripts.s3[scripts.s2[scripts.s1[cp >> 8] + (cp & 0xff)]]; |
| 230 | if (byte == 0) return null; | 195 | if (byte == 0) return null; |
| 231 | return @enumFromInt(byte); | 196 | return @enumFromInt(byte); |
| 232 | } | 197 | } |
| 233 | 198 | ||
| 234 | test "script" { | 199 | test "script" { |
| 235 | const self = try init(std.testing.allocator); | 200 | try testing.expectEqual(Script.Latin, script('A').?); |
| 236 | defer self.deinit(std.testing.allocator); | 201 | // try testing.expectEqual(Script.Deseret, script('𐐌').?); |
| 237 | try testing.expectEqual(Script.Latin, self.script('A').?); | ||
| 238 | } | ||
| 239 | |||
| 240 | fn testAllocator(allocator: Allocator) !void { | ||
| 241 | var prop = try Scripts.init(allocator); | ||
| 242 | prop.deinit(allocator); | ||
| 243 | } | ||
| 244 | |||
| 245 | test "Allocation failure" { | ||
| 246 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 247 | } | 202 | } |
| 248 | 203 | ||
| 249 | const std = @import("std"); | 204 | const std = @import("std"); |
| 250 | const builtin = @import("builtin"); | 205 | const builtin = @import("builtin"); |
| 251 | const mem = std.mem; | 206 | const unicode = std.unicode; |
| 252 | const Allocator = mem.Allocator; | ||
| 253 | const testing = std.testing; | 207 | const testing = std.testing; |