diff options
Diffstat (limited to 'src/CaseData.zig')
| -rw-r--r-- | src/CaseData.zig | 223 |
1 files changed, 223 insertions, 0 deletions
diff --git a/src/CaseData.zig b/src/CaseData.zig new file mode 100644 index 0000000..38830e3 --- /dev/null +++ b/src/CaseData.zig | |||
| @@ -0,0 +1,223 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | const testing = std.testing; | ||
| 6 | const unicode = std.unicode; | ||
| 7 | |||
| 8 | const CodePointIterator = @import("code_point").Iterator; | ||
| 9 | |||
/// Allocator used by `init` to create the tables below and by `deinit` to
/// free them.
allocator: mem.Allocator,
/// Case mappings indexed by code point. Each entry holds the upper ([0]),
/// lower ([1]) and title ([2]) case mapping for that code point.
case_map: [][3]u21,
/// First stage of the case property lookup, indexed by `cp >> 8`. Each value
/// is an offset into `prop_s2`. Left undefined until `init` fills it.
prop_s1: []u16 = undefined,
/// Second stage of the case property lookup, indexed by
/// `prop_s1[cp >> 8] + (cp & 0xff)`. The accessors test bit mask 1 for
/// lowercase, 2 for uppercase and 4 for cased. Left undefined until `init`
/// fills it.
prop_s2: []u8 = undefined,

/// The struct type defined by this file.
const Self = @This();
| 16 | |||
/// Builds the case tables from the embedded, deflate-compressed data files
/// "upper", "lower", "title" and "case_prop".
///
/// `allocator` is used for every table and is stored in the returned value so
/// that `deinit` can release them. If any step fails, everything allocated up
/// to that point is freed before the error is returned. Truncated data is
/// reported as `error.EndOfStream`.
pub fn init(allocator: mem.Allocator) !Self {
    var self = Self{
        .allocator = allocator,
        .case_map = try allocator.alloc([3]u21, 0x110000),
    };
    errdefer allocator.free(self.case_map);

    // Every code point starts out mapping to itself; the data files only
    // list the code points whose mapping differs.
    for (self.case_map, 0..) |*entry, i| {
        const cp: u21 = @intCast(i);
        entry.* = .{ cp, cp, cp };
    }

    try loadCaseMapping(allocator, self.case_map, "upper", 0);
    try loadCaseMapping(allocator, self.case_map, "lower", 1);
    try loadCaseMapping(allocator, self.case_map, "title", 2);

    // Case properties: a u16 length followed by that many u16 stage-1
    // entries, then a u16 length followed by that many stage-2 bytes.
    const decompressor = compress.deflate.decompressor;
    const endian = builtin.cpu.arch.endian();

    var prop_fbs = std.io.fixedBufferStream(@embedFile("case_prop"));
    var prop_decomp = try decompressor(allocator, prop_fbs.reader(), null);
    defer prop_decomp.deinit();
    const prop_reader = prop_decomp.reader();

    const stage_1_len: u16 = try prop_reader.readInt(u16, endian);
    self.prop_s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.prop_s1);
    for (self.prop_s1) |*slot| slot.* = try prop_reader.readInt(u16, endian);

    const stage_2_len: u16 = try prop_reader.readInt(u16, endian);
    self.prop_s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.prop_s2);
    // readNoEof fails on a short read instead of silently leaving the tail of
    // the table uninitialized.
    try prop_reader.readNoEof(self.prop_s2);

    return self;
}

/// Decompresses the embedded file `name` and writes its mappings into
/// `case_map[cp][column]`.
///
/// The stream is a sequence of (code point, mapping) pairs, each value a u24
/// in the target's native byte order, terminated by a zero code point.
fn loadCaseMapping(
    allocator: mem.Allocator,
    case_map: [][3]u21,
    comptime name: []const u8,
    comptime column: usize,
) !void {
    const decompressor = compress.deflate.decompressor;
    const endian = builtin.cpu.arch.endian();

    var fbs = std.io.fixedBufferStream(@embedFile(name));
    var decomp = try decompressor(allocator, fbs.reader(), null);
    defer decomp.deinit();
    const reader = decomp.reader();

    while (true) {
        const cp = try reader.readInt(u24, endian);
        if (cp == 0) break;
        case_map[cp][column] = @intCast(try reader.readInt(u24, endian));
    }
}
| 90 | |||
/// Releases the three tables allocated by `init` using the stored allocator.
pub fn deinit(self: *Self) void {
    const gpa = self.allocator;
    gpa.free(self.prop_s2);
    gpa.free(self.prop_s1);
    gpa.free(self.case_map);
}
| 96 | |||
/// Returns true if `cp` is either upper, lower, or title case, i.e. bit
/// mask 4 is set in its property byte.
pub inline fn isCased(self: Self, cp: u21) bool {
    const stage_2_offset = self.prop_s1[cp >> 8];
    const props = self.prop_s2[stage_2_offset + (cp & 0xff)];
    return (props & 4) != 0;
}
| 101 | |||
/// Returns true if `cp` is uppercase (bit mask 2 in its property byte).
/// Code points that are not cased at all are also reported as true.
pub fn isUpper(self: Self, cp: u21) bool {
    if (self.isCased(cp)) {
        const props = self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)];
        return (props & 2) != 0;
    }
    return true;
}
| 107 | |||
/// Returns true if every code point in `str` satisfies `isUpper`.
/// An empty string yields true.
pub fn isUpperStr(self: Self, str: []const u8) bool {
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |entry| {
        if (!self.isUpper(entry.code)) return false;
    }
    return true;
}
| 116 | |||
test "isUpperStr" {
    var case_data = try init(testing.allocator);
    defer case_data.deinit();

    // Digits, spaces and punctuation are uncased and do not affect the result.
    try testing.expect(case_data.isUpperStr("HELLO, WORLD 2112!"));

    // Any lowercase letter makes the whole string fail.
    const not_upper = [_][]const u8{ "hello, world 2112!", "Hello, World 2112!" };
    for (not_upper) |str| {
        try testing.expect(!case_data.isUpperStr(str));
    }
}
| 125 | |||
/// Returns a copy of `str` in which every code point has been replaced by
/// its uppercase mapping, re-encoded as UTF-8.
/// The result is allocated with `allocator`; the caller must free it.
pub fn toUpperStr(self: Self, allocator: mem.Allocator, str: []const u8) ![]u8 {
    var out = std.ArrayList(u8).init(allocator);
    defer out.deinit();

    // Scratch space for one UTF-8 encoded code point (at most 4 bytes).
    var encoded: [4]u8 = undefined;
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |entry| {
        const mapped = self.toUpper(entry.code);
        const n = try unicode.utf8Encode(mapped, &encoded);
        try out.appendSlice(encoded[0..n]);
    }

    return out.toOwnedSlice();
}
| 146 | |||
test "toUpperStr" {
    var case_data = try init(testing.allocator);
    defer case_data.deinit();

    // Letters are uppercased; digits and punctuation pass through unchanged.
    const result = try case_data.toUpperStr(testing.allocator, "Hello, World 2112!");
    defer testing.allocator.free(result);

    try testing.expectEqualStrings("HELLO, WORLD 2112!", result);
}
| 155 | |||
/// Returns the uppercase mapping stored for `cp` (slot 0 of its
/// `case_map` entry).
pub inline fn toUpper(self: Self, cp: u21) u21 {
    const mappings = self.case_map[cp];
    return mappings[0];
}
| 160 | |||
/// Returns true if `cp` is lowercase (bit mask 1 in its property byte).
/// Code points that are not cased at all are also reported as true.
pub fn isLower(self: Self, cp: u21) bool {
    if (self.isCased(cp)) {
        const props = self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)];
        return (props & 1) != 0;
    }
    return true;
}
| 166 | |||
/// Returns the lowercase mapping stored for `cp` (slot 1 of its
/// `case_map` entry).
pub inline fn toLower(self: Self, cp: u21) u21 {
    const mappings = self.case_map[cp];
    return mappings[1];
}
| 171 | |||
/// Returns true if every code point in `str` satisfies `isLower`.
/// An empty string yields true.
pub fn isLowerStr(self: Self, str: []const u8) bool {
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |entry| {
        if (!self.isLower(entry.code)) return false;
    }
    return true;
}
| 180 | |||
test "isLowerStr" {
    var case_data = try init(testing.allocator);
    defer case_data.deinit();

    // Digits, spaces and punctuation are uncased and do not affect the result.
    try testing.expect(case_data.isLowerStr("hello, world 2112!"));

    // Any uppercase letter makes the whole string fail.
    const not_lower = [_][]const u8{ "HELLO, WORLD 2112!", "Hello, World 2112!" };
    for (not_lower) |str| {
        try testing.expect(!case_data.isLowerStr(str));
    }
}
| 189 | |||
/// Returns a copy of `str` in which every code point has been replaced by
/// its lowercase mapping, re-encoded as UTF-8.
/// The result is allocated with `allocator`; the caller must free it.
pub fn toLowerStr(self: Self, allocator: mem.Allocator, str: []const u8) ![]u8 {
    var out = std.ArrayList(u8).init(allocator);
    defer out.deinit();

    // Scratch space for one UTF-8 encoded code point (at most 4 bytes).
    var encoded: [4]u8 = undefined;
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |entry| {
        const mapped = self.toLower(entry.code);
        const n = try unicode.utf8Encode(mapped, &encoded);
        try out.appendSlice(encoded[0..n]);
    }

    return out.toOwnedSlice();
}
| 210 | |||
test "toLowerStr" {
    var case_data = try init(testing.allocator);
    defer case_data.deinit();

    // Letters are lowercased; digits and punctuation pass through unchanged.
    const result = try case_data.toLowerStr(testing.allocator, "Hello, World 2112!");
    defer testing.allocator.free(result);

    try testing.expectEqualStrings("hello, world 2112!", result);
}
| 219 | |||
/// Returns the titlecase mapping stored for `cp` (slot 2 of its
/// `case_map` entry).
pub inline fn toTitle(self: Self, cp: u21) u21 {
    const mappings = self.case_map[cp];
    return mappings[2];
}