diff options
Diffstat (limited to 'src/LetterCasing.zig')
| -rw-r--r-- | src/LetterCasing.zig | 179 |
1 files changed, 51 insertions, 128 deletions
diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig index 33096fc..24b67a0 100644 --- a/src/LetterCasing.zig +++ b/src/LetterCasing.zig | |||
| @@ -1,120 +1,58 @@ | |||
| 1 | const CodePointIterator = @import("code_point").Iterator; | 1 | const CodePointIterator = @import("code_point").Iterator; |
| 2 | 2 | const GeneralCategories = @import("GeneralCategories"); | |
| 3 | case_map: [][2]u21 = undefined, | 3 | |
| 4 | prop_s1: []u16 = undefined, | 4 | const Data = struct { |
| 5 | prop_s2: []u8 = undefined, | 5 | s1: []const u16 = undefined, |
| 6 | 6 | s2: []const u44 = undefined, | |
| 7 | const LetterCasing = @This(); | 7 | }; |
| 8 | 8 | ||
| 9 | pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { | 9 | const letter_casing = letter_casing: { |
| 10 | var case = LetterCasing{}; | 10 | const data = @import("case"); |
| 11 | try case.setup(allocator); | 11 | break :letter_casing Data{ |
| 12 | return case; | 12 | .s1 = &data.s1, |
| 13 | } | 13 | .s2 = &data.s2, |
| 14 | |||
| 15 | pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void { | ||
| 16 | case.setupInner(allocator) catch |err| { | ||
| 17 | switch (err) { | ||
| 18 | error.OutOfMemory => |e| return e, | ||
| 19 | else => unreachable, | ||
| 20 | } | ||
| 21 | }; | 14 | }; |
| 22 | } | 15 | }; |
| 23 | |||
| 24 | inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void { | ||
| 25 | const endian = builtin.cpu.arch.endian(); | ||
| 26 | |||
| 27 | self.case_map = try allocator.alloc([2]u21, 0x110000); | ||
| 28 | errdefer allocator.free(self.case_map); | ||
| 29 | |||
| 30 | for (0..0x110000) |i| { | ||
| 31 | const cp: u21 = @intCast(i); | ||
| 32 | self.case_map[cp] = .{ cp, cp }; | ||
| 33 | } | ||
| 34 | |||
| 35 | // Uppercase | ||
| 36 | const upper_bytes = @embedFile("upper"); | ||
| 37 | var upper_fbs = std.io.fixedBufferStream(upper_bytes); | ||
| 38 | var upper_reader = upper_fbs.reader(); | ||
| 39 | |||
| 40 | while (true) { | ||
| 41 | const cp = try upper_reader.readInt(i24, endian); | ||
| 42 | if (cp == 0) break; | ||
| 43 | const diff = try upper_reader.readInt(i24, endian); | ||
| 44 | self.case_map[@intCast(cp)][0] = @intCast(cp + diff); | ||
| 45 | } | ||
| 46 | |||
| 47 | // Lowercase | ||
| 48 | const lower_bytes = @embedFile("lower"); | ||
| 49 | var lower_fbs = std.io.fixedBufferStream(lower_bytes); | ||
| 50 | var lower_reader = lower_fbs.reader(); | ||
| 51 | |||
| 52 | while (true) { | ||
| 53 | const cp = try lower_reader.readInt(i24, endian); | ||
| 54 | if (cp == 0) break; | ||
| 55 | const diff = try lower_reader.readInt(i24, endian); | ||
| 56 | self.case_map[@intCast(cp)][1] = @intCast(cp + diff); | ||
| 57 | } | ||
| 58 | |||
| 59 | // Case properties | ||
| 60 | const cp_bytes = @embedFile("case_prop"); | ||
| 61 | var cp_fbs = std.io.fixedBufferStream(cp_bytes); | ||
| 62 | var cp_reader = cp_fbs.reader(); | ||
| 63 | |||
| 64 | const stage_1_len: u16 = try cp_reader.readInt(u16, endian); | ||
| 65 | self.prop_s1 = try allocator.alloc(u16, stage_1_len); | ||
| 66 | errdefer allocator.free(self.prop_s1); | ||
| 67 | for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian); | ||
| 68 | |||
| 69 | const stage_2_len: u16 = try cp_reader.readInt(u16, endian); | ||
| 70 | self.prop_s2 = try allocator.alloc(u8, stage_2_len); | ||
| 71 | errdefer allocator.free(self.prop_s2); | ||
| 72 | _ = try cp_reader.readAll(self.prop_s2); | ||
| 73 | } | ||
| 74 | |||
| 75 | pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void { | ||
| 76 | allocator.free(self.case_map); | ||
| 77 | allocator.free(self.prop_s1); | ||
| 78 | allocator.free(self.prop_s2); | ||
| 79 | } | ||
| 80 | 16 | ||
| 81 | // Returns true if `cp` is either upper, lower, or title case. | 17 | // Returns true if `cp` is either upper, lower, or title case. |
| 82 | pub fn isCased(self: LetterCasing, cp: u21) bool { | 18 | pub fn isCased(cp: u21) bool { |
| 83 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 19 | return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt; |
| 84 | } | 20 | } |
| 85 | 21 | ||
| 86 | // Returns true if `cp` is uppercase. | 22 | // Returns true if `cp` is uppercase. |
| 87 | pub fn isUpper(self: LetterCasing, cp: u21) bool { | 23 | pub fn isUpper(cp: u21) bool { |
| 88 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 24 | // isUpper is true if we have a mapping to a lower character (bit 1) |
| 25 | return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | ||
| 89 | } | 26 | } |
| 90 | 27 | ||
| 91 | /// Returns true if `str` is all uppercase. | 28 | /// Returns true if `str` is all non-lowercase. |
| 92 | pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { | 29 | pub fn isUpperStr(str: []const u8) bool { |
| 93 | var iter = CodePointIterator{ .bytes = str }; | 30 | var iter = CodePointIterator{ .bytes = str }; |
| 94 | 31 | ||
| 95 | return while (iter.next()) |cp| { | 32 | return while (iter.next()) |cp| { |
| 96 | if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; | 33 | if (isLower(cp.code)) break false; |
| 97 | } else true; | 34 | } else true; |
| 98 | } | 35 | } |
| 99 | 36 | ||
| 100 | test "isUpperStr" { | 37 | test "isUpperStr" { |
| 101 | const cd = try init(testing.allocator); | 38 | try testing.expect(isUpperStr("HELLO, WORLD 2112!")); |
| 102 | defer cd.deinit(testing.allocator); | 39 | try testing.expect(!isUpperStr("hello, world 2112!")); |
| 103 | 40 | try testing.expect(!isUpperStr("Hello, World 2112!")); | |
| 104 | try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); | ||
| 105 | try testing.expect(!cd.isUpperStr("hello, world 2112!")); | ||
| 106 | try testing.expect(!cd.isUpperStr("Hello, World 2112!")); | ||
| 107 | } | 41 | } |
| 108 | 42 | ||
| 109 | /// Returns uppercase mapping for `cp`. | 43 | /// Returns uppercase mapping for `cp`. |
| 110 | pub fn toUpper(self: LetterCasing, cp: u21) u21 { | 44 | pub fn toUpper(cp: u21) u21 { |
| 111 | return self.case_map[cp][0]; | 45 | const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; |
| 46 | if (case_prop & 2 == 2) { | ||
| 47 | return @intCast(case_prop >> (21 + 2)); | ||
| 48 | } else { | ||
| 49 | return cp; | ||
| 50 | } | ||
| 112 | } | 51 | } |
| 113 | 52 | ||
| 114 | /// Returns a new string with all letters in uppercase. | 53 | /// Returns a new string with all letters in uppercase. |
| 115 | /// Caller must free returned bytes with `allocator`. | 54 | /// Caller must free returned bytes with `allocator`. |
| 116 | pub fn toUpperStr( | 55 | pub fn toUpperStr( |
| 117 | self: LetterCasing, | ||
| 118 | allocator: mem.Allocator, | 56 | allocator: mem.Allocator, |
| 119 | str: []const u8, | 57 | str: []const u8, |
| 120 | ) ![]u8 { | 58 | ) ![]u8 { |
| @@ -125,7 +63,7 @@ pub fn toUpperStr( | |||
| 125 | var buf: [4]u8 = undefined; | 63 | var buf: [4]u8 = undefined; |
| 126 | 64 | ||
| 127 | while (iter.next()) |cp| { | 65 | while (iter.next()) |cp| { |
| 128 | const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); | 66 | const len = try unicode.utf8Encode(toUpper(cp.code), &buf); |
| 129 | try bytes.appendSlice(buf[0..len]); | 67 | try bytes.appendSlice(buf[0..len]); |
| 130 | } | 68 | } |
| 131 | 69 | ||
| @@ -133,46 +71,45 @@ pub fn toUpperStr( | |||
| 133 | } | 71 | } |
| 134 | 72 | ||
| 135 | test "toUpperStr" { | 73 | test "toUpperStr" { |
| 136 | const cd = try init(testing.allocator); | 74 | const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!"); |
| 137 | defer cd.deinit(testing.allocator); | ||
| 138 | |||
| 139 | const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); | ||
| 140 | defer testing.allocator.free(uppered); | 75 | defer testing.allocator.free(uppered); |
| 141 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); | 76 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); |
| 142 | } | 77 | } |
| 143 | 78 | ||
| 144 | // Returns true if `cp` is lowercase. | 79 | // Returns true if `cp` is lowercase. |
| 145 | pub fn isLower(self: LetterCasing, cp: u21) bool { | 80 | pub fn isLower(cp: u21) bool { |
| 146 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 81 | // isLower is true if we have a mapping to an upper character (bit 2) |
| 82 | return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | ||
| 147 | } | 83 | } |
| 148 | 84 | ||
| 149 | /// Returns true if `str` is all lowercase. | 85 | /// Returns true if `str` is all non-uppercase. |
| 150 | pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { | 86 | pub fn isLowerStr(str: []const u8) bool { |
| 151 | var iter = CodePointIterator{ .bytes = str }; | 87 | var iter = CodePointIterator{ .bytes = str }; |
| 152 | 88 | ||
| 153 | return while (iter.next()) |cp| { | 89 | return while (iter.next()) |cp| { |
| 154 | if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; | 90 | if (isUpper(cp.code)) break false; |
| 155 | } else true; | 91 | } else true; |
| 156 | } | 92 | } |
| 157 | 93 | ||
| 158 | test "isLowerStr" { | 94 | test "isLowerStr" { |
| 159 | const cd = try init(testing.allocator); | 95 | try testing.expect(isLowerStr("hello, world 2112!")); |
| 160 | defer cd.deinit(testing.allocator); | 96 | try testing.expect(!isLowerStr("HELLO, WORLD 2112!")); |
| 161 | 97 | try testing.expect(!isLowerStr("Hello, World 2112!")); | |
| 162 | try testing.expect(cd.isLowerStr("hello, world 2112!")); | ||
| 163 | try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!")); | ||
| 164 | try testing.expect(!cd.isLowerStr("Hello, World 2112!")); | ||
| 165 | } | 98 | } |
| 166 | 99 | ||
| 167 | /// Returns lowercase mapping for `cp`. | 100 | /// Returns lowercase mapping for `cp`. |
| 168 | pub fn toLower(self: LetterCasing, cp: u21) u21 { | 101 | pub fn toLower(cp: u21) u21 { |
| 169 | return self.case_map[cp][1]; | 102 | const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; |
| 103 | if (case_prop & 1 == 1) { | ||
| 104 | return @intCast((case_prop >> 2) & 0x1FFFFF); | ||
| 105 | } else { | ||
| 106 | return cp; | ||
| 107 | } | ||
| 170 | } | 108 | } |
| 171 | 109 | ||
| 172 | /// Returns a new string with all letters in lowercase. | 110 | /// Returns a new string with all letters in lowercase. |
| 173 | /// Caller must free returned bytes with `allocator`. | 111 | /// Caller must free returned bytes with `allocator`. |
| 174 | pub fn toLowerStr( | 112 | pub fn toLowerStr( |
| 175 | self: LetterCasing, | ||
| 176 | allocator: mem.Allocator, | 113 | allocator: mem.Allocator, |
| 177 | str: []const u8, | 114 | str: []const u8, |
| 178 | ) ![]u8 { | 115 | ) ![]u8 { |
| @@ -183,7 +120,7 @@ pub fn toLowerStr( | |||
| 183 | var buf: [4]u8 = undefined; | 120 | var buf: [4]u8 = undefined; |
| 184 | 121 | ||
| 185 | while (iter.next()) |cp| { | 122 | while (iter.next()) |cp| { |
| 186 | const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); | 123 | const len = try unicode.utf8Encode(toLower(cp.code), &buf); |
| 187 | try bytes.appendSlice(buf[0..len]); | 124 | try bytes.appendSlice(buf[0..len]); |
| 188 | } | 125 | } |
| 189 | 126 | ||
| @@ -191,27 +128,13 @@ pub fn toLowerStr( | |||
| 191 | } | 128 | } |
| 192 | 129 | ||
| 193 | test "toLowerStr" { | 130 | test "toLowerStr" { |
| 194 | const cd = try init(testing.allocator); | 131 | const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!"); |
| 195 | defer cd.deinit(testing.allocator); | ||
| 196 | |||
| 197 | const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); | ||
| 198 | defer testing.allocator.free(lowered); | 132 | defer testing.allocator.free(lowered); |
| 199 | try testing.expectEqualStrings("hello, world 2112!", lowered); | 133 | try testing.expectEqualStrings("hello, world 2112!", lowered); |
| 200 | } | 134 | } |
| 201 | 135 | ||
| 202 | fn testAllocator(allocator: Allocator) !void { | ||
| 203 | var prop = try LetterCasing.init(allocator); | ||
| 204 | prop.deinit(allocator); | ||
| 205 | } | ||
| 206 | |||
| 207 | test "Allocation failure" { | ||
| 208 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 209 | } | ||
| 210 | |||
| 211 | const std = @import("std"); | 136 | const std = @import("std"); |
| 212 | const builtin = @import("builtin"); | 137 | const builtin = @import("builtin"); |
| 213 | const compress = std.compress; | ||
| 214 | const mem = std.mem; | 138 | const mem = std.mem; |
| 215 | const Allocator = std.mem.Allocator; | ||
| 216 | const testing = std.testing; | 139 | const testing = std.testing; |
| 217 | const unicode = std.unicode; | 140 | const unicode = std.unicode; |