diff options
| author | 2026-02-04 18:36:18 -0500 | |
|---|---|---|
| committer | 2026-02-04 18:36:18 -0500 | |
| commit | e476250ea9326b2550847b301c265115ff375a31 (patch) | |
| tree | cf627ced47cecce80020b7a1f30aa51852c0c59b /src/LetterCasing.zig | |
| parent | Normalization and case folding (diff) | |
| download | zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz zg-e476250ea9326b2550847b301c265115ff375a31.zip | |
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to
eliminate that last allocation using the comptime hash map, and then
see about eliminating allocations from case comparisons as well.
That should just about do it.
Diffstat (limited to 'src/LetterCasing.zig')
| -rw-r--r-- | src/LetterCasing.zig | 179 |
1 file changed, 51 insertions, 128 deletions
diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig index 33096fc..24b67a0 100644 --- a/src/LetterCasing.zig +++ b/src/LetterCasing.zig | |||
| @@ -1,120 +1,58 @@ | |||
| 1 | const CodePointIterator = @import("code_point").Iterator; | 1 | const CodePointIterator = @import("code_point").Iterator; |
| 2 | 2 | const GeneralCategories = @import("GeneralCategories"); | |
| 3 | case_map: [][2]u21 = undefined, | 3 | |
| 4 | prop_s1: []u16 = undefined, | 4 | const Data = struct { |
| 5 | prop_s2: []u8 = undefined, | 5 | s1: []const u16 = undefined, |
| 6 | 6 | s2: []const u44 = undefined, | |
| 7 | const LetterCasing = @This(); | 7 | }; |
| 8 | 8 | ||
| 9 | pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { | 9 | const letter_casing = letter_casing: { |
| 10 | var case = LetterCasing{}; | 10 | const data = @import("case"); |
| 11 | try case.setup(allocator); | 11 | break :letter_casing Data{ |
| 12 | return case; | 12 | .s1 = &data.s1, |
| 13 | } | 13 | .s2 = &data.s2, |
| 14 | |||
| 15 | pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void { | ||
| 16 | case.setupInner(allocator) catch |err| { | ||
| 17 | switch (err) { | ||
| 18 | error.OutOfMemory => |e| return e, | ||
| 19 | else => unreachable, | ||
| 20 | } | ||
| 21 | }; | 14 | }; |
| 22 | } | 15 | }; |
| 23 | |||
| 24 | inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void { | ||
| 25 | const endian = builtin.cpu.arch.endian(); | ||
| 26 | |||
| 27 | self.case_map = try allocator.alloc([2]u21, 0x110000); | ||
| 28 | errdefer allocator.free(self.case_map); | ||
| 29 | |||
| 30 | for (0..0x110000) |i| { | ||
| 31 | const cp: u21 = @intCast(i); | ||
| 32 | self.case_map[cp] = .{ cp, cp }; | ||
| 33 | } | ||
| 34 | |||
| 35 | // Uppercase | ||
| 36 | const upper_bytes = @embedFile("upper"); | ||
| 37 | var upper_fbs = std.io.fixedBufferStream(upper_bytes); | ||
| 38 | var upper_reader = upper_fbs.reader(); | ||
| 39 | |||
| 40 | while (true) { | ||
| 41 | const cp = try upper_reader.readInt(i24, endian); | ||
| 42 | if (cp == 0) break; | ||
| 43 | const diff = try upper_reader.readInt(i24, endian); | ||
| 44 | self.case_map[@intCast(cp)][0] = @intCast(cp + diff); | ||
| 45 | } | ||
| 46 | |||
| 47 | // Lowercase | ||
| 48 | const lower_bytes = @embedFile("lower"); | ||
| 49 | var lower_fbs = std.io.fixedBufferStream(lower_bytes); | ||
| 50 | var lower_reader = lower_fbs.reader(); | ||
| 51 | |||
| 52 | while (true) { | ||
| 53 | const cp = try lower_reader.readInt(i24, endian); | ||
| 54 | if (cp == 0) break; | ||
| 55 | const diff = try lower_reader.readInt(i24, endian); | ||
| 56 | self.case_map[@intCast(cp)][1] = @intCast(cp + diff); | ||
| 57 | } | ||
| 58 | |||
| 59 | // Case properties | ||
| 60 | const cp_bytes = @embedFile("case_prop"); | ||
| 61 | var cp_fbs = std.io.fixedBufferStream(cp_bytes); | ||
| 62 | var cp_reader = cp_fbs.reader(); | ||
| 63 | |||
| 64 | const stage_1_len: u16 = try cp_reader.readInt(u16, endian); | ||
| 65 | self.prop_s1 = try allocator.alloc(u16, stage_1_len); | ||
| 66 | errdefer allocator.free(self.prop_s1); | ||
| 67 | for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian); | ||
| 68 | |||
| 69 | const stage_2_len: u16 = try cp_reader.readInt(u16, endian); | ||
| 70 | self.prop_s2 = try allocator.alloc(u8, stage_2_len); | ||
| 71 | errdefer allocator.free(self.prop_s2); | ||
| 72 | _ = try cp_reader.readAll(self.prop_s2); | ||
| 73 | } | ||
| 74 | |||
| 75 | pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void { | ||
| 76 | allocator.free(self.case_map); | ||
| 77 | allocator.free(self.prop_s1); | ||
| 78 | allocator.free(self.prop_s2); | ||
| 79 | } | ||
| 80 | 16 | ||
| 81 | // Returns true if `cp` is either upper, lower, or title case. | 17 | // Returns true if `cp` is either upper, lower, or title case. |
| 82 | pub fn isCased(self: LetterCasing, cp: u21) bool { | 18 | pub fn isCased(cp: u21) bool { |
| 83 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 19 | return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt; |
| 84 | } | 20 | } |
| 85 | 21 | ||
| 86 | // Returns true if `cp` is uppercase. | 22 | // Returns true if `cp` is uppercase. |
| 87 | pub fn isUpper(self: LetterCasing, cp: u21) bool { | 23 | pub fn isUpper(cp: u21) bool { |
| 88 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 24 | // isUpper is true if we have a mapping to a lower character (bit 1) |
| 25 | return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | ||
| 89 | } | 26 | } |
| 90 | 27 | ||
| 91 | /// Returns true if `str` is all uppercase. | 28 | /// Returns true if `str` is all non-lowercase. |
| 92 | pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { | 29 | pub fn isUpperStr(str: []const u8) bool { |
| 93 | var iter = CodePointIterator{ .bytes = str }; | 30 | var iter = CodePointIterator{ .bytes = str }; |
| 94 | 31 | ||
| 95 | return while (iter.next()) |cp| { | 32 | return while (iter.next()) |cp| { |
| 96 | if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; | 33 | if (isLower(cp.code)) break false; |
| 97 | } else true; | 34 | } else true; |
| 98 | } | 35 | } |
| 99 | 36 | ||
| 100 | test "isUpperStr" { | 37 | test "isUpperStr" { |
| 101 | const cd = try init(testing.allocator); | 38 | try testing.expect(isUpperStr("HELLO, WORLD 2112!")); |
| 102 | defer cd.deinit(testing.allocator); | 39 | try testing.expect(!isUpperStr("hello, world 2112!")); |
| 103 | 40 | try testing.expect(!isUpperStr("Hello, World 2112!")); | |
| 104 | try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!")); | ||
| 105 | try testing.expect(!cd.isUpperStr("hello, world 2112!")); | ||
| 106 | try testing.expect(!cd.isUpperStr("Hello, World 2112!")); | ||
| 107 | } | 41 | } |
| 108 | 42 | ||
| 109 | /// Returns uppercase mapping for `cp`. | 43 | /// Returns uppercase mapping for `cp`. |
| 110 | pub fn toUpper(self: LetterCasing, cp: u21) u21 { | 44 | pub fn toUpper(cp: u21) u21 { |
| 111 | return self.case_map[cp][0]; | 45 | const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; |
| 46 | if (case_prop & 2 == 2) { | ||
| 47 | return @intCast(case_prop >> (21 + 2)); | ||
| 48 | } else { | ||
| 49 | return cp; | ||
| 50 | } | ||
| 112 | } | 51 | } |
| 113 | 52 | ||
| 114 | /// Returns a new string with all letters in uppercase. | 53 | /// Returns a new string with all letters in uppercase. |
| 115 | /// Caller must free returned bytes with `allocator`. | 54 | /// Caller must free returned bytes with `allocator`. |
| 116 | pub fn toUpperStr( | 55 | pub fn toUpperStr( |
| 117 | self: LetterCasing, | ||
| 118 | allocator: mem.Allocator, | 56 | allocator: mem.Allocator, |
| 119 | str: []const u8, | 57 | str: []const u8, |
| 120 | ) ![]u8 { | 58 | ) ![]u8 { |
| @@ -125,7 +63,7 @@ pub fn toUpperStr( | |||
| 125 | var buf: [4]u8 = undefined; | 63 | var buf: [4]u8 = undefined; |
| 126 | 64 | ||
| 127 | while (iter.next()) |cp| { | 65 | while (iter.next()) |cp| { |
| 128 | const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); | 66 | const len = try unicode.utf8Encode(toUpper(cp.code), &buf); |
| 129 | try bytes.appendSlice(buf[0..len]); | 67 | try bytes.appendSlice(buf[0..len]); |
| 130 | } | 68 | } |
| 131 | 69 | ||
| @@ -133,46 +71,45 @@ pub fn toUpperStr( | |||
| 133 | } | 71 | } |
| 134 | 72 | ||
| 135 | test "toUpperStr" { | 73 | test "toUpperStr" { |
| 136 | const cd = try init(testing.allocator); | 74 | const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!"); |
| 137 | defer cd.deinit(testing.allocator); | ||
| 138 | |||
| 139 | const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!"); | ||
| 140 | defer testing.allocator.free(uppered); | 75 | defer testing.allocator.free(uppered); |
| 141 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); | 76 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); |
| 142 | } | 77 | } |
| 143 | 78 | ||
| 144 | // Returns true if `cp` is lowercase. | 79 | // Returns true if `cp` is lowercase. |
| 145 | pub fn isLower(self: LetterCasing, cp: u21) bool { | 80 | pub fn isLower(cp: u21) bool { |
| 146 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 81 | // isLower is true if we have a mapping to an upper character (bit 2) |
| 82 | return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | ||
| 147 | } | 83 | } |
| 148 | 84 | ||
| 149 | /// Returns true if `str` is all lowercase. | 85 | /// Returns true if `str` is all non-uppercase. |
| 150 | pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { | 86 | pub fn isLowerStr(str: []const u8) bool { |
| 151 | var iter = CodePointIterator{ .bytes = str }; | 87 | var iter = CodePointIterator{ .bytes = str }; |
| 152 | 88 | ||
| 153 | return while (iter.next()) |cp| { | 89 | return while (iter.next()) |cp| { |
| 154 | if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; | 90 | if (isUpper(cp.code)) break false; |
| 155 | } else true; | 91 | } else true; |
| 156 | } | 92 | } |
| 157 | 93 | ||
| 158 | test "isLowerStr" { | 94 | test "isLowerStr" { |
| 159 | const cd = try init(testing.allocator); | 95 | try testing.expect(isLowerStr("hello, world 2112!")); |
| 160 | defer cd.deinit(testing.allocator); | 96 | try testing.expect(!isLowerStr("HELLO, WORLD 2112!")); |
| 161 | 97 | try testing.expect(!isLowerStr("Hello, World 2112!")); | |
| 162 | try testing.expect(cd.isLowerStr("hello, world 2112!")); | ||
| 163 | try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!")); | ||
| 164 | try testing.expect(!cd.isLowerStr("Hello, World 2112!")); | ||
| 165 | } | 98 | } |
| 166 | 99 | ||
| 167 | /// Returns lowercase mapping for `cp`. | 100 | /// Returns lowercase mapping for `cp`. |
| 168 | pub fn toLower(self: LetterCasing, cp: u21) u21 { | 101 | pub fn toLower(cp: u21) u21 { |
| 169 | return self.case_map[cp][1]; | 102 | const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)]; |
| 103 | if (case_prop & 1 == 1) { | ||
| 104 | return @intCast((case_prop >> 2) & 0x1FFFFF); | ||
| 105 | } else { | ||
| 106 | return cp; | ||
| 107 | } | ||
| 170 | } | 108 | } |
| 171 | 109 | ||
| 172 | /// Returns a new string with all letters in lowercase. | 110 | /// Returns a new string with all letters in lowercase. |
| 173 | /// Caller must free returned bytes with `allocator`. | 111 | /// Caller must free returned bytes with `allocator`. |
| 174 | pub fn toLowerStr( | 112 | pub fn toLowerStr( |
| 175 | self: LetterCasing, | ||
| 176 | allocator: mem.Allocator, | 113 | allocator: mem.Allocator, |
| 177 | str: []const u8, | 114 | str: []const u8, |
| 178 | ) ![]u8 { | 115 | ) ![]u8 { |
| @@ -183,7 +120,7 @@ pub fn toLowerStr( | |||
| 183 | var buf: [4]u8 = undefined; | 120 | var buf: [4]u8 = undefined; |
| 184 | 121 | ||
| 185 | while (iter.next()) |cp| { | 122 | while (iter.next()) |cp| { |
| 186 | const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); | 123 | const len = try unicode.utf8Encode(toLower(cp.code), &buf); |
| 187 | try bytes.appendSlice(buf[0..len]); | 124 | try bytes.appendSlice(buf[0..len]); |
| 188 | } | 125 | } |
| 189 | 126 | ||
| @@ -191,27 +128,13 @@ pub fn toLowerStr( | |||
| 191 | } | 128 | } |
| 192 | 129 | ||
| 193 | test "toLowerStr" { | 130 | test "toLowerStr" { |
| 194 | const cd = try init(testing.allocator); | 131 | const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!"); |
| 195 | defer cd.deinit(testing.allocator); | ||
| 196 | |||
| 197 | const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!"); | ||
| 198 | defer testing.allocator.free(lowered); | 132 | defer testing.allocator.free(lowered); |
| 199 | try testing.expectEqualStrings("hello, world 2112!", lowered); | 133 | try testing.expectEqualStrings("hello, world 2112!", lowered); |
| 200 | } | 134 | } |
| 201 | 135 | ||
| 202 | fn testAllocator(allocator: Allocator) !void { | ||
| 203 | var prop = try LetterCasing.init(allocator); | ||
| 204 | prop.deinit(allocator); | ||
| 205 | } | ||
| 206 | |||
| 207 | test "Allocation failure" { | ||
| 208 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 209 | } | ||
| 210 | |||
| 211 | const std = @import("std"); | 136 | const std = @import("std"); |
| 212 | const builtin = @import("builtin"); | 137 | const builtin = @import("builtin"); |
| 213 | const compress = std.compress; | ||
| 214 | const mem = std.mem; | 138 | const mem = std.mem; |
| 215 | const Allocator = std.mem.Allocator; | ||
| 216 | const testing = std.testing; | 139 | const testing = std.testing; |
| 217 | const unicode = std.unicode; | 140 | const unicode = std.unicode; |