summary refs log tree commit diff
path: root/src/LetterCasing.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/LetterCasing.zig')
-rw-r--r-- src/LetterCasing.zig 179
1 files changed, 51 insertions, 128 deletions
diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig
index 33096fc..24b67a0 100644
--- a/src/LetterCasing.zig
+++ b/src/LetterCasing.zig
@@ -1,120 +1,58 @@
1const CodePointIterator = @import("code_point").Iterator; 1const CodePointIterator = @import("code_point").Iterator;
2 2const GeneralCategories = @import("GeneralCategories");
3case_map: [][2]u21 = undefined, 3
4prop_s1: []u16 = undefined, 4const Data = struct {
5prop_s2: []u8 = undefined, 5 s1: []const u16 = undefined,
6 6 s2: []const u44 = undefined,
7const LetterCasing = @This(); 7};
8 8
9pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { 9const letter_casing = letter_casing: {
10 var case = LetterCasing{}; 10 const data = @import("case");
11 try case.setup(allocator); 11 break :letter_casing Data{
12 return case; 12 .s1 = &data.s1,
13} 13 .s2 = &data.s2,
14
15pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void {
16 case.setupInner(allocator) catch |err| {
17 switch (err) {
18 error.OutOfMemory => |e| return e,
19 else => unreachable,
20 }
21 }; 14 };
22} 15};
23
24inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void {
25 const endian = builtin.cpu.arch.endian();
26
27 self.case_map = try allocator.alloc([2]u21, 0x110000);
28 errdefer allocator.free(self.case_map);
29
30 for (0..0x110000) |i| {
31 const cp: u21 = @intCast(i);
32 self.case_map[cp] = .{ cp, cp };
33 }
34
35 // Uppercase
36 const upper_bytes = @embedFile("upper");
37 var upper_fbs = std.io.fixedBufferStream(upper_bytes);
38 var upper_reader = upper_fbs.reader();
39
40 while (true) {
41 const cp = try upper_reader.readInt(i24, endian);
42 if (cp == 0) break;
43 const diff = try upper_reader.readInt(i24, endian);
44 self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
45 }
46
47 // Lowercase
48 const lower_bytes = @embedFile("lower");
49 var lower_fbs = std.io.fixedBufferStream(lower_bytes);
50 var lower_reader = lower_fbs.reader();
51
52 while (true) {
53 const cp = try lower_reader.readInt(i24, endian);
54 if (cp == 0) break;
55 const diff = try lower_reader.readInt(i24, endian);
56 self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
57 }
58
59 // Case properties
60 const cp_bytes = @embedFile("case_prop");
61 var cp_fbs = std.io.fixedBufferStream(cp_bytes);
62 var cp_reader = cp_fbs.reader();
63
64 const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
65 self.prop_s1 = try allocator.alloc(u16, stage_1_len);
66 errdefer allocator.free(self.prop_s1);
67 for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);
68
69 const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
70 self.prop_s2 = try allocator.alloc(u8, stage_2_len);
71 errdefer allocator.free(self.prop_s2);
72 _ = try cp_reader.readAll(self.prop_s2);
73}
74
75pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void {
76 allocator.free(self.case_map);
77 allocator.free(self.prop_s1);
78 allocator.free(self.prop_s2);
79}
80 16
81// Returns true if `cp` is either upper, lower, or title case. 17// Returns true if `cp` is either upper, lower, or title case.
82pub fn isCased(self: LetterCasing, cp: u21) bool { 18pub fn isCased(cp: u21) bool {
83 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 19 return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt;
84} 20}
85 21
86// Returns true if `cp` is uppercase. 22// Returns true if `cp` is uppercase.
87pub fn isUpper(self: LetterCasing, cp: u21) bool { 23pub fn isUpper(cp: u21) bool {
88 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 24 // isUpper is true if we have a mapping to a lower character (bit 1)
25 return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
89} 26}
90 27
91/// Returns true if `str` is all uppercase. 28/// Returns true if `str` is all non-lowercase.
92pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { 29pub fn isUpperStr(str: []const u8) bool {
93 var iter = CodePointIterator{ .bytes = str }; 30 var iter = CodePointIterator{ .bytes = str };
94 31
95 return while (iter.next()) |cp| { 32 return while (iter.next()) |cp| {
96 if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; 33 if (isLower(cp.code)) break false;
97 } else true; 34 } else true;
98} 35}
99 36
100test "isUpperStr" { 37test "isUpperStr" {
101 const cd = try init(testing.allocator); 38 try testing.expect(isUpperStr("HELLO, WORLD 2112!"));
102 defer cd.deinit(testing.allocator); 39 try testing.expect(!isUpperStr("hello, world 2112!"));
103 40 try testing.expect(!isUpperStr("Hello, World 2112!"));
104 try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
105 try testing.expect(!cd.isUpperStr("hello, world 2112!"));
106 try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
107} 41}
108 42
109/// Returns uppercase mapping for `cp`. 43/// Returns uppercase mapping for `cp`.
110pub fn toUpper(self: LetterCasing, cp: u21) u21 { 44pub fn toUpper(cp: u21) u21 {
111 return self.case_map[cp][0]; 45 const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
46 if (case_prop & 2 == 2) {
47 return @intCast(case_prop >> (21 + 2));
48 } else {
49 return cp;
50 }
112} 51}
113 52
114/// Returns a new string with all letters in uppercase. 53/// Returns a new string with all letters in uppercase.
115/// Caller must free returned bytes with `allocator`. 54/// Caller must free returned bytes with `allocator`.
116pub fn toUpperStr( 55pub fn toUpperStr(
117 self: LetterCasing,
118 allocator: mem.Allocator, 56 allocator: mem.Allocator,
119 str: []const u8, 57 str: []const u8,
120) ![]u8 { 58) ![]u8 {
@@ -125,7 +63,7 @@ pub fn toUpperStr(
125 var buf: [4]u8 = undefined; 63 var buf: [4]u8 = undefined;
126 64
127 while (iter.next()) |cp| { 65 while (iter.next()) |cp| {
128 const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); 66 const len = try unicode.utf8Encode(toUpper(cp.code), &buf);
129 try bytes.appendSlice(buf[0..len]); 67 try bytes.appendSlice(buf[0..len]);
130 } 68 }
131 69
@@ -133,46 +71,45 @@ pub fn toUpperStr(
133} 71}
134 72
135test "toUpperStr" { 73test "toUpperStr" {
136 const cd = try init(testing.allocator); 74 const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!");
137 defer cd.deinit(testing.allocator);
138
139 const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
140 defer testing.allocator.free(uppered); 75 defer testing.allocator.free(uppered);
141 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); 76 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
142} 77}
143 78
144// Returns true if `cp` is lowercase. 79// Returns true if `cp` is lowercase.
145pub fn isLower(self: LetterCasing, cp: u21) bool { 80pub fn isLower(cp: u21) bool {
146 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 81 // isLower is true if we have a mapping to an upper character (bit 2)
82 return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
147} 83}
148 84
149/// Returns true if `str` is all lowercase. 85/// Returns true if `str` is all non-uppercase.
150pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { 86pub fn isLowerStr(str: []const u8) bool {
151 var iter = CodePointIterator{ .bytes = str }; 87 var iter = CodePointIterator{ .bytes = str };
152 88
153 return while (iter.next()) |cp| { 89 return while (iter.next()) |cp| {
154 if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; 90 if (isUpper(cp.code)) break false;
155 } else true; 91 } else true;
156} 92}
157 93
158test "isLowerStr" { 94test "isLowerStr" {
159 const cd = try init(testing.allocator); 95 try testing.expect(isLowerStr("hello, world 2112!"));
160 defer cd.deinit(testing.allocator); 96 try testing.expect(!isLowerStr("HELLO, WORLD 2112!"));
161 97 try testing.expect(!isLowerStr("Hello, World 2112!"));
162 try testing.expect(cd.isLowerStr("hello, world 2112!"));
163 try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!"));
164 try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
165} 98}
166 99
167/// Returns lowercase mapping for `cp`. 100/// Returns lowercase mapping for `cp`.
168pub fn toLower(self: LetterCasing, cp: u21) u21 { 101pub fn toLower(cp: u21) u21 {
169 return self.case_map[cp][1]; 102 const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
103 if (case_prop & 1 == 1) {
104 return @intCast((case_prop >> 2) & 0x1FFFFF);
105 } else {
106 return cp;
107 }
170} 108}
171 109
172/// Returns a new string with all letters in lowercase. 110/// Returns a new string with all letters in lowercase.
173/// Caller must free returned bytes with `allocator`. 111/// Caller must free returned bytes with `allocator`.
174pub fn toLowerStr( 112pub fn toLowerStr(
175 self: LetterCasing,
176 allocator: mem.Allocator, 113 allocator: mem.Allocator,
177 str: []const u8, 114 str: []const u8,
178) ![]u8 { 115) ![]u8 {
@@ -183,7 +120,7 @@ pub fn toLowerStr(
183 var buf: [4]u8 = undefined; 120 var buf: [4]u8 = undefined;
184 121
185 while (iter.next()) |cp| { 122 while (iter.next()) |cp| {
186 const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); 123 const len = try unicode.utf8Encode(toLower(cp.code), &buf);
187 try bytes.appendSlice(buf[0..len]); 124 try bytes.appendSlice(buf[0..len]);
188 } 125 }
189 126
@@ -191,27 +128,13 @@ pub fn toLowerStr(
191} 128}
192 129
193test "toLowerStr" { 130test "toLowerStr" {
194 const cd = try init(testing.allocator); 131 const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!");
195 defer cd.deinit(testing.allocator);
196
197 const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
198 defer testing.allocator.free(lowered); 132 defer testing.allocator.free(lowered);
199 try testing.expectEqualStrings("hello, world 2112!", lowered); 133 try testing.expectEqualStrings("hello, world 2112!", lowered);
200} 134}
201 135
202fn testAllocator(allocator: Allocator) !void {
203 var prop = try LetterCasing.init(allocator);
204 prop.deinit(allocator);
205}
206
207test "Allocation failure" {
208 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
209}
210
211const std = @import("std"); 136const std = @import("std");
212const builtin = @import("builtin"); 137const builtin = @import("builtin");
213const compress = std.compress;
214const mem = std.mem; 138const mem = std.mem;
215const Allocator = std.mem.Allocator;
216const testing = std.testing; 139const testing = std.testing;
217const unicode = std.unicode; 140const unicode = std.unicode;