summaryrefslogtreecommitdiff
path: root/src/CaseData.zig
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-04-30 15:32:34 -0400
committerGravatar Sam Atman2025-04-30 15:32:34 -0400
commit958c13ba442e7077a50d7163fdeb9bba378f95c2 (patch)
tree0727fd03ea2344ebbad842daa05b55ea0a143a6c /src/CaseData.zig
parentRemove FoldData, make CaseFolding (diff)
downloadzg-958c13ba442e7077a50d7163fdeb9bba378f95c2.tar.gz
zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.tar.xz
zg-958c13ba442e7077a50d7163fdeb9bba378f95c2.zip
Rest of the Renamings
These get different names, but don't otherwise change.
Diffstat (limited to 'src/CaseData.zig')
-rw-r--r--src/CaseData.zig200
1 files changed, 0 insertions, 200 deletions
diff --git a/src/CaseData.zig b/src/CaseData.zig
deleted file mode 100644
index 0a0acb1..0000000
--- a/src/CaseData.zig
+++ /dev/null
@@ -1,200 +0,0 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5const testing = std.testing;
6const unicode = std.unicode;
7
8const CodePointIterator = @import("code_point").Iterator;
9
/// Per-code-point case mappings, indexed by code point. `init` allocates
/// 0x110000 entries; element 0 is the value returned by `toUpper` and
/// element 1 the value returned by `toLower`.
10case_map: [][2]u21,
/// First stage of the case-property lookup: indexed by `cp >> 8`, the value is
/// added to `cp & 0xff` to form an index into `prop_s2`. Left `undefined`
/// until `init` fills it.
11prop_s1: []u16 = undefined,
/// Second stage of the case-property lookup: one byte of flags per entry.
/// Bit 1 is tested by `isLower`, bit 2 by `isUpper`, bit 4 by `isCased`.
/// Left `undefined` until `init` fills it.
12prop_s2: []u8 = undefined,
13
14const Self = @This();
15
/// Builds the case-mapping and case-property tables from the embedded,
/// deflate-compressed "upper", "lower" and "case_prop" data.
///
/// `case_map` is allocated with 0x110000 entries and starts out as the
/// identity mapping; the "upper" and "lower" streams then overwrite the
/// entries that have a mapping. Each of those streams is a sequence of
/// (code point, signed difference) pairs of 24-bit integers, terminated by a
/// code point of 0. All integers are decoded using the target CPU's native
/// byte order.
///
/// The caller owns the returned value and must release it with `deinit`,
/// passing the same `allocator`. On error, everything allocated so far is
/// freed before returning.
///
/// Errors: allocation failure, any decompression/read error, and
/// `error.EndOfStream` if an embedded stream is shorter than expected.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .case_map = try allocator.alloc([2]u21, 0x110000),
    };
    errdefer allocator.free(self.case_map);

    // Every code point maps to itself until a stream entry says otherwise.
    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        self.case_map[cp] = .{ cp, cp };
    }

    // Uppercase
    const upper_bytes = @embedFile("upper");
    var upper_fbs = std.io.fixedBufferStream(upper_bytes);
    var upper_decomp = decompressor(.raw, upper_fbs.reader());
    var upper_reader = upper_decomp.reader();

    while (true) {
        const cp = try upper_reader.readInt(i24, endian);
        if (cp == 0) break; // stream terminator
        const diff = try upper_reader.readInt(i24, endian);
        self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
    }

    // Lowercase
    const lower_bytes = @embedFile("lower");
    var lower_fbs = std.io.fixedBufferStream(lower_bytes);
    var lower_decomp = decompressor(.raw, lower_fbs.reader());
    var lower_reader = lower_decomp.reader();

    while (true) {
        const cp = try lower_reader.readInt(i24, endian);
        if (cp == 0) break; // stream terminator
        const diff = try lower_reader.readInt(i24, endian);
        self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
    }

    // Case properties
    const cp_bytes = @embedFile("case_prop");
    var cp_fbs = std.io.fixedBufferStream(cp_bytes);
    var cp_decomp = decompressor(.raw, cp_fbs.reader());
    var cp_reader = cp_decomp.reader();

    const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
    self.prop_s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.prop_s1);
    for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);

    const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
    self.prop_s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.prop_s2);
    // Require the whole table: a short read would otherwise leave the tail of
    // `prop_s2` uninitialised while still reporting success.
    try cp_reader.readNoEof(self.prop_s2);

    return self;
}
74
/// Frees the three tables held by `self`.
/// `allocator` is used to free `case_map`, `prop_s1` and `prop_s2`.
pub fn deinit(self: *const Self, allocator: mem.Allocator) void {
    const tables = self.*;
    allocator.free(tables.case_map);
    allocator.free(tables.prop_s1);
    allocator.free(tables.prop_s2);
}
80
/// Reports whether `cp` is cased, i.e. upper, lower, or title case.
/// Looks `cp` up in the two-stage property tables and tests flag bit 4.
pub fn isCased(self: Self, cp: u21) bool {
    const block_base = self.prop_s1[cp >> 8];
    const within_block = cp & 0xff;
    const flags = self.prop_s2[block_base + within_block];
    return (flags & 4) == 4;
}
85
/// Reports whether `cp` is uppercase.
/// Looks `cp` up in the two-stage property tables and tests flag bit 2.
pub fn isUpper(self: Self, cp: u21) bool {
    const block_base = self.prop_s1[cp >> 8];
    const within_block = cp & 0xff;
    const flags = self.prop_s2[block_base + within_block];
    return (flags & 2) == 2;
}
90
/// Reports whether every cased code point in `str` is uppercase.
/// Code points for which `isCased` is false are ignored, so a string with no
/// cased code points (including the empty string) yields true.
pub fn isUpperStr(self: Self, str: []const u8) bool {
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |cp| {
        if (!self.isCased(cp.code)) continue;
        if (!self.isUpper(cp.code)) return false;
    }
    return true;
}
99
test "isUpperStr" {
    const allocator = testing.allocator;
    const case_data = try init(allocator);
    defer case_data.deinit(allocator);

    // An all-caps string passes; all-lowercase and mixed-case strings do not.
    try testing.expect(case_data.isUpperStr("HELLO, WORLD 2112!"));
    for ([_][]const u8{ "hello, world 2112!", "Hello, World 2112!" }) |not_upper| {
        try testing.expect(!case_data.isUpperStr(not_upper));
    }
}
108
/// Returns the uppercase mapping of `cp`, taken from element 0 of its
/// `case_map` entry.
pub fn toUpper(self: Self, cp: u21) u21 {
    const mapping = self.case_map[cp];
    return mapping[0];
}
113
/// Produces a copy of `str` in which every code point has been replaced by
/// its `toUpper` mapping, re-encoded as UTF-8.
/// The result is allocated with `allocator`; the caller must free it.
/// Fails on allocation failure or if a mapped code point cannot be encoded.
pub fn toUpperStr(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u8 {
    var out = std.ArrayList(u8).init(allocator);
    errdefer out.deinit();

    var encoded: [4]u8 = undefined;
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |cp| {
        const upper = self.toUpper(cp.code);
        const n = try unicode.utf8Encode(upper, &encoded);
        try out.appendSlice(encoded[0..n]);
    }

    return out.toOwnedSlice();
}
134
test "toUpperStr" {
    const allocator = testing.allocator;
    const case_data = try init(allocator);
    defer case_data.deinit(allocator);

    // Letters are uppercased; digits and punctuation pass through unchanged.
    const result = try case_data.toUpperStr(allocator, "Hello, World 2112!");
    defer allocator.free(result);
    try testing.expectEqualStrings("HELLO, WORLD 2112!", result);
}
143
/// Reports whether `cp` is lowercase.
/// Looks `cp` up in the two-stage property tables and tests flag bit 1.
pub fn isLower(self: Self, cp: u21) bool {
    const block_base = self.prop_s1[cp >> 8];
    const within_block = cp & 0xff;
    const flags = self.prop_s2[block_base + within_block];
    return (flags & 1) == 1;
}
148
/// Reports whether every cased code point in `str` is lowercase.
/// Code points for which `isCased` is false are ignored, so a string with no
/// cased code points (including the empty string) yields true.
pub fn isLowerStr(self: Self, str: []const u8) bool {
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |cp| {
        if (!self.isCased(cp.code)) continue;
        if (!self.isLower(cp.code)) return false;
    }
    return true;
}
157
test "isLowerStr" {
    const allocator = testing.allocator;
    const case_data = try init(allocator);
    defer case_data.deinit(allocator);

    // An all-lowercase string passes; all-caps and mixed-case strings do not.
    try testing.expect(case_data.isLowerStr("hello, world 2112!"));
    for ([_][]const u8{ "HELLO, WORLD 2112!", "Hello, World 2112!" }) |not_lower| {
        try testing.expect(!case_data.isLowerStr(not_lower));
    }
}
166
/// Returns the lowercase mapping of `cp`, taken from element 1 of its
/// `case_map` entry.
pub fn toLower(self: Self, cp: u21) u21 {
    const mapping = self.case_map[cp];
    return mapping[1];
}
171
/// Produces a copy of `str` in which every code point has been replaced by
/// its `toLower` mapping, re-encoded as UTF-8.
/// The result is allocated with `allocator`; the caller must free it.
/// Fails on allocation failure or if a mapped code point cannot be encoded.
pub fn toLowerStr(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u8 {
    var out = std.ArrayList(u8).init(allocator);
    errdefer out.deinit();

    var encoded: [4]u8 = undefined;
    var code_points = CodePointIterator{ .bytes = str };

    while (code_points.next()) |cp| {
        const lower = self.toLower(cp.code);
        const n = try unicode.utf8Encode(lower, &encoded);
        try out.appendSlice(encoded[0..n]);
    }

    return out.toOwnedSlice();
}
192
test "toLowerStr" {
    const allocator = testing.allocator;
    const case_data = try init(allocator);
    defer case_data.deinit(allocator);

    // Letters are lowercased; digits and punctuation pass through unchanged.
    const result = try case_data.toLowerStr(allocator, "Hello, World 2112!");
    defer allocator.free(result);
    try testing.expectEqualStrings("hello, world 2112!", result);
}