summary | refs | log | tree | commit | diff
path: root/src/LetterCasing.zig
diff options
context:
space:
mode:
author: Sam Atman, 2026-02-04 18:36:18 -0500
committer: Sam Atman, 2026-02-04 18:36:18 -0500
commit: e476250ea9326b2550847b301c265115ff375a31 (patch)
tree: cf627ced47cecce80020b7a1f30aa51852c0c59b /src/LetterCasing.zig
parent: Normalization and case folding (diff)
download: zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz
zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz
zg-e476250ea9326b2550847b301c265115ff375a31.zip
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it.
Diffstat (limited to 'src/LetterCasing.zig')
-rw-r--r-- src/LetterCasing.zig 179
1 files changed, 51 insertions, 128 deletions
diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig
index 33096fc..24b67a0 100644
--- a/src/LetterCasing.zig
+++ b/src/LetterCasing.zig
@@ -1,120 +1,58 @@
1const CodePointIterator = @import("code_point").Iterator; 1const CodePointIterator = @import("code_point").Iterator;
2 2const GeneralCategories = @import("GeneralCategories");
3case_map: [][2]u21 = undefined, 3
4prop_s1: []u16 = undefined, 4const Data = struct {
5prop_s2: []u8 = undefined, 5 s1: []const u16 = undefined,
6 6 s2: []const u44 = undefined,
7const LetterCasing = @This(); 7};
8 8
9pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { 9const letter_casing = letter_casing: {
10 var case = LetterCasing{}; 10 const data = @import("case");
11 try case.setup(allocator); 11 break :letter_casing Data{
12 return case; 12 .s1 = &data.s1,
13} 13 .s2 = &data.s2,
14
15pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void {
16 case.setupInner(allocator) catch |err| {
17 switch (err) {
18 error.OutOfMemory => |e| return e,
19 else => unreachable,
20 }
21 }; 14 };
22} 15};
23
24inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void {
25 const endian = builtin.cpu.arch.endian();
26
27 self.case_map = try allocator.alloc([2]u21, 0x110000);
28 errdefer allocator.free(self.case_map);
29
30 for (0..0x110000) |i| {
31 const cp: u21 = @intCast(i);
32 self.case_map[cp] = .{ cp, cp };
33 }
34
35 // Uppercase
36 const upper_bytes = @embedFile("upper");
37 var upper_fbs = std.io.fixedBufferStream(upper_bytes);
38 var upper_reader = upper_fbs.reader();
39
40 while (true) {
41 const cp = try upper_reader.readInt(i24, endian);
42 if (cp == 0) break;
43 const diff = try upper_reader.readInt(i24, endian);
44 self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
45 }
46
47 // Lowercase
48 const lower_bytes = @embedFile("lower");
49 var lower_fbs = std.io.fixedBufferStream(lower_bytes);
50 var lower_reader = lower_fbs.reader();
51
52 while (true) {
53 const cp = try lower_reader.readInt(i24, endian);
54 if (cp == 0) break;
55 const diff = try lower_reader.readInt(i24, endian);
56 self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
57 }
58
59 // Case properties
60 const cp_bytes = @embedFile("case_prop");
61 var cp_fbs = std.io.fixedBufferStream(cp_bytes);
62 var cp_reader = cp_fbs.reader();
63
64 const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
65 self.prop_s1 = try allocator.alloc(u16, stage_1_len);
66 errdefer allocator.free(self.prop_s1);
67 for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);
68
69 const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
70 self.prop_s2 = try allocator.alloc(u8, stage_2_len);
71 errdefer allocator.free(self.prop_s2);
72 _ = try cp_reader.readAll(self.prop_s2);
73}
74
75pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void {
76 allocator.free(self.case_map);
77 allocator.free(self.prop_s1);
78 allocator.free(self.prop_s2);
79}
80 16
81// Returns true if `cp` is either upper, lower, or title case. 17// Returns true if `cp` is either upper, lower, or title case.
82pub fn isCased(self: LetterCasing, cp: u21) bool { 18pub fn isCased(cp: u21) bool {
83 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 19 return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt;
84} 20}
85 21
86// Returns true if `cp` is uppercase. 22// Returns true if `cp` is uppercase.
87pub fn isUpper(self: LetterCasing, cp: u21) bool { 23pub fn isUpper(cp: u21) bool {
88 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 24 // isUpper is true if we have a mapping to a lower character (bit 1)
25 return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
89} 26}
90 27
91/// Returns true if `str` is all uppercase. 28/// Returns true if `str` is all non-lowercase.
92pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { 29pub fn isUpperStr(str: []const u8) bool {
93 var iter = CodePointIterator{ .bytes = str }; 30 var iter = CodePointIterator{ .bytes = str };
94 31
95 return while (iter.next()) |cp| { 32 return while (iter.next()) |cp| {
96 if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; 33 if (isLower(cp.code)) break false;
97 } else true; 34 } else true;
98} 35}
99 36
100test "isUpperStr" { 37test "isUpperStr" {
101 const cd = try init(testing.allocator); 38 try testing.expect(isUpperStr("HELLO, WORLD 2112!"));
102 defer cd.deinit(testing.allocator); 39 try testing.expect(!isUpperStr("hello, world 2112!"));
103 40 try testing.expect(!isUpperStr("Hello, World 2112!"));
104 try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
105 try testing.expect(!cd.isUpperStr("hello, world 2112!"));
106 try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
107} 41}
108 42
109/// Returns uppercase mapping for `cp`. 43/// Returns uppercase mapping for `cp`.
110pub fn toUpper(self: LetterCasing, cp: u21) u21 { 44pub fn toUpper(cp: u21) u21 {
111 return self.case_map[cp][0]; 45 const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
46 if (case_prop & 2 == 2) {
47 return @intCast(case_prop >> (21 + 2));
48 } else {
49 return cp;
50 }
112} 51}
113 52
114/// Returns a new string with all letters in uppercase. 53/// Returns a new string with all letters in uppercase.
115/// Caller must free returned bytes with `allocator`. 54/// Caller must free returned bytes with `allocator`.
116pub fn toUpperStr( 55pub fn toUpperStr(
117 self: LetterCasing,
118 allocator: mem.Allocator, 56 allocator: mem.Allocator,
119 str: []const u8, 57 str: []const u8,
120) ![]u8 { 58) ![]u8 {
@@ -125,7 +63,7 @@ pub fn toUpperStr(
125 var buf: [4]u8 = undefined; 63 var buf: [4]u8 = undefined;
126 64
127 while (iter.next()) |cp| { 65 while (iter.next()) |cp| {
128 const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); 66 const len = try unicode.utf8Encode(toUpper(cp.code), &buf);
129 try bytes.appendSlice(buf[0..len]); 67 try bytes.appendSlice(buf[0..len]);
130 } 68 }
131 69
@@ -133,46 +71,45 @@ pub fn toUpperStr(
133} 71}
134 72
135test "toUpperStr" { 73test "toUpperStr" {
136 const cd = try init(testing.allocator); 74 const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!");
137 defer cd.deinit(testing.allocator);
138
139 const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
140 defer testing.allocator.free(uppered); 75 defer testing.allocator.free(uppered);
141 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); 76 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
142} 77}
143 78
144// Returns true if `cp` is lowercase. 79// Returns true if `cp` is lowercase.
145pub fn isLower(self: LetterCasing, cp: u21) bool { 80pub fn isLower(cp: u21) bool {
146 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 81 // isLower is true if we have a mapping to an upper character (bit 2)
82 return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
147} 83}
148 84
149/// Returns true if `str` is all lowercase. 85/// Returns true if `str` is all non-uppercase.
150pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { 86pub fn isLowerStr(str: []const u8) bool {
151 var iter = CodePointIterator{ .bytes = str }; 87 var iter = CodePointIterator{ .bytes = str };
152 88
153 return while (iter.next()) |cp| { 89 return while (iter.next()) |cp| {
154 if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; 90 if (isUpper(cp.code)) break false;
155 } else true; 91 } else true;
156} 92}
157 93
158test "isLowerStr" { 94test "isLowerStr" {
159 const cd = try init(testing.allocator); 95 try testing.expect(isLowerStr("hello, world 2112!"));
160 defer cd.deinit(testing.allocator); 96 try testing.expect(!isLowerStr("HELLO, WORLD 2112!"));
161 97 try testing.expect(!isLowerStr("Hello, World 2112!"));
162 try testing.expect(cd.isLowerStr("hello, world 2112!"));
163 try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!"));
164 try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
165} 98}
166 99
167/// Returns lowercase mapping for `cp`. 100/// Returns lowercase mapping for `cp`.
168pub fn toLower(self: LetterCasing, cp: u21) u21 { 101pub fn toLower(cp: u21) u21 {
169 return self.case_map[cp][1]; 102 const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
103 if (case_prop & 1 == 1) {
104 return @intCast((case_prop >> 2) & 0x1FFFFF);
105 } else {
106 return cp;
107 }
170} 108}
171 109
172/// Returns a new string with all letters in lowercase. 110/// Returns a new string with all letters in lowercase.
173/// Caller must free returned bytes with `allocator`. 111/// Caller must free returned bytes with `allocator`.
174pub fn toLowerStr( 112pub fn toLowerStr(
175 self: LetterCasing,
176 allocator: mem.Allocator, 113 allocator: mem.Allocator,
177 str: []const u8, 114 str: []const u8,
178) ![]u8 { 115) ![]u8 {
@@ -183,7 +120,7 @@ pub fn toLowerStr(
183 var buf: [4]u8 = undefined; 120 var buf: [4]u8 = undefined;
184 121
185 while (iter.next()) |cp| { 122 while (iter.next()) |cp| {
186 const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); 123 const len = try unicode.utf8Encode(toLower(cp.code), &buf);
187 try bytes.appendSlice(buf[0..len]); 124 try bytes.appendSlice(buf[0..len]);
188 } 125 }
189 126
@@ -191,27 +128,13 @@ pub fn toLowerStr(
191} 128}
192 129
193test "toLowerStr" { 130test "toLowerStr" {
194 const cd = try init(testing.allocator); 131 const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!");
195 defer cd.deinit(testing.allocator);
196
197 const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
198 defer testing.allocator.free(lowered); 132 defer testing.allocator.free(lowered);
199 try testing.expectEqualStrings("hello, world 2112!", lowered); 133 try testing.expectEqualStrings("hello, world 2112!", lowered);
200} 134}
201 135
202fn testAllocator(allocator: Allocator) !void {
203 var prop = try LetterCasing.init(allocator);
204 prop.deinit(allocator);
205}
206
207test "Allocation failure" {
208 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
209}
210
211const std = @import("std"); 136const std = @import("std");
212const builtin = @import("builtin"); 137const builtin = @import("builtin");
213const compress = std.compress;
214const mem = std.mem; 138const mem = std.mem;
215const Allocator = std.mem.Allocator;
216const testing = std.testing; 139const testing = std.testing;
217const unicode = std.unicode; 140const unicode = std.unicode;