summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Sam Atman, 2026-02-04 18:36:18 -0500
committer: Sam Atman, 2026-02-04 18:36:18 -0500
commit: e476250ea9326b2550847b301c265115ff375a31 (patch)
tree: cf627ced47cecce80020b7a1f30aa51852c0c59b /src
parent: Normalization and case folding (diff)
download: zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz
zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz
zg-e476250ea9326b2550847b301c265115ff375a31.zip
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it.
Diffstat (limited to 'src')
-rw-r--r-- src/GeneralCategories.zig 102
-rw-r--r-- src/LetterCasing.zig 179
-rw-r--r-- src/Properties.zig 195
-rw-r--r-- src/Scripts.zig 82
4 files changed, 162 insertions, 396 deletions
diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig
index eee7e56..9a383bf 100644
--- a/src/GeneralCategories.zig
+++ b/src/GeneralCategories.zig
@@ -1,8 +1,19 @@
1//! General Categories 1//! General Categories
2 2
3s1: []u16 = undefined, 3const Data = struct {
4s2: []u5 = undefined, 4 s1: []const u16 = undefined,
5s3: []u5 = undefined, 5 s2: []const u5 = undefined,
6 s3: []const u5 = undefined,
7};
8
9const general_categories = general_categories: {
10 const data = @import("gencat");
11 break :general_categories Data{
12 .s1 = &data.s1,
13 .s2 = &data.s2,
14 .s3 = &data.s3,
15 };
16};
6 17
7/// General Category 18/// General Category
8pub const Gc = enum { 19pub const Gc = enum {
@@ -38,51 +49,14 @@ pub const Gc = enum {
38 Zs, // Separator, Space 49 Zs, // Separator, Space
39}; 50};
40 51
41const GeneralCategories = @This();
42
43pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories {
44 var gencat = GeneralCategories{};
45 try gencat.setup(allocator);
46 return gencat;
47}
48
49pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void {
50 const in_bytes = @embedFile("gencat");
51 var in_fbs = std.io.fixedBufferStream(in_bytes);
52 var reader = in_fbs.reader();
53
54 const endian = builtin.cpu.arch.endian();
55
56 const s1_len: u16 = reader.readInt(u16, endian) catch unreachable;
57 gencat.s1 = try allocator.alloc(u16, s1_len);
58 errdefer allocator.free(gencat.s1);
59 for (0..s1_len) |i| gencat.s1[i] = reader.readInt(u16, endian) catch unreachable;
60
61 const s2_len: u16 = reader.readInt(u16, endian) catch unreachable;
62 gencat.s2 = try allocator.alloc(u5, s2_len);
63 errdefer allocator.free(gencat.s2);
64 for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable);
65
66 const s3_len: u16 = reader.readInt(u8, endian) catch unreachable;
67 gencat.s3 = try allocator.alloc(u5, s3_len);
68 errdefer allocator.free(gencat.s3);
69 for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable);
70}
71
72pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void {
73 allocator.free(gencat.s1);
74 allocator.free(gencat.s2);
75 allocator.free(gencat.s3);
76}
77
78/// Lookup the General Category for `cp`. 52/// Lookup the General Category for `cp`.
79pub fn gc(gencat: GeneralCategories, cp: u21) Gc { 53pub fn gc(cp: u21) Gc {
80 return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); 54 return @enumFromInt(general_categories.s3[general_categories.s2[general_categories.s1[cp >> 8] + (cp & 0xff)]]);
81} 55}
82 56
83/// True if `cp` has an C general category. 57/// True if `cp` has an C general category.
84pub fn isControl(gencat: GeneralCategories, cp: u21) bool { 58pub fn isControl(cp: u21) bool {
85 return switch (gencat.gc(cp)) { 59 return switch (gc(cp)) {
86 .Cc, 60 .Cc,
87 .Cf, 61 .Cf,
88 .Cn, 62 .Cn,
@@ -94,8 +68,8 @@ pub fn isControl(gencat: GeneralCategories, cp: u21) bool {
94} 68}
95 69
96/// True if `cp` has an L general category. 70/// True if `cp` has an L general category.
97pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { 71pub fn isLetter(cp: u21) bool {
98 return switch (gencat.gc(cp)) { 72 return switch (gc(cp)) {
99 .Ll, 73 .Ll,
100 .Lm, 74 .Lm,
101 .Lo, 75 .Lo,
@@ -107,8 +81,8 @@ pub fn isLetter(gencat: GeneralCategories, cp: u21) bool {
107} 81}
108 82
109/// True if `cp` has an M general category. 83/// True if `cp` has an M general category.
110pub fn isMark(gencat: GeneralCategories, cp: u21) bool { 84pub fn isMark(cp: u21) bool {
111 return switch (gencat.gc(cp)) { 85 return switch (gc(cp)) {
112 .Mc, 86 .Mc,
113 .Me, 87 .Me,
114 .Mn, 88 .Mn,
@@ -118,8 +92,8 @@ pub fn isMark(gencat: GeneralCategories, cp: u21) bool {
118} 92}
119 93
120/// True if `cp` has an N general category. 94/// True if `cp` has an N general category.
121pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { 95pub fn isNumber(cp: u21) bool {
122 return switch (gencat.gc(cp)) { 96 return switch (gc(cp)) {
123 .Nd, 97 .Nd,
124 .Nl, 98 .Nl,
125 .No, 99 .No,
@@ -129,8 +103,8 @@ pub fn isNumber(gencat: GeneralCategories, cp: u21) bool {
129} 103}
130 104
131/// True if `cp` has an P general category. 105/// True if `cp` has an P general category.
132pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { 106pub fn isPunctuation(cp: u21) bool {
133 return switch (gencat.gc(cp)) { 107 return switch (gc(cp)) {
134 .Pc, 108 .Pc,
135 .Pd, 109 .Pd,
136 .Pe, 110 .Pe,
@@ -144,8 +118,8 @@ pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool {
144} 118}
145 119
146/// True if `cp` has an S general category. 120/// True if `cp` has an S general category.
147pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { 121pub fn isSymbol(cp: u21) bool {
148 return switch (gencat.gc(cp)) { 122 return switch (gc(cp)) {
149 .Sc, 123 .Sc,
150 .Sk, 124 .Sk,
151 .Sm, 125 .Sm,
@@ -156,8 +130,8 @@ pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool {
156} 130}
157 131
158/// True if `cp` has an Z general category. 132/// True if `cp` has an Z general category.
159pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { 133pub fn isSeparator(cp: u21) bool {
160 return switch (gencat.gc(cp)) { 134 return switch (gc(cp)) {
161 .Zl, 135 .Zl,
162 .Zp, 136 .Zp,
163 .Zs, 137 .Zs,
@@ -165,19 +139,3 @@ pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool {
165 else => false, 139 else => false,
166 }; 140 };
167} 141}
168
169fn testAllocator(allocator: Allocator) !void {
170 var gen_cat = try GeneralCategories.init(allocator);
171 gen_cat.deinit(allocator);
172}
173
174test "Allocation failure" {
175 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
176}
177
178const std = @import("std");
179const builtin = @import("builtin");
180const compress = std.compress;
181const mem = std.mem;
182const testing = std.testing;
183const Allocator = mem.Allocator;
diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig
index 33096fc..24b67a0 100644
--- a/src/LetterCasing.zig
+++ b/src/LetterCasing.zig
@@ -1,120 +1,58 @@
1const CodePointIterator = @import("code_point").Iterator; 1const CodePointIterator = @import("code_point").Iterator;
2 2const GeneralCategories = @import("GeneralCategories");
3case_map: [][2]u21 = undefined, 3
4prop_s1: []u16 = undefined, 4const Data = struct {
5prop_s2: []u8 = undefined, 5 s1: []const u16 = undefined,
6 6 s2: []const u44 = undefined,
7const LetterCasing = @This(); 7};
8 8
9pub fn init(allocator: Allocator) Allocator.Error!LetterCasing { 9const letter_casing = letter_casing: {
10 var case = LetterCasing{}; 10 const data = @import("case");
11 try case.setup(allocator); 11 break :letter_casing Data{
12 return case; 12 .s1 = &data.s1,
13} 13 .s2 = &data.s2,
14
15pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void {
16 case.setupInner(allocator) catch |err| {
17 switch (err) {
18 error.OutOfMemory => |e| return e,
19 else => unreachable,
20 }
21 }; 14 };
22} 15};
23
24inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void {
25 const endian = builtin.cpu.arch.endian();
26
27 self.case_map = try allocator.alloc([2]u21, 0x110000);
28 errdefer allocator.free(self.case_map);
29
30 for (0..0x110000) |i| {
31 const cp: u21 = @intCast(i);
32 self.case_map[cp] = .{ cp, cp };
33 }
34
35 // Uppercase
36 const upper_bytes = @embedFile("upper");
37 var upper_fbs = std.io.fixedBufferStream(upper_bytes);
38 var upper_reader = upper_fbs.reader();
39
40 while (true) {
41 const cp = try upper_reader.readInt(i24, endian);
42 if (cp == 0) break;
43 const diff = try upper_reader.readInt(i24, endian);
44 self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
45 }
46
47 // Lowercase
48 const lower_bytes = @embedFile("lower");
49 var lower_fbs = std.io.fixedBufferStream(lower_bytes);
50 var lower_reader = lower_fbs.reader();
51
52 while (true) {
53 const cp = try lower_reader.readInt(i24, endian);
54 if (cp == 0) break;
55 const diff = try lower_reader.readInt(i24, endian);
56 self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
57 }
58
59 // Case properties
60 const cp_bytes = @embedFile("case_prop");
61 var cp_fbs = std.io.fixedBufferStream(cp_bytes);
62 var cp_reader = cp_fbs.reader();
63
64 const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
65 self.prop_s1 = try allocator.alloc(u16, stage_1_len);
66 errdefer allocator.free(self.prop_s1);
67 for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);
68
69 const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
70 self.prop_s2 = try allocator.alloc(u8, stage_2_len);
71 errdefer allocator.free(self.prop_s2);
72 _ = try cp_reader.readAll(self.prop_s2);
73}
74
75pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void {
76 allocator.free(self.case_map);
77 allocator.free(self.prop_s1);
78 allocator.free(self.prop_s2);
79}
80 16
81// Returns true if `cp` is either upper, lower, or title case. 17// Returns true if `cp` is either upper, lower, or title case.
82pub fn isCased(self: LetterCasing, cp: u21) bool { 18pub fn isCased(cp: u21) bool {
83 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 19 return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt;
84} 20}
85 21
86// Returns true if `cp` is uppercase. 22// Returns true if `cp` is uppercase.
87pub fn isUpper(self: LetterCasing, cp: u21) bool { 23pub fn isUpper(cp: u21) bool {
88 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 24 // isUpper is true if we have a mapping to a lower character (bit 1)
25 return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
89} 26}
90 27
91/// Returns true if `str` is all uppercase. 28/// Returns true if `str` is all non-lowercase.
92pub fn isUpperStr(self: LetterCasing, str: []const u8) bool { 29pub fn isUpperStr(str: []const u8) bool {
93 var iter = CodePointIterator{ .bytes = str }; 30 var iter = CodePointIterator{ .bytes = str };
94 31
95 return while (iter.next()) |cp| { 32 return while (iter.next()) |cp| {
96 if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; 33 if (isLower(cp.code)) break false;
97 } else true; 34 } else true;
98} 35}
99 36
100test "isUpperStr" { 37test "isUpperStr" {
101 const cd = try init(testing.allocator); 38 try testing.expect(isUpperStr("HELLO, WORLD 2112!"));
102 defer cd.deinit(testing.allocator); 39 try testing.expect(!isUpperStr("hello, world 2112!"));
103 40 try testing.expect(!isUpperStr("Hello, World 2112!"));
104 try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
105 try testing.expect(!cd.isUpperStr("hello, world 2112!"));
106 try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
107} 41}
108 42
109/// Returns uppercase mapping for `cp`. 43/// Returns uppercase mapping for `cp`.
110pub fn toUpper(self: LetterCasing, cp: u21) u21 { 44pub fn toUpper(cp: u21) u21 {
111 return self.case_map[cp][0]; 45 const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
46 if (case_prop & 2 == 2) {
47 return @intCast(case_prop >> (21 + 2));
48 } else {
49 return cp;
50 }
112} 51}
113 52
114/// Returns a new string with all letters in uppercase. 53/// Returns a new string with all letters in uppercase.
115/// Caller must free returned bytes with `allocator`. 54/// Caller must free returned bytes with `allocator`.
116pub fn toUpperStr( 55pub fn toUpperStr(
117 self: LetterCasing,
118 allocator: mem.Allocator, 56 allocator: mem.Allocator,
119 str: []const u8, 57 str: []const u8,
120) ![]u8 { 58) ![]u8 {
@@ -125,7 +63,7 @@ pub fn toUpperStr(
125 var buf: [4]u8 = undefined; 63 var buf: [4]u8 = undefined;
126 64
127 while (iter.next()) |cp| { 65 while (iter.next()) |cp| {
128 const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf); 66 const len = try unicode.utf8Encode(toUpper(cp.code), &buf);
129 try bytes.appendSlice(buf[0..len]); 67 try bytes.appendSlice(buf[0..len]);
130 } 68 }
131 69
@@ -133,46 +71,45 @@ pub fn toUpperStr(
133} 71}
134 72
135test "toUpperStr" { 73test "toUpperStr" {
136 const cd = try init(testing.allocator); 74 const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!");
137 defer cd.deinit(testing.allocator);
138
139 const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
140 defer testing.allocator.free(uppered); 75 defer testing.allocator.free(uppered);
141 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); 76 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
142} 77}
143 78
144// Returns true if `cp` is lowercase. 79// Returns true if `cp` is lowercase.
145pub fn isLower(self: LetterCasing, cp: u21) bool { 80pub fn isLower(cp: u21) bool {
146 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 81 // isLower is true if we have a mapping to an upper character (bit 2)
82 return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
147} 83}
148 84
149/// Returns true if `str` is all lowercase. 85/// Returns true if `str` is all non-uppercase.
150pub fn isLowerStr(self: LetterCasing, str: []const u8) bool { 86pub fn isLowerStr(str: []const u8) bool {
151 var iter = CodePointIterator{ .bytes = str }; 87 var iter = CodePointIterator{ .bytes = str };
152 88
153 return while (iter.next()) |cp| { 89 return while (iter.next()) |cp| {
154 if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; 90 if (isUpper(cp.code)) break false;
155 } else true; 91 } else true;
156} 92}
157 93
158test "isLowerStr" { 94test "isLowerStr" {
159 const cd = try init(testing.allocator); 95 try testing.expect(isLowerStr("hello, world 2112!"));
160 defer cd.deinit(testing.allocator); 96 try testing.expect(!isLowerStr("HELLO, WORLD 2112!"));
161 97 try testing.expect(!isLowerStr("Hello, World 2112!"));
162 try testing.expect(cd.isLowerStr("hello, world 2112!"));
163 try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!"));
164 try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
165} 98}
166 99
167/// Returns lowercase mapping for `cp`. 100/// Returns lowercase mapping for `cp`.
168pub fn toLower(self: LetterCasing, cp: u21) u21 { 101pub fn toLower(cp: u21) u21 {
169 return self.case_map[cp][1]; 102 const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
103 if (case_prop & 1 == 1) {
104 return @intCast((case_prop >> 2) & 0x1FFFFF);
105 } else {
106 return cp;
107 }
170} 108}
171 109
172/// Returns a new string with all letters in lowercase. 110/// Returns a new string with all letters in lowercase.
173/// Caller must free returned bytes with `allocator`. 111/// Caller must free returned bytes with `allocator`.
174pub fn toLowerStr( 112pub fn toLowerStr(
175 self: LetterCasing,
176 allocator: mem.Allocator, 113 allocator: mem.Allocator,
177 str: []const u8, 114 str: []const u8,
178) ![]u8 { 115) ![]u8 {
@@ -183,7 +120,7 @@ pub fn toLowerStr(
183 var buf: [4]u8 = undefined; 120 var buf: [4]u8 = undefined;
184 121
185 while (iter.next()) |cp| { 122 while (iter.next()) |cp| {
186 const len = try unicode.utf8Encode(self.toLower(cp.code), &buf); 123 const len = try unicode.utf8Encode(toLower(cp.code), &buf);
187 try bytes.appendSlice(buf[0..len]); 124 try bytes.appendSlice(buf[0..len]);
188 } 125 }
189 126
@@ -191,27 +128,13 @@ pub fn toLowerStr(
191} 128}
192 129
193test "toLowerStr" { 130test "toLowerStr" {
194 const cd = try init(testing.allocator); 131 const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!");
195 defer cd.deinit(testing.allocator);
196
197 const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
198 defer testing.allocator.free(lowered); 132 defer testing.allocator.free(lowered);
199 try testing.expectEqualStrings("hello, world 2112!", lowered); 133 try testing.expectEqualStrings("hello, world 2112!", lowered);
200} 134}
201 135
202fn testAllocator(allocator: Allocator) !void {
203 var prop = try LetterCasing.init(allocator);
204 prop.deinit(allocator);
205}
206
207test "Allocation failure" {
208 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
209}
210
211const std = @import("std"); 136const std = @import("std");
212const builtin = @import("builtin"); 137const builtin = @import("builtin");
213const compress = std.compress;
214const mem = std.mem; 138const mem = std.mem;
215const Allocator = std.mem.Allocator;
216const testing = std.testing; 139const testing = std.testing;
217const unicode = std.unicode; 140const unicode = std.unicode;
diff --git a/src/Properties.zig b/src/Properties.zig
index 432d176..f8c7cfc 100644
--- a/src/Properties.zig
+++ b/src/Properties.zig
@@ -1,177 +1,108 @@
1//! Properties module 1//! Properties module
2 2
3core_s1: []u16 = undefined, 3const Data = struct {
4core_s2: []u8 = undefined, 4 core_s1: []const u16 = undefined,
5props_s1: []u16 = undefined, 5 core_s2: []const u8 = undefined,
6props_s2: []u8 = undefined, 6 props_s1: []const u16 = undefined,
7num_s1: []u16 = undefined, 7 props_s2: []const u8 = undefined,
8num_s2: []u8 = undefined, 8 num_s1: []const u16 = undefined,
9 9 num_s2: []const u8 = undefined,
10const Properties = @This(); 10};
11 11
12pub fn init(allocator: Allocator) Allocator.Error!Properties { 12const properties = properties: {
13 var props = Properties{}; 13 const core_props = @import("core_props");
14 try props.setup(allocator); 14 const props_data = @import("props");
15 return props; 15 const numeric = @import("numeric");
16} 16 break :properties Data{
17 17 .core_s1 = &core_props.s1,
18pub fn setup(props: *Properties, allocator: Allocator) Allocator.Error!void { 18 .core_s2 = &core_props.s2,
19 props.setupInner(allocator) catch |err| { 19 .props_s1 = &props_data.s1,
20 switch (err) { 20 .props_s2 = &props_data.s2,
21 error.OutOfMemory => |e| return e, 21 .num_s1 = &numeric.s1,
22 else => unreachable, 22 .num_s2 = &numeric.s2,
23 }
24 }; 23 };
25} 24};
26
27inline fn setupInner(props: *Properties, allocator: Allocator) !void {
28 const endian = builtin.cpu.arch.endian();
29
30 // Process DerivedCoreProperties.txt
31 const core_bytes = @embedFile("core_props");
32 var core_fbs = std.io.fixedBufferStream(core_bytes);
33 var core_reader = core_fbs.reader();
34
35 const core_stage_1_len: u16 = try core_reader.readInt(u16, endian);
36 props.core_s1 = try allocator.alloc(u16, core_stage_1_len);
37 errdefer allocator.free(props.core_s1);
38 for (0..core_stage_1_len) |i| props.core_s1[i] = try core_reader.readInt(u16, endian);
39
40 const core_stage_2_len: u16 = try core_reader.readInt(u16, endian);
41 props.core_s2 = try allocator.alloc(u8, core_stage_2_len);
42 errdefer allocator.free(props.core_s2);
43 _ = try core_reader.readAll(props.core_s2);
44
45 // Process PropList.txt
46 const props_bytes = @embedFile("props");
47 var props_fbs = std.io.fixedBufferStream(props_bytes);
48 var props_reader = props_fbs.reader();
49
50 const stage_1_len: u16 = try props_reader.readInt(u16, endian);
51 props.props_s1 = try allocator.alloc(u16, stage_1_len);
52 errdefer allocator.free(props.props_s1);
53 for (0..stage_1_len) |i| props.props_s1[i] = try props_reader.readInt(u16, endian);
54
55 const stage_2_len: u16 = try props_reader.readInt(u16, endian);
56 props.props_s2 = try allocator.alloc(u8, stage_2_len);
57 errdefer allocator.free(props.props_s2);
58 _ = try props_reader.readAll(props.props_s2);
59
60 // Process DerivedNumericType.txt
61 const num_bytes = @embedFile("numeric");
62 var num_fbs = std.io.fixedBufferStream(num_bytes);
63 var num_reader = num_fbs.reader();
64
65 const num_stage_1_len: u16 = try num_reader.readInt(u16, endian);
66 props.num_s1 = try allocator.alloc(u16, num_stage_1_len);
67 errdefer allocator.free(props.num_s1);
68 for (0..num_stage_1_len) |i| props.num_s1[i] = try num_reader.readInt(u16, endian);
69
70 const num_stage_2_len: u16 = try num_reader.readInt(u16, endian);
71 props.num_s2 = try allocator.alloc(u8, num_stage_2_len);
72 errdefer allocator.free(props.num_s2);
73 _ = try num_reader.readAll(props.num_s2);
74}
75 25
76pub fn deinit(self: *const Properties, allocator: Allocator) void { 26const Properties = @This();
77 allocator.free(self.core_s1);
78 allocator.free(self.core_s2);
79 allocator.free(self.props_s1);
80 allocator.free(self.props_s2);
81 allocator.free(self.num_s1);
82 allocator.free(self.num_s2);
83}
84 27
85/// True if `cp` is a mathematical symbol. 28/// True if `cp` is a mathematical symbol.
86pub fn isMath(self: Properties, cp: u21) bool { 29pub fn isMath(cp: u21) bool {
87 return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 30 return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
88} 31}
89 32
90/// True if `cp` is an alphabetic character. 33/// True if `cp` is an alphabetic character.
91pub fn isAlphabetic(self: Properties, cp: u21) bool { 34pub fn isAlphabetic(cp: u21) bool {
92 return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 35 return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
93} 36}
94 37
95/// True if `cp` is a valid identifier start character. 38/// True if `cp` is a valid identifier start character.
96pub fn isIdStart(self: Properties, cp: u21) bool { 39pub fn isIdStart(cp: u21) bool {
97 return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 40 return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
98} 41}
99 42
100/// True if `cp` is a valid identifier continuation character. 43/// True if `cp` is a valid identifier continuation character.
101pub fn isIdContinue(self: Properties, cp: u21) bool { 44pub fn isIdContinue(cp: u21) bool {
102 return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8; 45 return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 8 == 8;
103} 46}
104 47
105/// True if `cp` is a valid extended identifier start character. 48/// True if `cp` is a valid extended identifier start character.
106pub fn isXidStart(self: Properties, cp: u21) bool { 49pub fn isXidStart(cp: u21) bool {
107 return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16; 50 return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 16 == 16;
108} 51}
109 52
110/// True if `cp` is a valid extended identifier continuation character. 53/// True if `cp` is a valid extended identifier continuation character.
111pub fn isXidContinue(self: Properties, cp: u21) bool { 54pub fn isXidContinue(cp: u21) bool {
112 return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32; 55 return properties.core_s2[properties.core_s1[cp >> 8] + (cp & 0xff)] & 32 == 32;
113} 56}
114 57
115/// True if `cp` is a whitespace character. 58/// True if `cp` is a whitespace character.
116pub fn isWhitespace(self: Properties, cp: u21) bool { 59pub fn isWhitespace(cp: u21) bool {
117 return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 60 return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
118} 61}
119 62
120/// True if `cp` is a hexadecimal digit. 63/// True if `cp` is a hexadecimal digit.
121pub fn isHexDigit(self: Properties, cp: u21) bool { 64pub fn isHexDigit(cp: u21) bool {
122 return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 65 return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
123} 66}
124 67
125/// True if `cp` is a diacritic mark. 68/// True if `cp` is a diacritic mark.
126pub fn isDiacritic(self: Properties, cp: u21) bool { 69pub fn isDiacritic(cp: u21) bool {
127 return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 70 return properties.props_s2[properties.props_s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
128} 71}
129 72
130/// True if `cp` is numeric. 73/// True if `cp` is numeric.
131pub fn isNumeric(self: Properties, cp: u21) bool { 74pub fn isNumeric(cp: u21) bool {
132 return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 75 return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
133} 76}
134 77
135/// True if `cp` is a digit. 78/// True if `cp` is a digit.
136pub fn isDigit(self: Properties, cp: u21) bool { 79pub fn isDigit(cp: u21) bool {
137 return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 80 return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
138} 81}
139 82
140/// True if `cp` is decimal. 83/// True if `cp` is decimal.
141pub fn isDecimal(self: Properties, cp: u21) bool { 84pub fn isDecimal(cp: u21) bool {
142 return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4; 85 return properties.num_s2[properties.num_s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
143} 86}
144 87
145test "Props" { 88test "Props" {
146 const self = try init(testing.allocator); 89 try testing.expect(Properties.isHexDigit('F'));
147 defer self.deinit(testing.allocator); 90 try testing.expect(Properties.isHexDigit('a'));
148 91 try testing.expect(Properties.isHexDigit('8'));
149 try testing.expect(self.isHexDigit('F')); 92 try testing.expect(!Properties.isHexDigit('z'));
150 try testing.expect(self.isHexDigit('a')); 93
151 try testing.expect(self.isHexDigit('8')); 94 try testing.expect(Properties.isDiacritic('\u{301}'));
152 try testing.expect(!self.isHexDigit('z')); 95 try testing.expect(Properties.isAlphabetic('A'));
153 96 try testing.expect(!Properties.isAlphabetic('3'));
154 try testing.expect(self.isDiacritic('\u{301}')); 97 try testing.expect(Properties.isMath('+'));
155 try testing.expect(self.isAlphabetic('A')); 98
156 try testing.expect(!self.isAlphabetic('3')); 99 try testing.expect(Properties.isNumeric('\u{277f}'));
157 try testing.expect(self.isMath('+')); 100 try testing.expect(Properties.isDigit('\u{2070}'));
158 101 try testing.expect(Properties.isDecimal('3'));
159 try testing.expect(self.isNumeric('\u{277f}')); 102
160 try testing.expect(self.isDigit('\u{2070}')); 103 try testing.expect(!Properties.isNumeric('1'));
161 try testing.expect(self.isDecimal('3')); 104 try testing.expect(!Properties.isDigit('2'));
162 105 try testing.expect(!Properties.isDecimal('g'));
163 try testing.expect(!self.isNumeric('1'));
164 try testing.expect(!self.isDigit('2'));
165 try testing.expect(!self.isDecimal('g'));
166}
167
168fn testAllocator(allocator: Allocator) !void {
169 var prop = try Properties.init(allocator);
170 prop.deinit(allocator);
171}
172
173test "Allocation failure" {
174 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
175} 106}
176 107
177const std = @import("std"); 108const std = @import("std");
diff --git a/src/Scripts.zig b/src/Scripts.zig
index 719b01f..4938318 100644
--- a/src/Scripts.zig
+++ b/src/Scripts.zig
@@ -1,8 +1,18 @@
1//! Scripts Module 1//! Scripts Module
2const Data = struct {
3 s1: []const u16 = undefined,
4 s2: []const u8 = undefined,
5 s3: []const u8 = undefined,
6};
2 7
3s1: []u16 = undefined, 8const scripts = scripts: {
4s2: []u8 = undefined, 9 const data = @import("script");
5s3: []u8 = undefined, 10 break :scripts Data{
11 .s1 = &data.s1,
12 .s2 = &data.s2,
13 .s3 = &data.s3,
14 };
15};
6 16
7/// Scripts enum 17/// Scripts enum
8pub const Script = enum { 18pub const Script = enum {
@@ -178,76 +188,20 @@ pub const Script = enum {
178 Yi, 188 Yi,
179 Zanabazar_Square, 189 Zanabazar_Square,
180}; 190};
181const Scripts = @This();
182
183pub fn init(allocator: Allocator) Allocator.Error!Scripts {
184 var scripts = Scripts{};
185 try scripts.setup(allocator);
186 return scripts;
187}
188
189pub fn setup(scripts: *Scripts, allocator: Allocator) Allocator.Error!void {
190 scripts.setupInner(allocator) catch |err| {
191 switch (err) {
192 error.OutOfMemory => |e| return e,
193 else => unreachable,
194 }
195 };
196}
197
198inline fn setupInner(scripts: *Scripts, allocator: mem.Allocator) !void {
199 const in_bytes = @embedFile("scripts");
200 var in_fbs = std.io.fixedBufferStream(in_bytes);
201 var reader = in_fbs.reader();
202
203 const endian = builtin.cpu.arch.endian();
204
205 const s1_len: u16 = try reader.readInt(u16, endian);
206 scripts.s1 = try allocator.alloc(u16, s1_len);
207 errdefer allocator.free(scripts.s1);
208 for (0..s1_len) |i| scripts.s1[i] = try reader.readInt(u16, endian);
209
210 const s2_len: u16 = try reader.readInt(u16, endian);
211 scripts.s2 = try allocator.alloc(u8, s2_len);
212 errdefer allocator.free(scripts.s2);
213 _ = try reader.readAll(scripts.s2);
214
215 const s3_len: u16 = try reader.readInt(u8, endian);
216 scripts.s3 = try allocator.alloc(u8, s3_len);
217 errdefer allocator.free(scripts.s3);
218 _ = try reader.readAll(scripts.s3);
219}
220
221pub fn deinit(self: *const Scripts, allocator: mem.Allocator) void {
222 allocator.free(self.s1);
223 allocator.free(self.s2);
224 allocator.free(self.s3);
225}
226 191
227/// Lookup the Script type for `cp`. 192/// Lookup the Script type for `cp`.
228pub fn script(self: Scripts, cp: u21) ?Script { 193pub fn script(cp: u21) ?Script {
229 const byte = self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]; 194 const byte = scripts.s3[scripts.s2[scripts.s1[cp >> 8] + (cp & 0xff)]];
230 if (byte == 0) return null; 195 if (byte == 0) return null;
231 return @enumFromInt(byte); 196 return @enumFromInt(byte);
232} 197}
233 198
234test "script" { 199test "script" {
235 const self = try init(std.testing.allocator); 200 try testing.expectEqual(Script.Latin, script('A').?);
236 defer self.deinit(std.testing.allocator); 201 // try testing.expectEqual(Script.Deseret, script('𐐌').?);
237 try testing.expectEqual(Script.Latin, self.script('A').?);
238}
239
240fn testAllocator(allocator: Allocator) !void {
241 var prop = try Scripts.init(allocator);
242 prop.deinit(allocator);
243}
244
245test "Allocation failure" {
246 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
247} 202}
248 203
249const std = @import("std"); 204const std = @import("std");
250const builtin = @import("builtin"); 205const builtin = @import("builtin");
251const mem = std.mem; 206const unicode = std.unicode;
252const Allocator = mem.Allocator;
253const testing = std.testing; 207const testing = std.testing;