summary | refs | log | tree | commit | diff
path: root/src/GeneralCategories.zig
diff options
context:
space:
mode:
author Sam Atman 2026-02-04 18:36:18 -0500
committer Sam Atman 2026-02-04 18:36:18 -0500
commit e476250ea9326b2550847b301c265115ff375a31 (patch)
tree cf627ced47cecce80020b7a1f30aa51852c0c59b /src/GeneralCategories.zig
parent Normalization and case folding (diff)
download zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz
zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz
zg-e476250ea9326b2550847b301c265115ff375a31.zip
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it.
Diffstat (limited to 'src/GeneralCategories.zig')
-rw-r--r-- src/GeneralCategories.zig 102
1 files changed, 30 insertions, 72 deletions
diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig
index eee7e56..9a383bf 100644
--- a/src/GeneralCategories.zig
+++ b/src/GeneralCategories.zig
@@ -1,8 +1,19 @@
1//! General Categories 1//! General Categories
2 2
3s1: []u16 = undefined, 3const Data = struct {
4s2: []u5 = undefined, 4 s1: []const u16 = undefined,
5s3: []u5 = undefined, 5 s2: []const u5 = undefined,
6 s3: []const u5 = undefined,
7};
8
9const general_categories = general_categories: {
10 const data = @import("gencat");
11 break :general_categories Data{
12 .s1 = &data.s1,
13 .s2 = &data.s2,
14 .s3 = &data.s3,
15 };
16};
6 17
7/// General Category 18/// General Category
8pub const Gc = enum { 19pub const Gc = enum {
@@ -38,51 +49,14 @@ pub const Gc = enum {
38 Zs, // Separator, Space 49 Zs, // Separator, Space
39}; 50};
40 51
41const GeneralCategories = @This();
42
43pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories {
44 var gencat = GeneralCategories{};
45 try gencat.setup(allocator);
46 return gencat;
47}
48
49pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void {
50 const in_bytes = @embedFile("gencat");
51 var in_fbs = std.io.fixedBufferStream(in_bytes);
52 var reader = in_fbs.reader();
53
54 const endian = builtin.cpu.arch.endian();
55
56 const s1_len: u16 = reader.readInt(u16, endian) catch unreachable;
57 gencat.s1 = try allocator.alloc(u16, s1_len);
58 errdefer allocator.free(gencat.s1);
59 for (0..s1_len) |i| gencat.s1[i] = reader.readInt(u16, endian) catch unreachable;
60
61 const s2_len: u16 = reader.readInt(u16, endian) catch unreachable;
62 gencat.s2 = try allocator.alloc(u5, s2_len);
63 errdefer allocator.free(gencat.s2);
64 for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable);
65
66 const s3_len: u16 = reader.readInt(u8, endian) catch unreachable;
67 gencat.s3 = try allocator.alloc(u5, s3_len);
68 errdefer allocator.free(gencat.s3);
69 for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable);
70}
71
72pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void {
73 allocator.free(gencat.s1);
74 allocator.free(gencat.s2);
75 allocator.free(gencat.s3);
76}
77
78/// Lookup the General Category for `cp`. 52/// Lookup the General Category for `cp`.
79pub fn gc(gencat: GeneralCategories, cp: u21) Gc { 53pub fn gc(cp: u21) Gc {
80 return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); 54 return @enumFromInt(general_categories.s3[general_categories.s2[general_categories.s1[cp >> 8] + (cp & 0xff)]]);
81} 55}
82 56
83/// True if `cp` has a C general category. 57/// True if `cp` has a C general category.
84pub fn isControl(gencat: GeneralCategories, cp: u21) bool { 58pub fn isControl(cp: u21) bool {
85 return switch (gencat.gc(cp)) { 59 return switch (gc(cp)) {
86 .Cc, 60 .Cc,
87 .Cf, 61 .Cf,
88 .Cn, 62 .Cn,
@@ -94,8 +68,8 @@ pub fn isControl(gencat: GeneralCategories, cp: u21) bool {
94} 68}
95 69
96/// True if `cp` has an L general category. 70/// True if `cp` has an L general category.
97pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { 71pub fn isLetter(cp: u21) bool {
98 return switch (gencat.gc(cp)) { 72 return switch (gc(cp)) {
99 .Ll, 73 .Ll,
100 .Lm, 74 .Lm,
101 .Lo, 75 .Lo,
@@ -107,8 +81,8 @@ pub fn isLetter(gencat: GeneralCategories, cp: u21) bool {
107} 81}
108 82
109/// True if `cp` has an M general category. 83/// True if `cp` has an M general category.
110pub fn isMark(gencat: GeneralCategories, cp: u21) bool { 84pub fn isMark(cp: u21) bool {
111 return switch (gencat.gc(cp)) { 85 return switch (gc(cp)) {
112 .Mc, 86 .Mc,
113 .Me, 87 .Me,
114 .Mn, 88 .Mn,
@@ -118,8 +92,8 @@ pub fn isMark(gencat: GeneralCategories, cp: u21) bool {
118} 92}
119 93
120/// True if `cp` has an N general category. 94/// True if `cp` has an N general category.
121pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { 95pub fn isNumber(cp: u21) bool {
122 return switch (gencat.gc(cp)) { 96 return switch (gc(cp)) {
123 .Nd, 97 .Nd,
124 .Nl, 98 .Nl,
125 .No, 99 .No,
@@ -129,8 +103,8 @@ pub fn isNumber(gencat: GeneralCategories, cp: u21) bool {
129} 103}
130 104
131/// True if `cp` has a P general category. 105/// True if `cp` has a P general category.
132pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { 106pub fn isPunctuation(cp: u21) bool {
133 return switch (gencat.gc(cp)) { 107 return switch (gc(cp)) {
134 .Pc, 108 .Pc,
135 .Pd, 109 .Pd,
136 .Pe, 110 .Pe,
@@ -144,8 +118,8 @@ pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool {
144} 118}
145 119
146/// True if `cp` has an S general category. 120/// True if `cp` has an S general category.
147pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { 121pub fn isSymbol(cp: u21) bool {
148 return switch (gencat.gc(cp)) { 122 return switch (gc(cp)) {
149 .Sc, 123 .Sc,
150 .Sk, 124 .Sk,
151 .Sm, 125 .Sm,
@@ -156,8 +130,8 @@ pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool {
156} 130}
157 131
158/// True if `cp` has a Z general category. 132/// True if `cp` has a Z general category.
159pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { 133pub fn isSeparator(cp: u21) bool {
160 return switch (gencat.gc(cp)) { 134 return switch (gc(cp)) {
161 .Zl, 135 .Zl,
162 .Zp, 136 .Zp,
163 .Zs, 137 .Zs,
@@ -165,19 +139,3 @@ pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool {
165 else => false, 139 else => false,
166 }; 140 };
167} 141}
168
169fn testAllocator(allocator: Allocator) !void {
170 var gen_cat = try GeneralCategories.init(allocator);
171 gen_cat.deinit(allocator);
172}
173
174test "Allocation failure" {
175 try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
176}
177
178const std = @import("std");
179const builtin = @import("builtin");
180const compress = std.compress;
181const mem = std.mem;
182const testing = std.testing;
183const Allocator = mem.Allocator;