diff options
| author | 2026-02-04 18:36:18 -0500 | |
|---|---|---|
| committer | 2026-02-04 18:36:18 -0500 | |
| commit | e476250ea9326b2550847b301c265115ff375a31 (patch) | |
| tree | cf627ced47cecce80020b7a1f30aa51852c0c59b /src/GeneralCategories.zig | |
| parent | Normalization and case folding (diff) | |
| download | zg-e476250ea9326b2550847b301c265115ff375a31.tar.gz zg-e476250ea9326b2550847b301c265115ff375a31.tar.xz zg-e476250ea9326b2550847b301c265115ff375a31.zip | |
Rest of the 'easy' stuff
This gets us up to feature parity with Jacob's work. I want to
eliminate that last allocation using the comptime hash map, and then
see about eliminating allocations from case comparisons as well.
That should just about do it.
Diffstat (limited to 'src/GeneralCategories.zig')
| -rw-r--r-- | src/GeneralCategories.zig | 102 |
1 file changed, 30 insertions, 72 deletions
diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig
index eee7e56..9a383bf 100644
--- a/src/GeneralCategories.zig
+++ b/src/GeneralCategories.zig
| @@ -1,8 +1,19 @@ | |||
| 1 | //! General Categories | 1 | //! General Categories |
| 2 | 2 | ||
| 3 | s1: []u16 = undefined, | 3 | const Data = struct { |
| 4 | s2: []u5 = undefined, | 4 | s1: []const u16 = undefined, |
| 5 | s3: []u5 = undefined, | 5 | s2: []const u5 = undefined, |
| 6 | s3: []const u5 = undefined, | ||
| 7 | }; | ||
| 8 | |||
| 9 | const general_categories = general_categories: { | ||
| 10 | const data = @import("gencat"); | ||
| 11 | break :general_categories Data{ | ||
| 12 | .s1 = &data.s1, | ||
| 13 | .s2 = &data.s2, | ||
| 14 | .s3 = &data.s3, | ||
| 15 | }; | ||
| 16 | }; | ||
| 6 | 17 | ||
| 7 | /// General Category | 18 | /// General Category |
| 8 | pub const Gc = enum { | 19 | pub const Gc = enum { |
| @@ -38,51 +49,14 @@ pub const Gc = enum { | |||
| 38 | Zs, // Separator, Space | 49 | Zs, // Separator, Space |
| 39 | }; | 50 | }; |
| 40 | 51 | ||
| 41 | const GeneralCategories = @This(); | ||
| 42 | |||
| 43 | pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories { | ||
| 44 | var gencat = GeneralCategories{}; | ||
| 45 | try gencat.setup(allocator); | ||
| 46 | return gencat; | ||
| 47 | } | ||
| 48 | |||
| 49 | pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void { | ||
| 50 | const in_bytes = @embedFile("gencat"); | ||
| 51 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 52 | var reader = in_fbs.reader(); | ||
| 53 | |||
| 54 | const endian = builtin.cpu.arch.endian(); | ||
| 55 | |||
| 56 | const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 57 | gencat.s1 = try allocator.alloc(u16, s1_len); | ||
| 58 | errdefer allocator.free(gencat.s1); | ||
| 59 | for (0..s1_len) |i| gencat.s1[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 60 | |||
| 61 | const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 62 | gencat.s2 = try allocator.alloc(u5, s2_len); | ||
| 63 | errdefer allocator.free(gencat.s2); | ||
| 64 | for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable); | ||
| 65 | |||
| 66 | const s3_len: u16 = reader.readInt(u8, endian) catch unreachable; | ||
| 67 | gencat.s3 = try allocator.alloc(u5, s3_len); | ||
| 68 | errdefer allocator.free(gencat.s3); | ||
| 69 | for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable); | ||
| 70 | } | ||
| 71 | |||
| 72 | pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void { | ||
| 73 | allocator.free(gencat.s1); | ||
| 74 | allocator.free(gencat.s2); | ||
| 75 | allocator.free(gencat.s3); | ||
| 76 | } | ||
| 77 | |||
| 78 | /// Lookup the General Category for `cp`. | 52 | /// Lookup the General Category for `cp`. |
| 79 | pub fn gc(gencat: GeneralCategories, cp: u21) Gc { | 53 | pub fn gc(cp: u21) Gc { |
| 80 | return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); | 54 | return @enumFromInt(general_categories.s3[general_categories.s2[general_categories.s1[cp >> 8] + (cp & 0xff)]]); |
| 81 | } | 55 | } |
| 82 | 56 | ||
| 83 | /// True if `cp` has an C general category. | 57 | /// True if `cp` has an C general category. |
| 84 | pub fn isControl(gencat: GeneralCategories, cp: u21) bool { | 58 | pub fn isControl(cp: u21) bool { |
| 85 | return switch (gencat.gc(cp)) { | 59 | return switch (gc(cp)) { |
| 86 | .Cc, | 60 | .Cc, |
| 87 | .Cf, | 61 | .Cf, |
| 88 | .Cn, | 62 | .Cn, |
| @@ -94,8 +68,8 @@ pub fn isControl(gencat: GeneralCategories, cp: u21) bool { | |||
| 94 | } | 68 | } |
| 95 | 69 | ||
| 96 | /// True if `cp` has an L general category. | 70 | /// True if `cp` has an L general category. |
| 97 | pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { | 71 | pub fn isLetter(cp: u21) bool { |
| 98 | return switch (gencat.gc(cp)) { | 72 | return switch (gc(cp)) { |
| 99 | .Ll, | 73 | .Ll, |
| 100 | .Lm, | 74 | .Lm, |
| 101 | .Lo, | 75 | .Lo, |
| @@ -107,8 +81,8 @@ pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { | |||
| 107 | } | 81 | } |
| 108 | 82 | ||
| 109 | /// True if `cp` has an M general category. | 83 | /// True if `cp` has an M general category. |
| 110 | pub fn isMark(gencat: GeneralCategories, cp: u21) bool { | 84 | pub fn isMark(cp: u21) bool { |
| 111 | return switch (gencat.gc(cp)) { | 85 | return switch (gc(cp)) { |
| 112 | .Mc, | 86 | .Mc, |
| 113 | .Me, | 87 | .Me, |
| 114 | .Mn, | 88 | .Mn, |
| @@ -118,8 +92,8 @@ pub fn isMark(gencat: GeneralCategories, cp: u21) bool { | |||
| 118 | } | 92 | } |
| 119 | 93 | ||
| 120 | /// True if `cp` has an N general category. | 94 | /// True if `cp` has an N general category. |
| 121 | pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { | 95 | pub fn isNumber(cp: u21) bool { |
| 122 | return switch (gencat.gc(cp)) { | 96 | return switch (gc(cp)) { |
| 123 | .Nd, | 97 | .Nd, |
| 124 | .Nl, | 98 | .Nl, |
| 125 | .No, | 99 | .No, |
| @@ -129,8 +103,8 @@ pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { | |||
| 129 | } | 103 | } |
| 130 | 104 | ||
| 131 | /// True if `cp` has an P general category. | 105 | /// True if `cp` has an P general category. |
| 132 | pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { | 106 | pub fn isPunctuation(cp: u21) bool { |
| 133 | return switch (gencat.gc(cp)) { | 107 | return switch (gc(cp)) { |
| 134 | .Pc, | 108 | .Pc, |
| 135 | .Pd, | 109 | .Pd, |
| 136 | .Pe, | 110 | .Pe, |
| @@ -144,8 +118,8 @@ pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { | |||
| 144 | } | 118 | } |
| 145 | 119 | ||
| 146 | /// True if `cp` has an S general category. | 120 | /// True if `cp` has an S general category. |
| 147 | pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { | 121 | pub fn isSymbol(cp: u21) bool { |
| 148 | return switch (gencat.gc(cp)) { | 122 | return switch (gc(cp)) { |
| 149 | .Sc, | 123 | .Sc, |
| 150 | .Sk, | 124 | .Sk, |
| 151 | .Sm, | 125 | .Sm, |
| @@ -156,8 +130,8 @@ pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { | |||
| 156 | } | 130 | } |
| 157 | 131 | ||
| 158 | /// True if `cp` has an Z general category. | 132 | /// True if `cp` has an Z general category. |
| 159 | pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { | 133 | pub fn isSeparator(cp: u21) bool { |
| 160 | return switch (gencat.gc(cp)) { | 134 | return switch (gc(cp)) { |
| 161 | .Zl, | 135 | .Zl, |
| 162 | .Zp, | 136 | .Zp, |
| 163 | .Zs, | 137 | .Zs, |
| @@ -165,19 +139,3 @@ pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { | |||
| 165 | else => false, | 139 | else => false, |
| 166 | }; | 140 | }; |
| 167 | } | 141 | } |
| 168 | |||
| 169 | fn testAllocator(allocator: Allocator) !void { | ||
| 170 | var gen_cat = try GeneralCategories.init(allocator); | ||
| 171 | gen_cat.deinit(allocator); | ||
| 172 | } | ||
| 173 | |||
| 174 | test "Allocation failure" { | ||
| 175 | try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); | ||
| 176 | } | ||
| 177 | |||
| 178 | const std = @import("std"); | ||
| 179 | const builtin = @import("builtin"); | ||
| 180 | const compress = std.compress; | ||
| 181 | const mem = std.mem; | ||
| 182 | const testing = std.testing; | ||
| 183 | const Allocator = mem.Allocator; | ||