From e476250ea9326b2550847b301c265115ff375a31 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 4 Feb 2026 18:36:18 -0500 Subject: Rest of the 'easy' stuff This gets us up to feature parity with Jacob's work. I want to eliminate that last allocation using the comptime hash map, and then see about eliminating allocations from case comparisons as well. That should just about do it. --- src/GeneralCategories.zig | 102 ++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 72 deletions(-) (limited to 'src/GeneralCategories.zig') diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig index eee7e56..9a383bf 100644 --- a/src/GeneralCategories.zig +++ b/src/GeneralCategories.zig @@ -1,8 +1,19 @@ //! General Categories -s1: []u16 = undefined, -s2: []u5 = undefined, -s3: []u5 = undefined, +const Data = struct { + s1: []const u16 = undefined, + s2: []const u5 = undefined, + s3: []const u5 = undefined, +}; + +const general_categories = general_categories: { + const data = @import("gencat"); + break :general_categories Data{ + .s1 = &data.s1, + .s2 = &data.s2, + .s3 = &data.s3, + }; +}; /// General Category pub const Gc = enum { @@ -38,51 +49,14 @@ pub const Gc = enum { Zs, // Separator, Space }; -const GeneralCategories = @This(); - -pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories { - var gencat = GeneralCategories{}; - try gencat.setup(allocator); - return gencat; -} - -pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void { - const in_bytes = @embedFile("gencat"); - var in_fbs = std.io.fixedBufferStream(in_bytes); - var reader = in_fbs.reader(); - - const endian = builtin.cpu.arch.endian(); - - const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; - gencat.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(gencat.s1); - for (0..s1_len) |i| gencat.s1[i] = reader.readInt(u16, endian) catch unreachable; - - const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; - gencat.s2 = try allocator.alloc(u5, s2_len); - errdefer allocator.free(gencat.s2); - for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable); - - const s3_len: u16 = reader.readInt(u8, endian) catch unreachable; - gencat.s3 = try allocator.alloc(u5, s3_len); - errdefer allocator.free(gencat.s3); - for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable); -} - -pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void { - allocator.free(gencat.s1); - allocator.free(gencat.s2); - allocator.free(gencat.s3); -} - /// Lookup the General Category for `cp`. -pub fn gc(gencat: GeneralCategories, cp: u21) Gc { - return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); +pub fn gc(cp: u21) Gc { + return @enumFromInt(general_categories.s3[general_categories.s2[general_categories.s1[cp >> 8] + (cp & 0xff)]]); } /// True if `cp` has an C general category. -pub fn isControl(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isControl(cp: u21) bool { + return switch (gc(cp)) { .Cc, .Cf, .Cn, @@ -94,8 +68,8 @@ pub fn isControl(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an L general category. -pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isLetter(cp: u21) bool { + return switch (gc(cp)) { .Ll, .Lm, .Lo, @@ -107,8 +81,8 @@ pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an M general category. -pub fn isMark(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isMark(cp: u21) bool { + return switch (gc(cp)) { .Mc, .Me, .Mn, @@ -118,8 +92,8 @@ pub fn isMark(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an N general category. -pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isNumber(cp: u21) bool { + return switch (gc(cp)) { .Nd, .Nl, .No, @@ -129,8 +103,8 @@ pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an P general category. -pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isPunctuation(cp: u21) bool { + return switch (gc(cp)) { .Pc, .Pd, .Pe, @@ -144,8 +118,8 @@ pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an S general category. -pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isSymbol(cp: u21) bool { + return switch (gc(cp)) { .Sc, .Sk, .Sm, @@ -156,8 +130,8 @@ pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { } /// True if `cp` has an Z general category. -pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { - return switch (gencat.gc(cp)) { +pub fn isSeparator(cp: u21) bool { + return switch (gc(cp)) { .Zl, .Zp, .Zs, @@ -165,19 +139,3 @@ pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { else => false, }; } - -fn testAllocator(allocator: Allocator) !void { - var gen_cat = try GeneralCategories.init(allocator); - gen_cat.deinit(allocator); -} - -test "Allocation failure" { - try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); -} - -const std = @import("std"); -const builtin = @import("builtin"); -const compress = std.compress; -const mem = std.mem; -const testing = std.testing; -const Allocator = mem.Allocator; -- cgit v1.2.3