From e476250ea9326b2550847b301c265115ff375a31 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Wed, 4 Feb 2026 18:36:18 -0500
Subject: Rest of the 'easy' stuff

This gets us up to feature parity with Jacob's work.  I want to
eliminate that last allocation using the comptime hash map, and then
see about eliminating allocations from case comparisons as well.

That should just about do it.
---
 src/LetterCasing.zig | 179 +++++++++++++++------------------------------------
 1 file changed, 51 insertions(+), 128 deletions(-)

(limited to 'src/LetterCasing.zig')

diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig
index 33096fc..24b67a0 100644
--- a/src/LetterCasing.zig
+++ b/src/LetterCasing.zig
@@ -1,120 +1,58 @@
 const CodePointIterator = @import("code_point").Iterator;
-
-case_map: [][2]u21 = undefined,
-prop_s1: []u16 = undefined,
-prop_s2: []u8 = undefined,
-
-const LetterCasing = @This();
-
-pub fn init(allocator: Allocator) Allocator.Error!LetterCasing {
-    var case = LetterCasing{};
-    try case.setup(allocator);
-    return case;
-}
-
-pub fn setup(case: *LetterCasing, allocator: Allocator) Allocator.Error!void {
-    case.setupInner(allocator) catch |err| {
-        switch (err) {
-            error.OutOfMemory => |e| return e,
-            else => unreachable,
-        }
+const GeneralCategories = @import("GeneralCategories");
+
+const Data = struct {
+    s1: []const u16 = undefined,
+    s2: []const u44 = undefined,
+};
+
+const letter_casing = letter_casing: {
+    const data = @import("case");
+    break :letter_casing Data{
+        .s1 = &data.s1,
+        .s2 = &data.s2,
     };
-}
-
-inline fn setupInner(self: *LetterCasing, allocator: mem.Allocator) !void {
-    const endian = builtin.cpu.arch.endian();
-
-    self.case_map = try allocator.alloc([2]u21, 0x110000);
-    errdefer allocator.free(self.case_map);
-
-    for (0..0x110000) |i| {
-        const cp: u21 = @intCast(i);
-        self.case_map[cp] = .{ cp, cp };
-    }
-
-    // Uppercase
-    const upper_bytes = @embedFile("upper");
-    var upper_fbs = std.io.fixedBufferStream(upper_bytes);
-    var upper_reader = upper_fbs.reader();
-
-    while (true) {
-        const cp = try upper_reader.readInt(i24, endian);
-        if (cp == 0) break;
-        const diff = try upper_reader.readInt(i24, endian);
-        self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
-    }
-
-    // Lowercase
-    const lower_bytes = @embedFile("lower");
-    var lower_fbs = std.io.fixedBufferStream(lower_bytes);
-    var lower_reader = lower_fbs.reader();
-
-    while (true) {
-        const cp = try lower_reader.readInt(i24, endian);
-        if (cp == 0) break;
-        const diff = try lower_reader.readInt(i24, endian);
-        self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
-    }
-
-    // Case properties
-    const cp_bytes = @embedFile("case_prop");
-    var cp_fbs = std.io.fixedBufferStream(cp_bytes);
-    var cp_reader = cp_fbs.reader();
-
-    const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
-    self.prop_s1 = try allocator.alloc(u16, stage_1_len);
-    errdefer allocator.free(self.prop_s1);
-    for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);
-
-    const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
-    self.prop_s2 = try allocator.alloc(u8, stage_2_len);
-    errdefer allocator.free(self.prop_s2);
-    _ = try cp_reader.readAll(self.prop_s2);
-}
-
-pub fn deinit(self: *const LetterCasing, allocator: mem.Allocator) void {
-    allocator.free(self.case_map);
-    allocator.free(self.prop_s1);
-    allocator.free(self.prop_s2);
-}
+};
 
 // Returns true if `cp` is either upper, lower, or title case.
-pub fn isCased(self: LetterCasing, cp: u21) bool {
-    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
+pub fn isCased(cp: u21) bool {
+    return isUpper(cp) or isLower(cp) or GeneralCategories.gc(cp) == .Lt;
 }
 
 // Returns true if `cp` is uppercase.
-pub fn isUpper(self: LetterCasing, cp: u21) bool {
-    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
+pub fn isUpper(cp: u21) bool {
+    // isUpper is true if we have a mapping to a lower character (lowest bit, mask 0b01)
+    return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
 }
 
-/// Returns true if `str` is all uppercase.
-pub fn isUpperStr(self: LetterCasing, str: []const u8) bool {
+/// Returns true if `str` contains no lowercase code points.
+pub fn isUpperStr(str: []const u8) bool {
     var iter = CodePointIterator{ .bytes = str };
 
     return while (iter.next()) |cp| {
-        if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false;
+        if (isLower(cp.code)) break false;
     } else true;
 }
 
 test "isUpperStr" {
-    const cd = try init(testing.allocator);
-    defer cd.deinit(testing.allocator);
-
-    try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
-    try testing.expect(!cd.isUpperStr("hello, world 2112!"));
-    try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
+    try testing.expect(isUpperStr("HELLO, WORLD 2112!"));
+    try testing.expect(!isUpperStr("hello, world 2112!"));
+    try testing.expect(!isUpperStr("Hello, World 2112!"));
 }
 
 /// Returns uppercase mapping for `cp`.
-pub fn toUpper(self: LetterCasing, cp: u21) u21 {
-    return self.case_map[cp][0];
+pub fn toUpper(cp: u21) u21 {
+    const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
+    if (case_prop & 2 == 2) {
+        return @intCast(case_prop >> (21 + 2));
+    } else {
+        return cp;
+    }
 }
 
 /// Returns a new string with all letters in uppercase.
 /// Caller must free returned bytes with `allocator`.
 pub fn toUpperStr(
-    self: LetterCasing,
     allocator: mem.Allocator,
     str: []const u8,
 ) ![]u8 {
@@ -125,7 +63,7 @@ pub fn toUpperStr(
     var buf: [4]u8 = undefined;
 
     while (iter.next()) |cp| {
-        const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf);
+        const len = try unicode.utf8Encode(toUpper(cp.code), &buf);
         try bytes.appendSlice(buf[0..len]);
     }
 
@@ -133,46 +71,45 @@ pub fn toUpperStr(
 }
 
 test "toUpperStr" {
-    const cd = try init(testing.allocator);
-    defer cd.deinit(testing.allocator);
-
-    const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
+    const uppered = try toUpperStr(testing.allocator, "Hello, World 2112!");
     defer testing.allocator.free(uppered);
     try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
 }
 
 // Returns true if `cp` is lowercase.
-pub fn isLower(self: LetterCasing, cp: u21) bool {
-    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
+pub fn isLower(cp: u21) bool {
+    // isLower is true if we have a mapping to an upper character (second-lowest bit, mask 0b10)
+    return letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
 }
 
-/// Returns true if `str` is all lowercase.
-pub fn isLowerStr(self: LetterCasing, str: []const u8) bool {
+/// Returns true if `str` contains no uppercase code points.
+pub fn isLowerStr(str: []const u8) bool {
     var iter = CodePointIterator{ .bytes = str };
 
     return while (iter.next()) |cp| {
-        if (self.isCased(cp.code) and !self.isLower(cp.code)) break false;
+        if (isUpper(cp.code)) break false;
     } else true;
 }
 
 test "isLowerStr" {
-    const cd = try init(testing.allocator);
-    defer cd.deinit(testing.allocator);
-
-    try testing.expect(cd.isLowerStr("hello, world 2112!"));
-    try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!"));
-    try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
+    try testing.expect(isLowerStr("hello, world 2112!"));
+    try testing.expect(!isLowerStr("HELLO, WORLD 2112!"));
+    try testing.expect(!isLowerStr("Hello, World 2112!"));
 }
 
 /// Returns lowercase mapping for `cp`.
-pub fn toLower(self: LetterCasing, cp: u21) u21 {
-    return self.case_map[cp][1];
+pub fn toLower(cp: u21) u21 {
+    const case_prop = letter_casing.s2[letter_casing.s1[cp >> 8] + (cp & 0xff)];
+    if (case_prop & 1 == 1) {
+        return @intCast((case_prop >> 2) & 0x1FFFFF);
+    } else {
+        return cp;
+    }
 }
 
 /// Returns a new string with all letters in lowercase.
 /// Caller must free returned bytes with `allocator`.
 pub fn toLowerStr(
-    self: LetterCasing,
     allocator: mem.Allocator,
     str: []const u8,
 ) ![]u8 {
@@ -183,7 +120,7 @@ pub fn toLowerStr(
     var buf: [4]u8 = undefined;
 
     while (iter.next()) |cp| {
-        const len = try unicode.utf8Encode(self.toLower(cp.code), &buf);
+        const len = try unicode.utf8Encode(toLower(cp.code), &buf);
         try bytes.appendSlice(buf[0..len]);
     }
 
@@ -191,27 +128,13 @@ pub fn toLowerStr(
 }
 
 test "toLowerStr" {
-    const cd = try init(testing.allocator);
-    defer cd.deinit(testing.allocator);
-
-    const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
+    const lowered = try toLowerStr(testing.allocator, "Hello, World 2112!");
     defer testing.allocator.free(lowered);
     try testing.expectEqualStrings("hello, world 2112!", lowered);
 }
 
-fn testAllocator(allocator: Allocator) !void {
-    var prop = try LetterCasing.init(allocator);
-    prop.deinit(allocator);
-}
-
-test "Allocation failure" {
-    try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{});
-}
-
 const std = @import("std");
 const builtin = @import("builtin");
-const compress = std.compress;
 const mem = std.mem;
-const Allocator = std.mem.Allocator;
 const testing = std.testing;
 const unicode = std.unicode;
-- 
cgit v1.2.3