From 4c9b673c7f47d8a2090499f8e5c222312b284725 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Tue, 26 Mar 2024 07:45:32 -0400 Subject: Removed title case processing --- build.zig | 12 ------------ codegen/title.zig | 58 ------------------------------------------------------- src/CaseData.zig | 50 ++++++++++++++--------------------------------- 3 files changed, 15 insertions(+), 105 deletions(-) delete mode 100644 codegen/title.zig diff --git a/build.zig b/build.zig index 7272336..496e210 100644 --- a/build.zig +++ b/build.zig @@ -137,16 +137,6 @@ pub fn build(b: *std.Build) void { const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe); const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z"); - // Titlecase mappings - const title_gen_exe = b.addExecutable(.{ - .name = "title", - .root_source_file = .{ .path = "codegen/title.zig" }, - .target = b.host, - .optimize = .Debug, - }); - const run_title_gen_exe = b.addRunArtifact(title_gen_exe); - const title_gen_out = run_title_gen_exe.addOutputFileArg("title.bin.z"); - // Modules we provide // Code points const code_point = b.addModule("code_point", .{ @@ -296,7 +286,6 @@ pub fn build(b: *std.Build) void { case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); - case_data.addAnonymousImport("title", .{ .root_source_file = title_gen_out }); // Benchmark rig const exe = b.addExecutable(.{ @@ -344,7 +333,6 @@ pub fn build(b: *std.Build) void { exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); - exe_unit_tests.root_module.addAnonymousImport("title", .{ .root_source_file = title_gen_out }); // exe_unit_tests.filter = "nfd !ASCII"; const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); diff --git a/codegen/title.zig b/codegen/title.zig deleted file mode 100644 index 653b812..0000000 --- a/codegen/title.zig +++ /dev/null @@ -1,58 +0,0 @@ -const std = @import("std"); -const builtin = @import("builtin"); - -pub fn main() !void { - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const allocator = arena.allocator(); - - // Process UnicodeData.txt - var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{}); - defer in_file.close(); - var in_buf = std.io.bufferedReader(in_file.reader()); - const in_reader = in_buf.reader(); - - var args_iter = try std.process.argsWithAllocator(allocator); - defer args_iter.deinit(); - _ = args_iter.skip(); - const output_path = args_iter.next() orelse @panic("No output file arg!"); - - const compressor = std.compress.deflate.compressor; - var out_file = try std.fs.cwd().createFile(output_path, .{}); - defer out_file.close(); - var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); - defer out_comp.deinit(); - const writer = out_comp.writer(); - - const endian = builtin.cpu.arch.endian(); - var line_buf: [4096]u8 = undefined; - - lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { - if (line.len == 0) continue; - - var field_iter = std.mem.splitScalar(u8, line, ';'); - var cps: [2]u24 = undefined; - - var i: usize = 0; - while (field_iter.next()) |field| : (i += 1) { - switch (i) { - 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), - - 14 => { - // Simple titlecase mapping - if (field.len == 0) continue :lines; - cps[1] = try std.fmt.parseInt(u24, field, 16); - }, - - 2 => if (line[0] == '<') continue :lines, - - else => {}, - } - } - - for (&cps) |cp| try writer.writeInt(u24, cp, endian); - } - - try writer.writeInt(u24, 0, endian); - try out_comp.flush(); -} diff --git a/src/CaseData.zig b/src/CaseData.zig index 38830e3..d790e8c 100644 --- a/src/CaseData.zig +++ b/src/CaseData.zig @@ -8,7 +8,7 @@ const unicode = std.unicode; const CodePointIterator = @import("code_point").Iterator; allocator: mem.Allocator, -case_map: [][3]u21, +case_map: [][2]u21, prop_s1: []u16 = undefined, prop_s2: []u8 = undefined, @@ -20,13 +20,13 @@ pub fn init(allocator: mem.Allocator) !Self { var self = Self{ .allocator = allocator, - .case_map = try allocator.alloc([3]u21, 0x110000), + .case_map = try allocator.alloc([2]u21, 0x110000), }; errdefer allocator.free(self.case_map); for (0..0x110000) |i| { const cp: u21 = @intCast(i); - self.case_map[cp] = .{ cp, cp, cp }; + self.case_map[cp] = .{ cp, cp }; } // Uppercase @@ -55,19 +55,6 @@ pub fn init(allocator: mem.Allocator) !Self { self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian)); } - // Titlercase - const title_bytes = @embedFile("title"); - var title_fbs = std.io.fixedBufferStream(title_bytes); - var title_decomp = try decompressor(allocator, title_fbs.reader(), null); - defer title_decomp.deinit(); - var title_reader = title_decomp.reader(); - - while (true) { - const cp = try title_reader.readInt(u24, endian); - if (cp == 0) break; - self.case_map[cp][2] = @intCast(try title_reader.readInt(u24, endian)); - } - // Case properties const cp_bytes = @embedFile("case_prop"); var cp_fbs = std.io.fixedBufferStream(cp_bytes); @@ -101,7 +88,6 @@ pub inline fn isCased(self: Self, cp: u21) bool { // Returns true if `cp` is uppercase. pub fn isUpper(self: Self, cp: u21) bool { - if (!self.isCased(cp)) return true; return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; } @@ -110,7 +96,7 @@ pub fn isUpperStr(self: Self, str: []const u8) bool { var iter = CodePointIterator{ .bytes = str }; return while (iter.next()) |cp| { - if (!self.isUpper(cp.code)) break false; + if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; } else true; } @@ -123,6 +109,11 @@ test "isUpperStr" { try testing.expect(!cd.isUpperStr("Hello, World 2112!")); } +/// Returns uppercase mapping for `cp`. +pub inline fn toUpper(self: Self, cp: u21) u21 { + return self.case_map[cp][0]; +} + /// Returns a new string with all letters in uppercase. /// Caller must free returned bytes with `allocator`. pub fn toUpperStr( @@ -153,28 +144,17 @@ test "toUpperStr" { try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); } -/// Returns uppercase mapping for `cp`. -pub inline fn toUpper(self: Self, cp: u21) u21 { - return self.case_map[cp][0]; -} - // Returns true if `cp` is lowercase. pub fn isLower(self: Self, cp: u21) bool { - if (!self.isCased(cp)) return true; return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; } -/// Returns lowercase mapping for `cp`. -pub inline fn toLower(self: Self, cp: u21) u21 { - return self.case_map[cp][1]; -} - /// Returns true if `str` is all lowercase. pub fn isLowerStr(self: Self, str: []const u8) bool { var iter = CodePointIterator{ .bytes = str }; return while (iter.next()) |cp| { - if (!self.isLower(cp.code)) break false; + if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; } else true; } @@ -187,6 +167,11 @@ test "isLowerStr" { try testing.expect(!cd.isLowerStr("Hello, World 2112!")); } +/// Returns lowercase mapping for `cp`. +pub inline fn toLower(self: Self, cp: u21) u21 { + return self.case_map[cp][1]; +} + /// Returns a new string with all letters in lowercase. /// Caller must free returned bytes with `allocator`. pub fn toLowerStr( @@ -216,8 +201,3 @@ test "toLowerStr" { defer testing.allocator.free(lowered); try testing.expectEqualStrings("hello, world 2112!", lowered); } - -/// Returns titlecase mapping for `cp`. -pub inline fn toTitle(self: Self, cp: u21) u21 { - return self.case_map[cp][2]; -} -- cgit v1.2.3