Removed title case processing

author: Jose Colon Rodriguez 2024-03-26 07:45:32 -0400
committer: Jose Colon Rodriguez 2024-03-26 07:45:32 -0400
commit: 4c9b673c7f47d8a2090499f8e5c222312b284725 (patch)
tree: 5246f97dd39d3cea7ca309b9d91728e3e644ed6f
parent: CaseData (diff)
download: zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.gz
zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.xz
zg-4c9b673c7f47d8a2090499f8e5c222312b284725.zip
3 files changed, 15 insertions, 105 deletions
diff --git a/build.zig b/build.zig
index 7272336..496e210 100644
--- a/build.zig
+++ b/build.zig
@@ -137,16 +137,6 @@ pub fn build(b: *std.Build) void {
    const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe);
    const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z");
-    // Titlecase mappings
-    const title_gen_exe = b.addExecutable(.{
-        .name = "title",
-        .root_source_file = .{ .path = "codegen/title.zig" },
-        .target = b.host,
-        .optimize = .Debug,
-    });
-    const run_title_gen_exe = b.addRunArtifact(title_gen_exe);
-    const title_gen_out = run_title_gen_exe.addOutputFileArg("title.bin.z");
    // Modules we provide
    // Code points
    const code_point = b.addModule("code_point", .{
@@ -296,7 +286,6 @@ pub fn build(b: *std.Build) void {
    case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
    case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
    case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
-    case_data.addAnonymousImport("title", .{ .root_source_file = title_gen_out });
    // Benchmark rig
    const exe = b.addExecutable(.{
@@ -344,7 +333,6 @@ pub fn build(b: *std.Build) void {
    exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
    exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
    exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
-    exe_unit_tests.root_module.addAnonymousImport("title", .{ .root_source_file = title_gen_out });
    // exe_unit_tests.filter = "nfd !ASCII";
    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
diff --git a/codegen/title.zig b/codegen/title.zig
deleted file mode 100644
index 653b812..0000000
--- a/codegen/title.zig
+++ /dev/null
@@ -1,58 +0,0 @@
-const std = @import("std");
-const builtin = @import("builtin");
-pub fn main() !void {
-    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
-    defer arena.deinit();
-    const allocator = arena.allocator();
-    // Process UnicodeData.txt
-    var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{});
-    defer in_file.close();
-    var in_buf = std.io.bufferedReader(in_file.reader());
-    const in_reader = in_buf.reader();
-    var args_iter = try std.process.argsWithAllocator(allocator);
-    defer args_iter.deinit();
-    _ = args_iter.skip();
-    const output_path = args_iter.next() orelse @panic("No output file arg!");
-    const compressor = std.compress.deflate.compressor;
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
-    defer out_comp.deinit();
-    const writer = out_comp.writer();
-    const endian = builtin.cpu.arch.endian();
-    var line_buf: [4096]u8 = undefined;
-    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
-        if (line.len == 0) continue;
-        var field_iter = std.mem.splitScalar(u8, line, ';');
-        var cps: [2]u24 = undefined;
-        var i: usize = 0;
-        while (field_iter.next()) |field| : (i += 1) {
-            switch (i) {
-                0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
-                14 => {
-                    // Simple titlecase mapping
-                    if (field.len == 0) continue :lines;
-                    cps[1] = try std.fmt.parseInt(u24, field, 16);
-                },
-                2 => if (line[0] == '<') continue :lines,
-                else => {},
-            }
-        }
-        for (&cps) |cp| try writer.writeInt(u24, cp, endian);
-    }
-    try writer.writeInt(u24, 0, endian);
-    try out_comp.flush();
-}
diff --git a/src/CaseData.zig b/src/CaseData.zig
index 38830e3..d790e8c 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
@@ -8,7 +8,7 @@ const unicode = std.unicode;
 const CodePointIterator = @import("code_point").Iterator;
 allocator: mem.Allocator,
-case_map: [][3]u21,
+case_map: [][2]u21,
 prop_s1: []u16 = undefined,
 prop_s2: []u8 = undefined,
@@ -20,13 +20,13 @@ pub fn init(allocator: mem.Allocator) !Self {
    var self = Self{
        .allocator = allocator,
-        .case_map = try allocator.alloc([3]u21, 0x110000),
+        .case_map = try allocator.alloc([2]u21, 0x110000),
    };
    errdefer allocator.free(self.case_map);
    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
-        self.case_map[cp] = .{ cp, cp, cp };
+        self.case_map[cp] = .{ cp, cp };
    }
    // Uppercase
@@ -55,19 +55,6 @@ pub fn init(allocator: mem.Allocator) !Self {
        self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian));
    }
-    // Titlercase
-    const title_bytes = @embedFile("title");
-    var title_fbs = std.io.fixedBufferStream(title_bytes);
-    var title_decomp = try decompressor(allocator, title_fbs.reader(), null);
-    defer title_decomp.deinit();
-    var title_reader = title_decomp.reader();
-    while (true) {
-        const cp = try title_reader.readInt(u24, endian);
-        if (cp == 0) break;
-        self.case_map[cp][2] = @intCast(try title_reader.readInt(u24, endian));
-    }
    // Case properties
    const cp_bytes = @embedFile("case_prop");
    var cp_fbs = std.io.fixedBufferStream(cp_bytes);
@@ -101,7 +88,6 @@ pub inline fn isCased(self: Self, cp: u21) bool {
 // Returns true if `cp` is uppercase.
 pub fn isUpper(self: Self, cp: u21) bool {
-    if (!self.isCased(cp)) return true;
    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
 }
@@ -110,7 +96,7 @@ pub fn isUpperStr(self: Self, str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };
    return while (iter.next()) |cp| {
-        if (!self.isUpper(cp.code)) break false;
+        if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false;
    } else true;
 }
@@ -123,6 +109,11 @@ test "isUpperStr" {
    try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
 }
+/// Returns uppercase mapping for `cp`.
+pub inline fn toUpper(self: Self, cp: u21) u21 {
+    return self.case_map[cp][0];
+}
 /// Returns a new string with all letters in uppercase.
 /// Caller must free returned bytes with `allocator`.
 pub fn toUpperStr(
@@ -153,28 +144,17 @@ test "toUpperStr" {
    try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
 }
-/// Returns uppercase mapping for `cp`.
-pub inline fn toUpper(self: Self, cp: u21) u21 {
-    return self.case_map[cp][0];
-}
 // Returns true if `cp` is lowercase.
 pub fn isLower(self: Self, cp: u21) bool {
-    if (!self.isCased(cp)) return true;
    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
 }
-/// Returns lowercase mapping for `cp`.
-pub inline fn toLower(self: Self, cp: u21) u21 {
-    return self.case_map[cp][1];
-}
 /// Returns true if `str` is all lowercase.
 pub fn isLowerStr(self: Self, str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };
    return while (iter.next()) |cp| {
-        if (!self.isLower(cp.code)) break false;
+        if (self.isCased(cp.code) and !self.isLower(cp.code)) break false;
    } else true;
 }
@@ -187,6 +167,11 @@ test "isLowerStr" {
    try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
 }
+/// Returns lowercase mapping for `cp`.
+pub inline fn toLower(self: Self, cp: u21) u21 {
+    return self.case_map[cp][1];
+}
 /// Returns a new string with all letters in lowercase.
 /// Caller must free returned bytes with `allocator`.
 pub fn toLowerStr(
@@ -216,8 +201,3 @@ test "toLowerStr" {
    defer testing.allocator.free(lowered);
    try testing.expectEqualStrings("hello, world 2112!", lowered);
 }
-/// Returns titlecase mapping for `cp`.
-pub inline fn toTitle(self: Self, cp: u21) u21 {
-    return self.case_map[cp][2];
-}
author	Jose Colon Rodriguez	2024-03-26 07:45:32 -0400
committer	Jose Colon Rodriguez	2024-03-26 07:45:32 -0400
commit	4c9b673c7f47d8a2090499f8e5c222312b284725 (patch)
tree	5246f97dd39d3cea7ca309b9d91728e3e644ed6f
parent	CaseData (diff)
download	zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.gz zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.xz zg-4c9b673c7f47d8a2090499f8e5c222312b284725.zip

diff --git a/build.zig b/build.zig
index 7272336..496e210 100644
--- a/build.zig
+++ b/build.zig
@@ -137,16 +137,6 @@ pub fn build(b: *std.Build) void {
137	const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe);	137	const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe);
138	const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z");	138	const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z");
139		139
140	// Titlecase mappings
141	const title_gen_exe = b.addExecutable(.{
142	.name = "title",
143	.root_source_file = .{ .path = "codegen/title.zig" },
144	.target = b.host,
145	.optimize = .Debug,
146	});
147	const run_title_gen_exe = b.addRunArtifact(title_gen_exe);
148	const title_gen_out = run_title_gen_exe.addOutputFileArg("title.bin.z");
149
150	// Modules we provide	140	// Modules we provide
151	// Code points	141	// Code points
152	const code_point = b.addModule("code_point", .{	142	const code_point = b.addModule("code_point", .{
@@ -296,7 +286,6 @@ pub fn build(b: *std.Build) void {
296	case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });	286	case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
297	case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });	287	case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
298	case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });	288	case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
299	case_data.addAnonymousImport("title", .{ .root_source_file = title_gen_out });
300		289
301	// Benchmark rig	290	// Benchmark rig
302	const exe = b.addExecutable(.{	291	const exe = b.addExecutable(.{
@@ -344,7 +333,6 @@ pub fn build(b: *std.Build) void {
344	exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });	333	exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
345	exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });	334	exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
346	exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });	335	exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
347	exe_unit_tests.root_module.addAnonymousImport("title", .{ .root_source_file = title_gen_out });
348	// exe_unit_tests.filter = "nfd !ASCII";	336	// exe_unit_tests.filter = "nfd !ASCII";
349		337
350	const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);	338	const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);


diff --git a/codegen/title.zig b/codegen/title.zig
deleted file mode 100644
index 653b812..0000000
--- a/codegen/title.zig
+++ /dev/null
@@ -1,58 +0,0 @@
1	const std = @import("std");
2	const builtin = @import("builtin");
3
4	pub fn main() !void {
5	var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
6	defer arena.deinit();
7	const allocator = arena.allocator();
8
9	// Process UnicodeData.txt
10	var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{});
11	defer in_file.close();
12	var in_buf = std.io.bufferedReader(in_file.reader());
13	const in_reader = in_buf.reader();
14
15	var args_iter = try std.process.argsWithAllocator(allocator);
16	defer args_iter.deinit();
17	_ = args_iter.skip();
18	const output_path = args_iter.next() orelse @panic("No output file arg!");
19
20	const compressor = std.compress.deflate.compressor;
21	var out_file = try std.fs.cwd().createFile(output_path, .{});
22	defer out_file.close();
23	var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
24	defer out_comp.deinit();
25	const writer = out_comp.writer();
26
27	const endian = builtin.cpu.arch.endian();
28	var line_buf: [4096]u8 = undefined;
29
30	lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
31	if (line.len == 0) continue;
32
33	var field_iter = std.mem.splitScalar(u8, line, ';');
34	var cps: [2]u24 = undefined;
35
36	var i: usize = 0;
37	while (field_iter.next()) |field| : (i += 1) {
38	switch (i) {
39	0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
40
41	14 => {
42	// Simple titlecase mapping
43	if (field.len == 0) continue :lines;
44	cps[1] = try std.fmt.parseInt(u24, field, 16);
45	},
46
47	2 => if (line[0] == '<') continue :lines,
48
49	else => {},
50	}
51	}
52
53	for (&cps) |cp| try writer.writeInt(u24, cp, endian);
54	}
55
56	try writer.writeInt(u24, 0, endian);
57	try out_comp.flush();
58	}


diff --git a/src/CaseData.zig b/src/CaseData.zig
index 38830e3..d790e8c 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
@@ -8,7 +8,7 @@ const unicode = std.unicode;
8	const CodePointIterator = @import("code_point").Iterator;	8	const CodePointIterator = @import("code_point").Iterator;
9		9
10	allocator: mem.Allocator,	10	allocator: mem.Allocator,
11	case_map: [][3]u21,	11	case_map: [][2]u21,
12	prop_s1: []u16 = undefined,	12	prop_s1: []u16 = undefined,
13	prop_s2: []u8 = undefined,	13	prop_s2: []u8 = undefined,
14		14
@@ -20,13 +20,13 @@ pub fn init(allocator: mem.Allocator) !Self {
20		20
21	var self = Self{	21	var self = Self{
22	.allocator = allocator,	22	.allocator = allocator,
23	.case_map = try allocator.alloc([3]u21, 0x110000),	23	.case_map = try allocator.alloc([2]u21, 0x110000),
24	};	24	};
25	errdefer allocator.free(self.case_map);	25	errdefer allocator.free(self.case_map);
26		26
27	for (0..0x110000) |i| {	27	for (0..0x110000) |i| {
28	const cp: u21 = @intCast(i);	28	const cp: u21 = @intCast(i);
29	self.case_map[cp] = .{ cp, cp, cp };	29	self.case_map[cp] = .{ cp, cp };
30	}	30	}
31		31
32	// Uppercase	32	// Uppercase
@@ -55,19 +55,6 @@ pub fn init(allocator: mem.Allocator) !Self {
55	self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian));	55	self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian));
56	}	56	}
57		57
58	// Titlercase
59	const title_bytes = @embedFile("title");
60	var title_fbs = std.io.fixedBufferStream(title_bytes);
61	var title_decomp = try decompressor(allocator, title_fbs.reader(), null);
62	defer title_decomp.deinit();
63	var title_reader = title_decomp.reader();
64
65	while (true) {
66	const cp = try title_reader.readInt(u24, endian);
67	if (cp == 0) break;
68	self.case_map[cp][2] = @intCast(try title_reader.readInt(u24, endian));
69	}
70
71	// Case properties	58	// Case properties
72	const cp_bytes = @embedFile("case_prop");	59	const cp_bytes = @embedFile("case_prop");
73	var cp_fbs = std.io.fixedBufferStream(cp_bytes);	60	var cp_fbs = std.io.fixedBufferStream(cp_bytes);
@@ -101,7 +88,6 @@ pub inline fn isCased(self: Self, cp: u21) bool {
101		88
102	// Returns true if `cp` is uppercase.	89	// Returns true if `cp` is uppercase.
103	pub fn isUpper(self: Self, cp: u21) bool {	90	pub fn isUpper(self: Self, cp: u21) bool {
104	if (!self.isCased(cp)) return true;
105	return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;	91	return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
106	}	92	}
107		93
@@ -110,7 +96,7 @@ pub fn isUpperStr(self: Self, str: []const u8) bool {
110	var iter = CodePointIterator{ .bytes = str };	96	var iter = CodePointIterator{ .bytes = str };
111		97
112	return while (iter.next()) |cp| {	98	return while (iter.next()) |cp| {
113	if (!self.isUpper(cp.code)) break false;	99	if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false;
114	} else true;	100	} else true;
115	}	101	}
116		102
@@ -123,6 +109,11 @@ test "isUpperStr" {
123	try testing.expect(!cd.isUpperStr("Hello, World 2112!"));	109	try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
124	}	110	}
125		111
		112	/// Returns uppercase mapping for `cp`.
		113	pub inline fn toUpper(self: Self, cp: u21) u21 {
		114	return self.case_map[cp][0];
		115	}
		116
126	/// Returns a new string with all letters in uppercase.	117	/// Returns a new string with all letters in uppercase.
127	/// Caller must free returned bytes with `allocator`.	118	/// Caller must free returned bytes with `allocator`.
128	pub fn toUpperStr(	119	pub fn toUpperStr(
@@ -153,28 +144,17 @@ test "toUpperStr" {
153	try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);	144	try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
154	}	145	}
155		146
156	/// Returns uppercase mapping for `cp`.
157	pub inline fn toUpper(self: Self, cp: u21) u21 {
158	return self.case_map[cp][0];
159	}
160
161	// Returns true if `cp` is lowercase.	147	// Returns true if `cp` is lowercase.
162	pub fn isLower(self: Self, cp: u21) bool {	148	pub fn isLower(self: Self, cp: u21) bool {
163	if (!self.isCased(cp)) return true;
164	return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;	149	return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
165	}	150	}
166		151
167	/// Returns lowercase mapping for `cp`.
168	pub inline fn toLower(self: Self, cp: u21) u21 {
169	return self.case_map[cp][1];
170	}
171
172	/// Returns true if `str` is all lowercase.	152	/// Returns true if `str` is all lowercase.
173	pub fn isLowerStr(self: Self, str: []const u8) bool {	153	pub fn isLowerStr(self: Self, str: []const u8) bool {
174	var iter = CodePointIterator{ .bytes = str };	154	var iter = CodePointIterator{ .bytes = str };
175		155
176	return while (iter.next()) |cp| {	156	return while (iter.next()) |cp| {
177	if (!self.isLower(cp.code)) break false;	157	if (self.isCased(cp.code) and !self.isLower(cp.code)) break false;
178	} else true;	158	} else true;
179	}	159	}
180		160
@@ -187,6 +167,11 @@ test "isLowerStr" {
187	try testing.expect(!cd.isLowerStr("Hello, World 2112!"));	167	try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
188	}	168	}
189		169
		170	/// Returns lowercase mapping for `cp`.
		171	pub inline fn toLower(self: Self, cp: u21) u21 {
		172	return self.case_map[cp][1];
		173	}
		174
190	/// Returns a new string with all letters in lowercase.	175	/// Returns a new string with all letters in lowercase.
191	/// Caller must free returned bytes with `allocator`.	176	/// Caller must free returned bytes with `allocator`.
192	pub fn toLowerStr(	177	pub fn toLowerStr(
@@ -216,8 +201,3 @@ test "toLowerStr" {
216	defer testing.allocator.free(lowered);	201	defer testing.allocator.free(lowered);
217	try testing.expectEqualStrings("hello, world 2112!", lowered);	202	try testing.expectEqualStrings("hello, world 2112!", lowered);
218	}	203	}
219
220	/// Returns titlecase mapping for `cp`.
221	pub inline fn toTitle(self: Self, cp: u21) u21 {
222	return self.case_map[cp][2];
223	}