summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jose Colon Rodriguez 2024-03-26 07:45:32 -0400
committer Jose Colon Rodriguez 2024-03-26 07:45:32 -0400
commit 4c9b673c7f47d8a2090499f8e5c222312b284725 (patch)
tree 5246f97dd39d3cea7ca309b9d91728e3e644ed6f
parent CaseData (diff)
download zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.gz
zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.xz
zg-4c9b673c7f47d8a2090499f8e5c222312b284725.zip
Removed title case processing
-rw-r--r-- build.zig 12
-rw-r--r-- codegen/title.zig 58
-rw-r--r-- src/CaseData.zig 50
3 files changed, 15 insertions, 105 deletions
diff --git a/build.zig b/build.zig
index 7272336..496e210 100644
--- a/build.zig
+++ b/build.zig
@@ -137,16 +137,6 @@ pub fn build(b: *std.Build) void {
137 const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe); 137 const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe);
138 const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z"); 138 const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z");
139 139
140 // Titlecase mappings
141 const title_gen_exe = b.addExecutable(.{
142 .name = "title",
143 .root_source_file = .{ .path = "codegen/title.zig" },
144 .target = b.host,
145 .optimize = .Debug,
146 });
147 const run_title_gen_exe = b.addRunArtifact(title_gen_exe);
148 const title_gen_out = run_title_gen_exe.addOutputFileArg("title.bin.z");
149
150 // Modules we provide 140 // Modules we provide
151 // Code points 141 // Code points
152 const code_point = b.addModule("code_point", .{ 142 const code_point = b.addModule("code_point", .{
@@ -296,7 +286,6 @@ pub fn build(b: *std.Build) void {
296 case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); 286 case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
297 case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); 287 case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
298 case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); 288 case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
299 case_data.addAnonymousImport("title", .{ .root_source_file = title_gen_out });
300 289
301 // Benchmark rig 290 // Benchmark rig
302 const exe = b.addExecutable(.{ 291 const exe = b.addExecutable(.{
@@ -344,7 +333,6 @@ pub fn build(b: *std.Build) void {
344 exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); 333 exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out });
345 exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); 334 exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out });
346 exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); 335 exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out });
347 exe_unit_tests.root_module.addAnonymousImport("title", .{ .root_source_file = title_gen_out });
348 // exe_unit_tests.filter = "nfd !ASCII"; 336 // exe_unit_tests.filter = "nfd !ASCII";
349 337
350 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); 338 const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
diff --git a/codegen/title.zig b/codegen/title.zig
deleted file mode 100644
index 653b812..0000000
--- a/codegen/title.zig
+++ /dev/null
@@ -1,58 +0,0 @@
1const std = @import("std");
2const builtin = @import("builtin");
3
4pub fn main() !void {
5 var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
6 defer arena.deinit();
7 const allocator = arena.allocator();
8
9 // Process UnicodeData.txt
10 var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{});
11 defer in_file.close();
12 var in_buf = std.io.bufferedReader(in_file.reader());
13 const in_reader = in_buf.reader();
14
15 var args_iter = try std.process.argsWithAllocator(allocator);
16 defer args_iter.deinit();
17 _ = args_iter.skip();
18 const output_path = args_iter.next() orelse @panic("No output file arg!");
19
20 const compressor = std.compress.deflate.compressor;
21 var out_file = try std.fs.cwd().createFile(output_path, .{});
22 defer out_file.close();
23 var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression });
24 defer out_comp.deinit();
25 const writer = out_comp.writer();
26
27 const endian = builtin.cpu.arch.endian();
28 var line_buf: [4096]u8 = undefined;
29
30 lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
31 if (line.len == 0) continue;
32
33 var field_iter = std.mem.splitScalar(u8, line, ';');
34 var cps: [2]u24 = undefined;
35
36 var i: usize = 0;
37 while (field_iter.next()) |field| : (i += 1) {
38 switch (i) {
39 0 => cps[0] = try std.fmt.parseInt(u24, field, 16),
40
41 14 => {
42 // Simple titlecase mapping
43 if (field.len == 0) continue :lines;
44 cps[1] = try std.fmt.parseInt(u24, field, 16);
45 },
46
47 2 => if (line[0] == '<') continue :lines,
48
49 else => {},
50 }
51 }
52
53 for (&cps) |cp| try writer.writeInt(u24, cp, endian);
54 }
55
56 try writer.writeInt(u24, 0, endian);
57 try out_comp.flush();
58}
diff --git a/src/CaseData.zig b/src/CaseData.zig
index 38830e3..d790e8c 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
@@ -8,7 +8,7 @@ const unicode = std.unicode;
8const CodePointIterator = @import("code_point").Iterator; 8const CodePointIterator = @import("code_point").Iterator;
9 9
10allocator: mem.Allocator, 10allocator: mem.Allocator,
11case_map: [][3]u21, 11case_map: [][2]u21,
12prop_s1: []u16 = undefined, 12prop_s1: []u16 = undefined,
13prop_s2: []u8 = undefined, 13prop_s2: []u8 = undefined,
14 14
@@ -20,13 +20,13 @@ pub fn init(allocator: mem.Allocator) !Self {
20 20
21 var self = Self{ 21 var self = Self{
22 .allocator = allocator, 22 .allocator = allocator,
23 .case_map = try allocator.alloc([3]u21, 0x110000), 23 .case_map = try allocator.alloc([2]u21, 0x110000),
24 }; 24 };
25 errdefer allocator.free(self.case_map); 25 errdefer allocator.free(self.case_map);
26 26
27 for (0..0x110000) |i| { 27 for (0..0x110000) |i| {
28 const cp: u21 = @intCast(i); 28 const cp: u21 = @intCast(i);
29 self.case_map[cp] = .{ cp, cp, cp }; 29 self.case_map[cp] = .{ cp, cp };
30 } 30 }
31 31
32 // Uppercase 32 // Uppercase
@@ -55,19 +55,6 @@ pub fn init(allocator: mem.Allocator) !Self {
55 self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian)); 55 self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian));
56 } 56 }
57 57
58 // Titlercase
59 const title_bytes = @embedFile("title");
60 var title_fbs = std.io.fixedBufferStream(title_bytes);
61 var title_decomp = try decompressor(allocator, title_fbs.reader(), null);
62 defer title_decomp.deinit();
63 var title_reader = title_decomp.reader();
64
65 while (true) {
66 const cp = try title_reader.readInt(u24, endian);
67 if (cp == 0) break;
68 self.case_map[cp][2] = @intCast(try title_reader.readInt(u24, endian));
69 }
70
71 // Case properties 58 // Case properties
72 const cp_bytes = @embedFile("case_prop"); 59 const cp_bytes = @embedFile("case_prop");
73 var cp_fbs = std.io.fixedBufferStream(cp_bytes); 60 var cp_fbs = std.io.fixedBufferStream(cp_bytes);
@@ -101,7 +88,6 @@ pub inline fn isCased(self: Self, cp: u21) bool {
101 88
102// Returns true if `cp` is uppercase. 89// Returns true if `cp` is uppercase.
103pub fn isUpper(self: Self, cp: u21) bool { 90pub fn isUpper(self: Self, cp: u21) bool {
104 if (!self.isCased(cp)) return true;
105 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; 91 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
106} 92}
107 93
@@ -110,7 +96,7 @@ pub fn isUpperStr(self: Self, str: []const u8) bool {
110 var iter = CodePointIterator{ .bytes = str }; 96 var iter = CodePointIterator{ .bytes = str };
111 97
112 return while (iter.next()) |cp| { 98 return while (iter.next()) |cp| {
113 if (!self.isUpper(cp.code)) break false; 99 if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false;
114 } else true; 100 } else true;
115} 101}
116 102
@@ -123,6 +109,11 @@ test "isUpperStr" {
123 try testing.expect(!cd.isUpperStr("Hello, World 2112!")); 109 try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
124} 110}
125 111
112/// Returns uppercase mapping for `cp`.
113pub inline fn toUpper(self: Self, cp: u21) u21 {
114 return self.case_map[cp][0];
115}
116
126/// Returns a new string with all letters in uppercase. 117/// Returns a new string with all letters in uppercase.
127/// Caller must free returned bytes with `allocator`. 118/// Caller must free returned bytes with `allocator`.
128pub fn toUpperStr( 119pub fn toUpperStr(
@@ -153,28 +144,17 @@ test "toUpperStr" {
153 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); 144 try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
154} 145}
155 146
156/// Returns uppercase mapping for `cp`.
157pub inline fn toUpper(self: Self, cp: u21) u21 {
158 return self.case_map[cp][0];
159}
160
161// Returns true if `cp` is lowercase. 147// Returns true if `cp` is lowercase.
162pub fn isLower(self: Self, cp: u21) bool { 148pub fn isLower(self: Self, cp: u21) bool {
163 if (!self.isCased(cp)) return true;
164 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; 149 return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
165} 150}
166 151
167/// Returns lowercase mapping for `cp`.
168pub inline fn toLower(self: Self, cp: u21) u21 {
169 return self.case_map[cp][1];
170}
171
172/// Returns true if `str` is all lowercase. 152/// Returns true if `str` is all lowercase.
173pub fn isLowerStr(self: Self, str: []const u8) bool { 153pub fn isLowerStr(self: Self, str: []const u8) bool {
174 var iter = CodePointIterator{ .bytes = str }; 154 var iter = CodePointIterator{ .bytes = str };
175 155
176 return while (iter.next()) |cp| { 156 return while (iter.next()) |cp| {
177 if (!self.isLower(cp.code)) break false; 157 if (self.isCased(cp.code) and !self.isLower(cp.code)) break false;
178 } else true; 158 } else true;
179} 159}
180 160
@@ -187,6 +167,11 @@ test "isLowerStr" {
187 try testing.expect(!cd.isLowerStr("Hello, World 2112!")); 167 try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
188} 168}
189 169
170/// Returns lowercase mapping for `cp`.
171pub inline fn toLower(self: Self, cp: u21) u21 {
172 return self.case_map[cp][1];
173}
174
190/// Returns a new string with all letters in lowercase. 175/// Returns a new string with all letters in lowercase.
191/// Caller must free returned bytes with `allocator`. 176/// Caller must free returned bytes with `allocator`.
192pub fn toLowerStr( 177pub fn toLowerStr(
@@ -216,8 +201,3 @@ test "toLowerStr" {
216 defer testing.allocator.free(lowered); 201 defer testing.allocator.free(lowered);
217 try testing.expectEqualStrings("hello, world 2112!", lowered); 202 try testing.expectEqualStrings("hello, world 2112!", lowered);
218} 203}
219
220/// Returns titlecase mapping for `cp`.
221pub inline fn toTitle(self: Self, cp: u21) u21 {
222 return self.case_map[cp][2];
223}