diff options
| author | 2024-03-26 07:45:32 -0400 | |
|---|---|---|
| committer | 2024-03-26 07:45:32 -0400 | |
| commit | 4c9b673c7f47d8a2090499f8e5c222312b284725 (patch) | |
| tree | 5246f97dd39d3cea7ca309b9d91728e3e644ed6f | |
| parent | CaseData (diff) | |
| download | zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.gz zg-4c9b673c7f47d8a2090499f8e5c222312b284725.tar.xz zg-4c9b673c7f47d8a2090499f8e5c222312b284725.zip | |
Removed title case processing
| -rw-r--r-- | build.zig | 12 | ||||
| -rw-r--r-- | codegen/title.zig | 58 | ||||
| -rw-r--r-- | src/CaseData.zig | 50 |
3 files changed, 15 insertions, 105 deletions
| @@ -137,16 +137,6 @@ pub fn build(b: *std.Build) void { | |||
| 137 | const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe); | 137 | const run_lower_gen_exe = b.addRunArtifact(lower_gen_exe); |
| 138 | const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z"); | 138 | const lower_gen_out = run_lower_gen_exe.addOutputFileArg("lower.bin.z"); |
| 139 | 139 | ||
| 140 | // Titlecase mappings | ||
| 141 | const title_gen_exe = b.addExecutable(.{ | ||
| 142 | .name = "title", | ||
| 143 | .root_source_file = .{ .path = "codegen/title.zig" }, | ||
| 144 | .target = b.host, | ||
| 145 | .optimize = .Debug, | ||
| 146 | }); | ||
| 147 | const run_title_gen_exe = b.addRunArtifact(title_gen_exe); | ||
| 148 | const title_gen_out = run_title_gen_exe.addOutputFileArg("title.bin.z"); | ||
| 149 | |||
| 150 | // Modules we provide | 140 | // Modules we provide |
| 151 | // Code points | 141 | // Code points |
| 152 | const code_point = b.addModule("code_point", .{ | 142 | const code_point = b.addModule("code_point", .{ |
| @@ -296,7 +286,6 @@ pub fn build(b: *std.Build) void { | |||
| 296 | case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); | 286 | case_data.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); |
| 297 | case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); | 287 | case_data.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); |
| 298 | case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); | 288 | case_data.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); |
| 299 | case_data.addAnonymousImport("title", .{ .root_source_file = title_gen_out }); | ||
| 300 | 289 | ||
| 301 | // Benchmark rig | 290 | // Benchmark rig |
| 302 | const exe = b.addExecutable(.{ | 291 | const exe = b.addExecutable(.{ |
| @@ -344,7 +333,6 @@ pub fn build(b: *std.Build) void { | |||
| 344 | exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); | 333 | exe_unit_tests.root_module.addAnonymousImport("case_prop", .{ .root_source_file = case_prop_gen_out }); |
| 345 | exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); | 334 | exe_unit_tests.root_module.addAnonymousImport("upper", .{ .root_source_file = upper_gen_out }); |
| 346 | exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); | 335 | exe_unit_tests.root_module.addAnonymousImport("lower", .{ .root_source_file = lower_gen_out }); |
| 347 | exe_unit_tests.root_module.addAnonymousImport("title", .{ .root_source_file = title_gen_out }); | ||
| 348 | // exe_unit_tests.filter = "nfd !ASCII"; | 336 | // exe_unit_tests.filter = "nfd !ASCII"; |
| 349 | 337 | ||
| 350 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); | 338 | const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); |
diff --git a/codegen/title.zig b/codegen/title.zig
deleted file mode 100644
index 653b812..0000000
--- a/codegen/title.zig
+++ /dev/null
| @@ -1,58 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | |||
| 4 | pub fn main() !void { | ||
| 5 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); | ||
| 6 | defer arena.deinit(); | ||
| 7 | const allocator = arena.allocator(); | ||
| 8 | |||
| 9 | // Process UnicodeData.txt | ||
| 10 | var in_file = try std.fs.cwd().openFile("data/unicode/UnicodeData.txt", .{}); | ||
| 11 | defer in_file.close(); | ||
| 12 | var in_buf = std.io.bufferedReader(in_file.reader()); | ||
| 13 | const in_reader = in_buf.reader(); | ||
| 14 | |||
| 15 | var args_iter = try std.process.argsWithAllocator(allocator); | ||
| 16 | defer args_iter.deinit(); | ||
| 17 | _ = args_iter.skip(); | ||
| 18 | const output_path = args_iter.next() orelse @panic("No output file arg!"); | ||
| 19 | |||
| 20 | const compressor = std.compress.deflate.compressor; | ||
| 21 | var out_file = try std.fs.cwd().createFile(output_path, .{}); | ||
| 22 | defer out_file.close(); | ||
| 23 | var out_comp = try compressor(allocator, out_file.writer(), .{ .level = .best_compression }); | ||
| 24 | defer out_comp.deinit(); | ||
| 25 | const writer = out_comp.writer(); | ||
| 26 | |||
| 27 | const endian = builtin.cpu.arch.endian(); | ||
| 28 | var line_buf: [4096]u8 = undefined; | ||
| 29 | |||
| 30 | lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { | ||
| 31 | if (line.len == 0) continue; | ||
| 32 | |||
| 33 | var field_iter = std.mem.splitScalar(u8, line, ';'); | ||
| 34 | var cps: [2]u24 = undefined; | ||
| 35 | |||
| 36 | var i: usize = 0; | ||
| 37 | while (field_iter.next()) |field| : (i += 1) { | ||
| 38 | switch (i) { | ||
| 39 | 0 => cps[0] = try std.fmt.parseInt(u24, field, 16), | ||
| 40 | |||
| 41 | 14 => { | ||
| 42 | // Simple titlecase mapping | ||
| 43 | if (field.len == 0) continue :lines; | ||
| 44 | cps[1] = try std.fmt.parseInt(u24, field, 16); | ||
| 45 | }, | ||
| 46 | |||
| 47 | 2 => if (line[0] == '<') continue :lines, | ||
| 48 | |||
| 49 | else => {}, | ||
| 50 | } | ||
| 51 | } | ||
| 52 | |||
| 53 | for (&cps) |cp| try writer.writeInt(u24, cp, endian); | ||
| 54 | } | ||
| 55 | |||
| 56 | try writer.writeInt(u24, 0, endian); | ||
| 57 | try out_comp.flush(); | ||
| 58 | } | ||
diff --git a/src/CaseData.zig b/src/CaseData.zig
index 38830e3..d790e8c 100644
--- a/src/CaseData.zig
+++ b/src/CaseData.zig
| @@ -8,7 +8,7 @@ const unicode = std.unicode; | |||
| 8 | const CodePointIterator = @import("code_point").Iterator; | 8 | const CodePointIterator = @import("code_point").Iterator; |
| 9 | 9 | ||
| 10 | allocator: mem.Allocator, | 10 | allocator: mem.Allocator, |
| 11 | case_map: [][3]u21, | 11 | case_map: [][2]u21, |
| 12 | prop_s1: []u16 = undefined, | 12 | prop_s1: []u16 = undefined, |
| 13 | prop_s2: []u8 = undefined, | 13 | prop_s2: []u8 = undefined, |
| 14 | 14 | ||
| @@ -20,13 +20,13 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 20 | 20 | ||
| 21 | var self = Self{ | 21 | var self = Self{ |
| 22 | .allocator = allocator, | 22 | .allocator = allocator, |
| 23 | .case_map = try allocator.alloc([3]u21, 0x110000), | 23 | .case_map = try allocator.alloc([2]u21, 0x110000), |
| 24 | }; | 24 | }; |
| 25 | errdefer allocator.free(self.case_map); | 25 | errdefer allocator.free(self.case_map); |
| 26 | 26 | ||
| 27 | for (0..0x110000) |i| { | 27 | for (0..0x110000) |i| { |
| 28 | const cp: u21 = @intCast(i); | 28 | const cp: u21 = @intCast(i); |
| 29 | self.case_map[cp] = .{ cp, cp, cp }; | 29 | self.case_map[cp] = .{ cp, cp }; |
| 30 | } | 30 | } |
| 31 | 31 | ||
| 32 | // Uppercase | 32 | // Uppercase |
| @@ -55,19 +55,6 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 55 | self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian)); | 55 | self.case_map[cp][1] = @intCast(try lower_reader.readInt(u24, endian)); |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | // Titlercase | ||
| 59 | const title_bytes = @embedFile("title"); | ||
| 60 | var title_fbs = std.io.fixedBufferStream(title_bytes); | ||
| 61 | var title_decomp = try decompressor(allocator, title_fbs.reader(), null); | ||
| 62 | defer title_decomp.deinit(); | ||
| 63 | var title_reader = title_decomp.reader(); | ||
| 64 | |||
| 65 | while (true) { | ||
| 66 | const cp = try title_reader.readInt(u24, endian); | ||
| 67 | if (cp == 0) break; | ||
| 68 | self.case_map[cp][2] = @intCast(try title_reader.readInt(u24, endian)); | ||
| 69 | } | ||
| 70 | |||
| 71 | // Case properties | 58 | // Case properties |
| 72 | const cp_bytes = @embedFile("case_prop"); | 59 | const cp_bytes = @embedFile("case_prop"); |
| 73 | var cp_fbs = std.io.fixedBufferStream(cp_bytes); | 60 | var cp_fbs = std.io.fixedBufferStream(cp_bytes); |
| @@ -101,7 +88,6 @@ pub inline fn isCased(self: Self, cp: u21) bool { | |||
| 101 | 88 | ||
| 102 | // Returns true if `cp` is uppercase. | 89 | // Returns true if `cp` is uppercase. |
| 103 | pub fn isUpper(self: Self, cp: u21) bool { | 90 | pub fn isUpper(self: Self, cp: u21) bool { |
| 104 | if (!self.isCased(cp)) return true; | ||
| 105 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; | 91 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2; |
| 106 | } | 92 | } |
| 107 | 93 | ||
| @@ -110,7 +96,7 @@ pub fn isUpperStr(self: Self, str: []const u8) bool { | |||
| 110 | var iter = CodePointIterator{ .bytes = str }; | 96 | var iter = CodePointIterator{ .bytes = str }; |
| 111 | 97 | ||
| 112 | return while (iter.next()) |cp| { | 98 | return while (iter.next()) |cp| { |
| 113 | if (!self.isUpper(cp.code)) break false; | 99 | if (self.isCased(cp.code) and !self.isUpper(cp.code)) break false; |
| 114 | } else true; | 100 | } else true; |
| 115 | } | 101 | } |
| 116 | 102 | ||
| @@ -123,6 +109,11 @@ test "isUpperStr" { | |||
| 123 | try testing.expect(!cd.isUpperStr("Hello, World 2112!")); | 109 | try testing.expect(!cd.isUpperStr("Hello, World 2112!")); |
| 124 | } | 110 | } |
| 125 | 111 | ||
| 112 | /// Returns uppercase mapping for `cp`. | ||
| 113 | pub inline fn toUpper(self: Self, cp: u21) u21 { | ||
| 114 | return self.case_map[cp][0]; | ||
| 115 | } | ||
| 116 | |||
| 126 | /// Returns a new string with all letters in uppercase. | 117 | /// Returns a new string with all letters in uppercase. |
| 127 | /// Caller must free returned bytes with `allocator`. | 118 | /// Caller must free returned bytes with `allocator`. |
| 128 | pub fn toUpperStr( | 119 | pub fn toUpperStr( |
| @@ -153,28 +144,17 @@ test "toUpperStr" { | |||
| 153 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); | 144 | try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered); |
| 154 | } | 145 | } |
| 155 | 146 | ||
| 156 | /// Returns uppercase mapping for `cp`. | ||
| 157 | pub inline fn toUpper(self: Self, cp: u21) u21 { | ||
| 158 | return self.case_map[cp][0]; | ||
| 159 | } | ||
| 160 | |||
| 161 | // Returns true if `cp` is lowercase. | 147 | // Returns true if `cp` is lowercase. |
| 162 | pub fn isLower(self: Self, cp: u21) bool { | 148 | pub fn isLower(self: Self, cp: u21) bool { |
| 163 | if (!self.isCased(cp)) return true; | ||
| 164 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; | 149 | return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1; |
| 165 | } | 150 | } |
| 166 | 151 | ||
| 167 | /// Returns lowercase mapping for `cp`. | ||
| 168 | pub inline fn toLower(self: Self, cp: u21) u21 { | ||
| 169 | return self.case_map[cp][1]; | ||
| 170 | } | ||
| 171 | |||
| 172 | /// Returns true if `str` is all lowercase. | 152 | /// Returns true if `str` is all lowercase. |
| 173 | pub fn isLowerStr(self: Self, str: []const u8) bool { | 153 | pub fn isLowerStr(self: Self, str: []const u8) bool { |
| 174 | var iter = CodePointIterator{ .bytes = str }; | 154 | var iter = CodePointIterator{ .bytes = str }; |
| 175 | 155 | ||
| 176 | return while (iter.next()) |cp| { | 156 | return while (iter.next()) |cp| { |
| 177 | if (!self.isLower(cp.code)) break false; | 157 | if (self.isCased(cp.code) and !self.isLower(cp.code)) break false; |
| 178 | } else true; | 158 | } else true; |
| 179 | } | 159 | } |
| 180 | 160 | ||
| @@ -187,6 +167,11 @@ test "isLowerStr" { | |||
| 187 | try testing.expect(!cd.isLowerStr("Hello, World 2112!")); | 167 | try testing.expect(!cd.isLowerStr("Hello, World 2112!")); |
| 188 | } | 168 | } |
| 189 | 169 | ||
| 170 | /// Returns lowercase mapping for `cp`. | ||
| 171 | pub inline fn toLower(self: Self, cp: u21) u21 { | ||
| 172 | return self.case_map[cp][1]; | ||
| 173 | } | ||
| 174 | |||
| 190 | /// Returns a new string with all letters in lowercase. | 175 | /// Returns a new string with all letters in lowercase. |
| 191 | /// Caller must free returned bytes with `allocator`. | 176 | /// Caller must free returned bytes with `allocator`. |
| 192 | pub fn toLowerStr( | 177 | pub fn toLowerStr( |
| @@ -216,8 +201,3 @@ test "toLowerStr" { | |||
| 216 | defer testing.allocator.free(lowered); | 201 | defer testing.allocator.free(lowered); |
| 217 | try testing.expectEqualStrings("hello, world 2112!", lowered); | 202 | try testing.expectEqualStrings("hello, world 2112!", lowered); |
| 218 | } | 203 | } |
| 219 | |||
| 220 | /// Returns titlecase mapping for `cp`. | ||
| 221 | pub inline fn toTitle(self: Self, cp: u21) u21 { | ||
| 222 | return self.case_map[cp][2]; | ||
| 223 | } | ||