author     Jose Colon Rodriguez  2024-06-26 12:08:08 -0400
committer  Jose Colon Rodriguez  2024-06-26 12:08:08 -0400
commit     8ada7b4176d2c8afb7ecd01c4ac1aaa0f3b53cc0 (patch)
tree       4c73852d462aea2a800964ab2345b9d2f3d95607
parent     Merge pull request 'Normalize: Mark utf8Encode errors as unreachable, use exp... (diff)
Implemented squeek502's case fold
-rw-r--r--  codegen/fold.zig  | 289
-rw-r--r--  src/CaseFold.zig  |   3
-rw-r--r--  src/FoldData.zig  |  86
3 files changed, 245 insertions(+), 133 deletions(-)
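
The new codegen replaces the old per-codepoint slice table with a compressed three-stage lookup: stage1 maps each 256-codepoint block to a deduplicated stage2 block, stage2 maps the low byte of the codepoint to either an offset index or (high bit set) a multi-codepoint mapping index, and stage3 stores the signed offsets followed by packed three-codepoint mappings. A minimal decode sketch, assuming the tables are already in memory; the standalone function and its parameters are illustrative, the real lookup is FoldData.caseFold in the diff below:

const std = @import("std");

// Hypothetical standalone decoder mirroring the scheme in this commit; the
// table slices would come from the generated, deflate-compressed data file.
fn lookupFold(
    stage1: []const u8,
    stage2: []const u8,
    stage3: []const i24,
    multiple_start: usize,
    cp: u21,
    buf: []u21,
) []const u21 {
    // Codepoints past the last meaningful block fold to themselves.
    if ((cp >> 8) >= stage1.len) return &.{};

    // Stage 1: high bits select a 256-entry stage2 block; block 0 is the all-zero block.
    const block = stage1[cp >> 8];
    if (block == 0) return &.{};

    // Stage 2: low byte selects a stage3 slot.
    const slot = stage2[@as(usize, block) * 256 + (cp & 0xFF)];

    // High bit set: slot indexes a packed, zero-terminated 3-codepoint mapping.
    if (slot & 0x80 != 0) {
        const start = multiple_start + @as(usize, slot ^ 0x80) * 3;
        const mapping = std.mem.sliceTo(stage3[start..][0..3], 0);
        for (mapping, 0..) |c, i| buf[i] = @intCast(c);
        return buf[0..mapping.len];
    }

    // Otherwise stage3 holds a signed offset; 0 means the codepoint folds to itself.
    const offset = stage3[slot];
    if (offset == 0) return &.{};
    buf[0] = @intCast(@as(i32, cp) + offset);
    return buf[0..1];
}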
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 6dc51ac..ec024c5 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -1,123 +1,218 @@
 const std = @import("std");
 const builtin = @import("builtin");
-const fmt = std.fmt;
-const mem = std.mem;
+
+// From https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt
+// const case_folding_txt = @embedFile("CaseFolding.txt");
 
 pub fn main() !void {
-    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
-    defer arena.deinit();
-    const allocator = arena.allocator();
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer std.debug.assert(gpa.deinit() == .ok);
+    const allocator = gpa.allocator();
+
+    // const unbuf_stdout = std.io.getStdOut().writer();
+    // var buf_stdout = std.io.bufferedWriter(unbuf_stdout);
+    // const writer = buf_stdout.writer();
 
-    // Process DerivedCoreProperties.txt
-    var cp_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{});
+    var codepoint_mapping = std.AutoArrayHashMap(u21, [3]u21).init(allocator);
+    defer codepoint_mapping.deinit();
+
+    // Process
+    var cp_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
     defer cp_file.close();
     var cp_buf = std.io.bufferedReader(cp_file.reader());
     const cp_reader = cp_buf.reader();
 
-    var cp_map = std.AutoHashMap(u21, void).init(allocator);
-    defer cp_map.deinit();
-
+    // var line_it = std.mem.tokenizeAny(u8, case_folding_txt, "\r\n");
     var line_buf: [4096]u8 = undefined;
 
-    cp_lines: while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
+    while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
+        // while (line_it.next()) |line| {
         if (line.len == 0 or line[0] == '#') continue;
 
-        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
-
-        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
-        var current_code: [2]u21 = undefined;
-
-        var i: usize = 0;
-        while (field_iter.next()) |field| : (i += 1) {
-            switch (i) {
-                0 => {
-                    // Code point(s)
-                    if (std.mem.indexOf(u8, field, "..")) |dots| {
-                        current_code = .{
-                            try std.fmt.parseInt(u21, field[0..dots], 16),
-                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
-                        };
-                    } else {
-                        const code = try std.fmt.parseInt(u21, field, 16);
-                        current_code = .{ code, code };
-                    }
-                },
-                1 => {
-                    // Core property
-                    if (!mem.eql(u8, field, "Changes_When_Casefolded")) continue :cp_lines;
-                    for (current_code[0]..current_code[1] + 1) |cp| try cp_map.put(@intCast(cp), {});
-                },
-                else => {},
-            }
+        var field_it = std.mem.splitScalar(u8, line, ';');
+        const codepoint_str = field_it.first();
+        const codepoint = try std.fmt.parseUnsigned(u21, codepoint_str, 16);
+
+        const status = std.mem.trim(u8, field_it.next() orelse continue, " ");
+        // Only interested in 'common' and 'full'
+        if (status[0] != 'C' and status[0] != 'F') continue;
+
+        const mapping = std.mem.trim(u8, field_it.next() orelse continue, " ");
+        var mapping_it = std.mem.splitScalar(u8, mapping, ' ');
+        var mapping_buf = [_]u21{0} ** 3;
+        var mapping_i: u8 = 0;
+        while (mapping_it.next()) |mapping_c| {
+            mapping_buf[mapping_i] = try std.fmt.parseInt(u21, mapping_c, 16);
+            mapping_i += 1;
         }
-    }
 
-    // Process CaseFolding.txt
-    var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
-    defer in_file.close();
-    var in_buf = std.io.bufferedReader(in_file.reader());
-    const in_reader = in_buf.reader();
+        try codepoint_mapping.putNoClobber(codepoint, mapping_buf);
+    }
 
-    var args_iter = try std.process.argsWithAllocator(allocator);
-    defer args_iter.deinit();
-    _ = args_iter.skip();
-    const output_path = args_iter.next() orelse @panic("No output file arg!");
+    var offset_to_index = std.AutoHashMap(i32, u8).init(allocator);
+    defer offset_to_index.deinit();
+    var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator);
+    defer unique_offsets.deinit();
+
+    // First pass
+    {
+        var it = codepoint_mapping.iterator();
+        while (it.next()) |entry| {
+            const codepoint = entry.key_ptr.*;
+            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
+            if (mappings.len == 1) {
+                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
+                const result = try unique_offsets.getOrPut(offset);
+                if (!result.found_existing) result.value_ptr.* = 0;
+                result.value_ptr.* += 1;
+            }
+        }
 
-    const compressor = std.compress.flate.deflate.compressor;
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
-    const writer = out_comp.writer();
+        // A codepoint mapping to itself (offset=0) is the most common case
+        try unique_offsets.put(0, 0x10FFFF);
+        const C = struct {
+            vals: []u32,
 
-    const endian = builtin.cpu.arch.endian();
+            pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool {
+                return ctx.vals[a_index] > ctx.vals[b_index];
+            }
+        };
+        unique_offsets.sort(C{ .vals = unique_offsets.values() });
+
+        var offset_it = unique_offsets.iterator();
+        var offset_index: u7 = 0;
+        while (offset_it.next()) |entry| {
+            try offset_to_index.put(entry.key_ptr.*, offset_index);
+            offset_index += 1;
+        }
+    }
 
-    lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
-        if (line.len == 0 or line[0] == '#') continue;
+    var mappings_to_index = std.AutoArrayHashMap([3]u21, u8).init(allocator);
+    defer mappings_to_index.deinit();
+    var codepoint_to_index = std.AutoHashMap(u21, u8).init(allocator);
+    defer codepoint_to_index.deinit();
+
+    // Second pass
+    {
+        var count_multiple_codepoints: u8 = 0;
+
+        var it = codepoint_mapping.iterator();
+        while (it.next()) |entry| {
+            const codepoint = entry.key_ptr.*;
+            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
+            if (mappings.len > 1) {
+                const result = try mappings_to_index.getOrPut(entry.value_ptr.*);
+                if (!result.found_existing) {
+                    result.value_ptr.* = 0x80 | count_multiple_codepoints;
+                    count_multiple_codepoints += 1;
+                }
+                const index = result.value_ptr.*;
+                try codepoint_to_index.put(codepoint, index);
+            } else {
+                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
+                const index = offset_to_index.get(offset).?;
+                try codepoint_to_index.put(codepoint, index);
+            }
+        }
+    }
 
-        const no_comment = if (mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
-
-        var field_iter = mem.tokenizeSequence(u8, no_comment, "; ");
-        var cps: [4]u24 = undefined;
-        var len: usize = 2;
-
-        var i: usize = 0;
-        while (field_iter.next()) |field| : (i += 1) {
-            switch (i) {
-                0 => {
-                    var cp = try fmt.parseInt(u21, field, 16);
-                    cp <<= 1;
-                    if (cp_map.contains(cp)) cp |= 1;
-                    cps[0] = cp;
-                },
-
-                1 => {
-                    if (!mem.eql(u8, field, "C") and !mem.eql(u8, field, "F")) continue :lines;
-                    if (mem.eql(u8, field, "F")) len = 3;
-                },
-
-                2 => {
-                    if (len == 3) {
-                        // Full case fold
-                        // std.debug.print("-->{s} {s}\n", .{ line, field });
-                        var cp_iter = mem.tokenizeScalar(u8, field, ' ');
-                        len = 1;
-                        while (cp_iter.next()) |cp_str| : (len += 1) {
-                            cps[len] = try fmt.parseInt(u24, cp_str, 16);
-                        }
-                    } else {
-                        // Common case fold
-                        cps[1] = try fmt.parseInt(u24, field, 16);
-                    }
-                },
-
-                else => {},
+    // Build the stage1/stage2/stage3 arrays and output them
+    {
+        const Block = [256]u8;
+        var stage2_blocks = std.AutoArrayHashMap(Block, void).init(allocator);
+        defer stage2_blocks.deinit();
+
+        const empty_block: Block = [_]u8{0} ** 256;
+        try stage2_blocks.put(empty_block, {});
+        const stage1_len = (0x10FFFF / 256) + 1;
+        var stage1: [stage1_len]u8 = undefined;
+
+        var codepoint: u21 = 0;
+        var block: Block = undefined;
+        while (codepoint <= 0x10FFFF) {
+            const data_index = codepoint_to_index.get(codepoint) orelse 0;
+            block[codepoint % 256] = data_index;
+
+            codepoint += 1;
+            if (codepoint % 256 == 0) {
+                const result = try stage2_blocks.getOrPut(block);
+                const index = result.index;
+                stage1[(codepoint >> 8) - 1] = @intCast(index);
             }
         }
 
-        try writer.writeInt(u8, @intCast(len), endian);
-        for (cps[0..len]) |cp| try writer.writeInt(u24, cp, endian);
+        const last_meaningful_block = std.mem.lastIndexOfNone(u8, &stage1, "\x00").?;
+        const meaningful_stage1 = stage1[0 .. last_meaningful_block + 1];
+        const codepoint_cutoff = (last_meaningful_block + 1) << 8;
+        const multiple_codepoint_start: usize = unique_offsets.count();
+
+        var index: usize = 0;
+        const stage3_elems = unique_offsets.count() + mappings_to_index.count() * 3;
+        var stage3 = try allocator.alloc(i24, stage3_elems);
+        defer allocator.free(stage3);
+        for (unique_offsets.keys()) |key| {
+            stage3[index] = @intCast(key);
+            index += 1;
+        }
+        for (mappings_to_index.keys()) |key| {
+            stage3[index] = @intCast(key[0]);
+            stage3[index + 1] = @intCast(key[1]);
+            stage3[index + 2] = @intCast(key[2]);
+            index += 3;
+        }
+
+        const stage2_elems = stage2_blocks.count() * 256;
+        var stage2 = try allocator.alloc(u8, stage2_elems);
+        defer allocator.free(stage2);
+        for (stage2_blocks.keys(), 0..) |key, i| {
+            @memcpy(stage2[i * 256 ..][0..256], &key);
+        }
+
+        // try writer.print("const cutoff = 0x{X};\n", .{codepoint_cutoff});
+        // try writeArray(writer, u8, "stage1", meaningful_stage1);
+        // try writeArray(writer, u8, "stage2", stage2);
+        // try writer.print("const multiple_start = {};\n", .{multiple_codepoint_start});
+        // try writeArray(writer, i24, "stage3", stage3);
+
+        var args_iter = try std.process.argsWithAllocator(allocator);
+        defer args_iter.deinit();
+        _ = args_iter.skip();
+        const output_path = args_iter.next() orelse @panic("No output file arg!");
+
+        const compressor = std.compress.flate.deflate.compressor;
+        var out_file = try std.fs.cwd().createFile(output_path, .{});
+        defer out_file.close();
+        var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
+        const writer = out_comp.writer();
+
+        const endian = builtin.cpu.arch.endian();
+
+        try writer.writeInt(u24, @intCast(codepoint_cutoff), endian);
+        try writer.writeInt(u24, @intCast(multiple_codepoint_start), endian);
+
+        try writer.writeInt(u16, @intCast(meaningful_stage1.len), endian);
+        try writer.writeAll(meaningful_stage1);
+
+        try writer.writeInt(u16, @intCast(stage2.len), endian);
+        try writer.writeAll(stage2);
+
+        try writer.writeInt(u16, @intCast(stage3.len), endian);
+        for (stage3) |offset| try writer.writeInt(i24, offset, endian);
+
+        try out_comp.flush();
     }
 
-    try writer.writeInt(u16, 0, endian);
-    try out_comp.flush();
+    // try buf_stdout.flush();
 }
+
+// fn writeArray(writer: anytype, comptime T: type, name: []const u8, data: []const T) !void {
+//     try writer.print("const {s} = [{}]{s}{{", .{ name, data.len, @typeName(T) });
+//
+//     for (data, 0..) |v, i| {
+//         if (i % 32 == 0) try writer.writeAll("\n    ");
+//         try writer.print("{},", .{v});
+//         if (i != data.len - 1) try writer.writeByte(' ');
+//     }
+//
+//     try writer.writeAll("\n};\n");
+// }
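
A quick sanity check on why deduplicating single-codepoint offsets pays off: whole alphabetic ranges share one signed delta, so stage3 stays tiny. The values below come from CaseFolding.txt; the test itself is illustrative and not part of the commit:

const std = @import("std");

test "shared case-fold offsets" {
    // 'A'..'Z' all fold to lowercase with the same +32 offset.
    try std.testing.expectEqual(@as(i32, 32), @as(i32, 'a') - @as(i32, 'A'));
    try std.testing.expectEqual(@as(i32, 32), @as(i32, 'z') - @as(i32, 'Z'));
    // Cyrillic А (U+0410) folds to а (U+0430) with a shared +0x20 offset.
    try std.testing.expectEqual(@as(i32, 0x20), @as(i32, 0x0430) - @as(i32, 0x0410));
}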
diff --git a/src/CaseFold.zig b/src/CaseFold.zig
index 3e7535e..19c9da8 100644
--- a/src/CaseFold.zig
+++ b/src/CaseFold.zig
@@ -19,9 +19,10 @@ pub fn caseFold(
 ) ![]const u21 {
     var cfcps = std.ArrayList(u21).init(allocator);
     defer cfcps.deinit();
+    var buf: [3]u21 = undefined;
 
     for (cps) |cp| {
-        const cf = self.fold_data.caseFold(cp);
+        const cf = self.fold_data.caseFold(cp, &buf);
 
         if (cf.len == 0) {
             try cfcps.append(cp);
diff --git a/src/FoldData.zig b/src/FoldData.zig
index d4312b0..93613fe 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -4,8 +4,11 @@ const compress = std.compress;
 const mem = std.mem;
 
 allocator: mem.Allocator,
-fold: [][]u21 = undefined,
-cwcf: []bool = undefined,
+cutoff: u21 = undefined,
+multiple_start: u21 = undefined,
+stage1: []u8 = undefined,
+stage2: []u8 = undefined,
+stage3: []i24 = undefined,
 
 const Self = @This();
 
@@ -17,49 +20,62 @@ pub fn init(allocator: mem.Allocator) !Self {
     var reader = in_decomp.reader();
 
     const endian = builtin.cpu.arch.endian();
-    var self = Self{
-        .allocator = allocator,
-        .fold = try allocator.alloc([]u21, 0x110000),
-        .cwcf = try allocator.alloc(bool, 0x110000),
-    };
-
-    var slices: usize = 0;
-    errdefer {
-        for (self.fold[0..slices]) |slice| self.allocator.free(slice);
-        self.allocator.free(self.fold);
-        self.allocator.free(self.cwcf);
-    }
 
-    @memset(self.fold, &.{});
-    @memset(self.cwcf, false);
-
-    while (true) {
-        const len: u8 = try reader.readInt(u8, endian);
-        if (len == 0) break;
-        const cp = try reader.readInt(u24, endian);
-        self.fold[cp >> 1] = try allocator.alloc(u21, len - 1);
-        slices += 1;
-        for (0..len - 1) |i| {
-            self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian));
-        }
-        self.cwcf[cp >> 1] = cp & 1 == 1;
-    }
+    var self = Self{ .allocator = allocator };
+    self.cutoff = @intCast(try reader.readInt(u24, endian));
+    self.multiple_start = @intCast(try reader.readInt(u24, endian));
+
+    var len = try reader.readInt(u16, endian);
+    self.stage1 = try allocator.alloc(u8, len);
+    errdefer allocator.free(self.stage1);
+    for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian);
+
+    len = try reader.readInt(u16, endian);
+    self.stage2 = try allocator.alloc(u8, len);
+    errdefer allocator.free(self.stage2);
+    for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian);
+
+    len = try reader.readInt(u16, endian);
+    self.stage3 = try allocator.alloc(i24, len);
+    errdefer allocator.free(self.stage3);
+    for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);
 
     return self;
 }
 
 pub fn deinit(self: *const Self) void {
-    for (self.fold) |slice| self.allocator.free(slice);
-    self.allocator.free(self.fold);
-    self.allocator.free(self.cwcf);
+    self.allocator.free(self.stage1);
+    self.allocator.free(self.stage2);
+    self.allocator.free(self.stage3);
 }
 
 /// Returns the case fold for `cp`.
-pub inline fn caseFold(self: Self, cp: u21) []const u21 {
-    return self.fold[cp];
+pub inline fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
+    if (cp >= self.cutoff) return &.{};
+
+    const stage1_val = self.stage1[cp >> 8];
+    if (stage1_val == 0) return &.{};
+
+    const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
+    const stage3_index = self.stage2[stage2_index];
+
+    if (stage3_index & 0x80 != 0) {
+        const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3;
+        const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0);
+        for (mapping, 0..) |c, i| buf[i] = @intCast(c);
+
+        return buf[0..mapping.len];
+    }
+
+    const offset = self.stage3[stage3_index];
+    if (offset == 0) return &.{};
+
+    buf[0] = @intCast(@as(i32, cp) + offset);
+
+    return buf[0..1];
 }
 
 /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
-pub inline fn changesWhenCaseFolded(self: Self, cp: u21) bool {
-    return self.cwcf[cp];
+pub inline fn changesWhenCaseFolded(_: Self, _: u21) bool {
+    return true;
 }
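
A hypothetical call site for the new buffer-taking API; the @import name and build wiring below are assumed, adjust to how zg actually exposes FoldData:

const std = @import("std");
const FoldData = @import("FoldData");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer std.debug.assert(gpa.deinit() == .ok);
    const allocator = gpa.allocator();

    const fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();

    // Results are written into a caller-provided buffer instead of
    // per-codepoint heap allocations; 3 elements covers the longest full fold.
    var buf: [3]u21 = undefined;
    const folded = fold_data.caseFold(0x0130, &buf); // U+0130 has a two-codepoint full fold
    std.debug.print("{any}\n", .{folded});
}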