summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jose Colon Rodriguez, 2024-03-01 19:28:41 -0400
committer: Jose Colon Rodriguez, 2024-03-01 19:28:41 -0400
commit: 9b435e69cb9f1572728b38457fabc9636bc47143 (patch)
tree: fabc23d4f122906457ab8e389ef44c23205d506e
parent: Normalizer.eqlIgnoreCase compatibility caseless matching (diff)
download: zg-9b435e69cb9f1572728b38457fabc9636bc47143.tar.gz
zg-9b435e69cb9f1572728b38457fabc9636bc47143.tar.xz
zg-9b435e69cb9f1572728b38457fabc9636bc47143.zip
Changes when case folded check; 20ms faster
-rw-r--r-- codegen/fold.zig 54
-rw-r--r-- src/FoldData.zig 14
-rw-r--r-- src/Normalizer.zig 30
3 files changed, 89 insertions, 9 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 7977e61..b3192e7 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
@@ -8,7 +8,51 @@ pub fn main() !void {
8 defer arena.deinit(); 8 defer arena.deinit();
9 const allocator = arena.allocator(); 9 const allocator = arena.allocator();
10 10
11 // Process DerivedEastAsianWidth.txt 11 // Process DerivedCoreProperties.txt
12 var cp_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{});
13 defer cp_file.close();
14 var cp_buf = std.io.bufferedReader(cp_file.reader());
15 const cp_reader = cp_buf.reader();
16
17 var cp_map = std.AutoHashMap(u21, void).init(allocator);
18 defer cp_map.deinit();
19
20 var line_buf: [4096]u8 = undefined;
21
22 cp_lines: while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
23 if (line.len == 0 or line[0] == '#') continue;
24
25 const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
26
27 var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
28 var current_code: [2]u21 = undefined;
29
30 var i: usize = 0;
31 while (field_iter.next()) |field| : (i += 1) {
32 switch (i) {
33 0 => {
34 // Code point(s)
35 if (std.mem.indexOf(u8, field, "..")) |dots| {
36 current_code = .{
37 try std.fmt.parseInt(u21, field[0..dots], 16),
38 try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
39 };
40 } else {
41 const code = try std.fmt.parseInt(u21, field, 16);
42 current_code = .{ code, code };
43 }
44 },
45 1 => {
46 // Core property
47 if (!mem.eql(u8, field, "Changes_When_Casefolded")) continue :cp_lines;
48 for (current_code[0]..current_code[1] + 1) |cp| try cp_map.put(@intCast(cp), {});
49 },
50 else => {},
51 }
52 }
53 }
54
55 // Process CaseFolding.txt
12 var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{}); 56 var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
13 defer in_file.close(); 57 defer in_file.close();
14 var in_buf = std.io.bufferedReader(in_file.reader()); 58 var in_buf = std.io.bufferedReader(in_file.reader());
@@ -27,7 +71,6 @@ pub fn main() !void {
27 const writer = out_comp.writer(); 71 const writer = out_comp.writer();
28 72
29 const endian = builtin.cpu.arch.endian(); 73 const endian = builtin.cpu.arch.endian();
30 var line_buf: [4096]u8 = undefined;
31 74
32 lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { 75 lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
33 if (line.len == 0 or line[0] == '#') continue; 76 if (line.len == 0 or line[0] == '#') continue;
@@ -41,7 +84,12 @@ pub fn main() !void {
41 var i: usize = 0; 84 var i: usize = 0;
42 while (field_iter.next()) |field| : (i += 1) { 85 while (field_iter.next()) |field| : (i += 1) {
43 switch (i) { 86 switch (i) {
44 0 => cps[0] = try fmt.parseInt(u24, field, 16), 87 0 => {
88 const cp = try fmt.parseInt(u21, field, 16);
89 // cp_map is keyed by the real code point, so query it before shifting.
90 const cwcf_bit: u24 = @intFromBool(cp_map.contains(cp));
91 cps[0] = (@as(u24, cp) << 1) | cwcf_bit; // bit 0: Changes_When_Casefolded flag; remaining bits: code point (widened so no bit is shifted out)
92 },
45 93
46 1 => { 94 1 => {
47 if (!mem.eql(u8, field, "C") and !mem.eql(u8, field, "F")) continue :lines; 95 if (!mem.eql(u8, field, "C") and !mem.eql(u8, field, "F")) continue :lines;
diff --git a/src/FoldData.zig b/src/FoldData.zig
index 139c677..2a9a1f5 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
@@ -5,6 +5,7 @@ const mem = std.mem;
5 5
6allocator: mem.Allocator, 6allocator: mem.Allocator,
7fold: [][]u21 = undefined, 7fold: [][]u21 = undefined,
8cwcf: []bool = undefined,
8 9
9const Self = @This(); 10const Self = @This();
10 11
@@ -20,18 +21,21 @@ pub fn init(allocator: mem.Allocator) !Self {
20 var self = Self{ 21 var self = Self{
21 .allocator = allocator, 22 .allocator = allocator,
22 .fold = try allocator.alloc([]u21, 0x110000), 23 .fold = try allocator.alloc([]u21, 0x110000),
24 .cwcf = try allocator.alloc(bool, 0x110000),
23 }; 25 };
24 26
25 @memset(self.fold, &.{}); 27 @memset(self.fold, &.{});
28 @memset(self.cwcf, false);
26 29
27 while (true) { 30 while (true) {
28 const len: u8 = try reader.readInt(u8, endian); 31 const len: u8 = try reader.readInt(u8, endian);
29 if (len == 0) break; 32 if (len == 0) break;
30 const cp = try reader.readInt(u24, endian); 33 const cp = try reader.readInt(u24, endian);
31 self.fold[cp] = try allocator.alloc(u21, len - 1); 34 self.fold[cp >> 1] = try allocator.alloc(u21, len - 1);
32 for (0..len - 1) |i| { 35 for (0..len - 1) |i| {
33 self.fold[cp][i] = @intCast(try reader.readInt(u24, endian)); 36 self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian));
34 } 37 }
38 self.cwcf[cp >> 1] = cp & 1 == 1;
35 } 39 }
36 40
37 return self; 41 return self;
@@ -40,9 +44,15 @@ pub fn init(allocator: mem.Allocator) !Self {
40pub fn deinit(self: *Self) void { 44pub fn deinit(self: *Self) void {
41 for (self.fold) |slice| self.allocator.free(slice); 45 for (self.fold) |slice| self.allocator.free(slice);
42 self.allocator.free(self.fold); 46 self.allocator.free(self.fold);
47 self.allocator.free(self.cwcf);
43} 48}
44 49
45/// Returns the case fold for `cp`. 50/// Returns the case fold for `cp`.
46pub inline fn caseFold(self: Self, cp: u21) []const u21 { 51pub inline fn caseFold(self: Self, cp: u21) []const u21 {
47 return self.fold[cp]; 52 return self.fold[cp];
48} 53}
54
55/// Returns the flag loaded into `cwcf` for `cp`; intended to be true when caseFold(NFD(`cp`)) != NFD(`cp`), false for code points with no entry in the loaded data.
56pub inline fn changesWhenCaseFolded(self: Self, cp: u21) bool {
57 return self.cwcf[cp]; // direct index into the 0x110000-entry table allocated in init
58}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index c68b2ec..5a26dfa 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -389,6 +389,12 @@ fn nfkdCodePoints(
389 return try dcp_list.toOwnedSlice(); 389 return try dcp_list.toOwnedSlice();
390} 390}
391 391
392fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { // true if fold_data.changesWhenCaseFolded reports true for any code point in `cps`
393 return for (cps) |cp| {
394 if (self.norm_data.fold_data.changesWhenCaseFolded(cp)) break true; // stop at the first flagged code point
395 } else false; // loop finished without a hit (also the result for empty `cps`)
396}
397
392pub fn eqlIgnoreCase( 398pub fn eqlIgnoreCase(
393 self: Self, 399 self: Self,
394 allocator: mem.Allocator, 400 allocator: mem.Allocator,
@@ -397,10 +403,18 @@ pub fn eqlIgnoreCase(
397) !bool { 403) !bool {
398 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); 404 if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);
399 405
406 // Process a
400 const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd); 407 const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd);
401 defer allocator.free(nfd_a); 408 defer allocator.free(nfd_a);
402 const cf_nfd_a = try self.caseFold(allocator, nfd_a); 409
403 defer allocator.free(cf_nfd_a); 410 var need_frr_cf_nfd_a = false;
411 var cf_nfd_a: []const u21 = nfd_a;
412 if (self.changesWhenCaseFolded(nfd_a)) {
413 cf_nfd_a = try self.caseFold(allocator, nfd_a);
414 need_frr_cf_nfd_a = true;
415 }
416 defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a);
417
404 const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a); 418 const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a);
405 defer allocator.free(nfkd_cf_nfd_a); 419 defer allocator.free(nfkd_cf_nfd_a);
406 const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); 420 const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a);
@@ -408,10 +422,18 @@ pub fn eqlIgnoreCase(
408 const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); 422 const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
409 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); 423 defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);
410 424
425 // Process b
411 const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd); 426 const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd);
412 defer allocator.free(nfd_b); 427 defer allocator.free(nfd_b);
413 const cf_nfd_b = try self.caseFold(allocator, nfd_b); 428
414 defer allocator.free(cf_nfd_b); 429 var need_frr_cf_nfd_b = false;
430 var cf_nfd_b: []const u21 = nfd_b;
431 if (self.changesWhenCaseFolded(nfd_b)) {
432 cf_nfd_b = try self.caseFold(allocator, nfd_b);
433 need_frr_cf_nfd_b = true;
434 }
435 defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b);
436
415 const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b); 437 const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b);
416 defer allocator.free(nfkd_cf_nfd_b); 438 defer allocator.free(nfkd_cf_nfd_b);
417 const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); 439 const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b);