diff options
| author | 2024-03-01 19:28:41 -0400 | |
|---|---|---|
| committer | 2024-03-01 19:28:41 -0400 | |
| commit | 9b435e69cb9f1572728b38457fabc9636bc47143 (patch) | |
| tree | fabc23d4f122906457ab8e389ef44c23205d506e | |
| parent | Normalizer.eqlIgnoreCase compatibility caseless matching (diff) | |
| download | zg-9b435e69cb9f1572728b38457fabc9636bc47143.tar.gz zg-9b435e69cb9f1572728b38457fabc9636bc47143.tar.xz zg-9b435e69cb9f1572728b38457fabc9636bc47143.zip | |
Add Changes_When_Casefolded check; 20ms faster
| -rw-r--r-- | codegen/fold.zig | 54 | ||||
| -rw-r--r-- | src/FoldData.zig | 14 | ||||
| -rw-r--r-- | src/Normalizer.zig | 30 |
3 files changed, 89 insertions, 9 deletions
diff --git a/codegen/fold.zig b/codegen/fold.zig
index 7977e61..b3192e7 100644
--- a/codegen/fold.zig
+++ b/codegen/fold.zig
| @@ -8,7 +8,51 @@ pub fn main() !void { | |||
| 8 | defer arena.deinit(); | 8 | defer arena.deinit(); |
| 9 | const allocator = arena.allocator(); | 9 | const allocator = arena.allocator(); |
| 10 | 10 | ||
| 11 | // Process DerivedEastAsianWidth.txt | 11 | // Process DerivedCoreProperties.txt |
| 12 | var cp_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{}); | ||
| 13 | defer cp_file.close(); | ||
| 14 | var cp_buf = std.io.bufferedReader(cp_file.reader()); | ||
| 15 | const cp_reader = cp_buf.reader(); | ||
| 16 | |||
| 17 | var cp_map = std.AutoHashMap(u21, void).init(allocator); | ||
| 18 | defer cp_map.deinit(); | ||
| 19 | |||
| 20 | var line_buf: [4096]u8 = undefined; | ||
| 21 | |||
| 22 | cp_lines: while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { | ||
| 23 | if (line.len == 0 or line[0] == '#') continue; | ||
| 24 | |||
| 25 | const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; | ||
| 26 | |||
| 27 | var field_iter = std.mem.tokenizeAny(u8, no_comment, "; "); | ||
| 28 | var current_code: [2]u21 = undefined; | ||
| 29 | |||
| 30 | var i: usize = 0; | ||
| 31 | while (field_iter.next()) |field| : (i += 1) { | ||
| 32 | switch (i) { | ||
| 33 | 0 => { | ||
| 34 | // Code point(s) | ||
| 35 | if (std.mem.indexOf(u8, field, "..")) |dots| { | ||
| 36 | current_code = .{ | ||
| 37 | try std.fmt.parseInt(u21, field[0..dots], 16), | ||
| 38 | try std.fmt.parseInt(u21, field[dots + 2 ..], 16), | ||
| 39 | }; | ||
| 40 | } else { | ||
| 41 | const code = try std.fmt.parseInt(u21, field, 16); | ||
| 42 | current_code = .{ code, code }; | ||
| 43 | } | ||
| 44 | }, | ||
| 45 | 1 => { | ||
| 46 | // Core property | ||
| 47 | if (!mem.eql(u8, field, "Changes_When_Casefolded")) continue :cp_lines; | ||
| 48 | for (current_code[0]..current_code[1] + 1) |cp| try cp_map.put(@intCast(cp), {}); | ||
| 49 | }, | ||
| 50 | else => {}, | ||
| 51 | } | ||
| 52 | } | ||
| 53 | } | ||
| 54 | |||
| 55 | // Process CaseFolding.txt | ||
| 12 | var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{}); | 56 | var in_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{}); |
| 13 | defer in_file.close(); | 57 | defer in_file.close(); |
| 14 | var in_buf = std.io.bufferedReader(in_file.reader()); | 58 | var in_buf = std.io.bufferedReader(in_file.reader()); |
| @@ -27,7 +71,6 @@ pub fn main() !void { | |||
| 27 | const writer = out_comp.writer(); | 71 | const writer = out_comp.writer(); |
| 28 | 72 | ||
| 29 | const endian = builtin.cpu.arch.endian(); | 73 | const endian = builtin.cpu.arch.endian(); |
| 30 | var line_buf: [4096]u8 = undefined; | ||
| 31 | 74 | ||
| 32 | lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { | 75 | lines: while (try in_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { |
| 33 | if (line.len == 0 or line[0] == '#') continue; | 76 | if (line.len == 0 or line[0] == '#') continue; |
| @@ -41,7 +84,12 @@ pub fn main() !void { | |||
| 41 | var i: usize = 0; | 84 | var i: usize = 0; |
| 42 | while (field_iter.next()) |field| : (i += 1) { | 85 | while (field_iter.next()) |field| : (i += 1) { |
| 43 | switch (i) { | 86 | switch (i) { |
| 44 | 0 => cps[0] = try fmt.parseInt(u24, field, 16), | 87 | 0 => { |
| 88 | var cp = try fmt.parseInt(u21, field, 16); | ||
| 89 | cp <<= 1; | ||
| 90 | if (cp_map.contains(cp)) cp |= 1; | ||
| 91 | cps[0] = cp; | ||
| 92 | }, | ||
| 45 | 93 | ||
| 46 | 1 => { | 94 | 1 => { |
| 47 | if (!mem.eql(u8, field, "C") and !mem.eql(u8, field, "F")) continue :lines; | 95 | if (!mem.eql(u8, field, "C") and !mem.eql(u8, field, "F")) continue :lines; |
diff --git a/src/FoldData.zig b/src/FoldData.zig
index 139c677..2a9a1f5 100644
--- a/src/FoldData.zig
+++ b/src/FoldData.zig
| @@ -5,6 +5,7 @@ const mem = std.mem; | |||
| 5 | 5 | ||
| 6 | allocator: mem.Allocator, | 6 | allocator: mem.Allocator, |
| 7 | fold: [][]u21 = undefined, | 7 | fold: [][]u21 = undefined, |
| 8 | cwcf: []bool = undefined, | ||
| 8 | 9 | ||
| 9 | const Self = @This(); | 10 | const Self = @This(); |
| 10 | 11 | ||
| @@ -20,18 +21,21 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 20 | var self = Self{ | 21 | var self = Self{ |
| 21 | .allocator = allocator, | 22 | .allocator = allocator, |
| 22 | .fold = try allocator.alloc([]u21, 0x110000), | 23 | .fold = try allocator.alloc([]u21, 0x110000), |
| 24 | .cwcf = try allocator.alloc(bool, 0x110000), | ||
| 23 | }; | 25 | }; |
| 24 | 26 | ||
| 25 | @memset(self.fold, &.{}); | 27 | @memset(self.fold, &.{}); |
| 28 | @memset(self.cwcf, false); | ||
| 26 | 29 | ||
| 27 | while (true) { | 30 | while (true) { |
| 28 | const len: u8 = try reader.readInt(u8, endian); | 31 | const len: u8 = try reader.readInt(u8, endian); |
| 29 | if (len == 0) break; | 32 | if (len == 0) break; |
| 30 | const cp = try reader.readInt(u24, endian); | 33 | const cp = try reader.readInt(u24, endian); |
| 31 | self.fold[cp] = try allocator.alloc(u21, len - 1); | 34 | self.fold[cp >> 1] = try allocator.alloc(u21, len - 1); |
| 32 | for (0..len - 1) |i| { | 35 | for (0..len - 1) |i| { |
| 33 | self.fold[cp][i] = @intCast(try reader.readInt(u24, endian)); | 36 | self.fold[cp >> 1][i] = @intCast(try reader.readInt(u24, endian)); |
| 34 | } | 37 | } |
| 38 | self.cwcf[cp >> 1] = cp & 1 == 1; | ||
| 35 | } | 39 | } |
| 36 | 40 | ||
| 37 | return self; | 41 | return self; |
| @@ -40,9 +44,15 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 40 | pub fn deinit(self: *Self) void { | 44 | pub fn deinit(self: *Self) void { |
| 41 | for (self.fold) |slice| self.allocator.free(slice); | 45 | for (self.fold) |slice| self.allocator.free(slice); |
| 42 | self.allocator.free(self.fold); | 46 | self.allocator.free(self.fold); |
| 47 | self.allocator.free(self.cwcf); | ||
| 43 | } | 48 | } |
| 44 | 49 | ||
| 45 | /// Returns the case fold for `cp`. | 50 | /// Returns the case fold for `cp`. |
| 46 | pub inline fn caseFold(self: Self, cp: u21) []const u21 { | 51 | pub inline fn caseFold(self: Self, cp: u21) []const u21 { |
| 47 | return self.fold[cp]; | 52 | return self.fold[cp]; |
| 48 | } | 53 | } |
| 54 | |||
| 55 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | ||
| 56 | pub inline fn changesWhenCaseFolded(self: Self, cp: u21) bool { | ||
| 57 | return self.cwcf[cp]; | ||
| 58 | } | ||
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index c68b2ec..5a26dfa 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
| @@ -389,6 +389,12 @@ fn nfkdCodePoints( | |||
| 389 | return try dcp_list.toOwnedSlice(); | 389 | return try dcp_list.toOwnedSlice(); |
| 390 | } | 390 | } |
| 391 | 391 | ||
| 392 | fn changesWhenCaseFolded(self: Self, cps: []const u21) bool { | ||
| 393 | return for (cps) |cp| { | ||
| 394 | if (self.norm_data.fold_data.changesWhenCaseFolded(cp)) break true; | ||
| 395 | } else false; | ||
| 396 | } | ||
| 397 | |||
| 392 | pub fn eqlIgnoreCase( | 398 | pub fn eqlIgnoreCase( |
| 393 | self: Self, | 399 | self: Self, |
| 394 | allocator: mem.Allocator, | 400 | allocator: mem.Allocator, |
| @@ -397,10 +403,18 @@ pub fn eqlIgnoreCase( | |||
| 397 | ) !bool { | 403 | ) !bool { |
| 398 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); | 404 | if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b); |
| 399 | 405 | ||
| 406 | // Process a | ||
| 400 | const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd); | 407 | const nfd_a = try self.nfxdCodePoints(allocator, a, .nfd); |
| 401 | defer allocator.free(nfd_a); | 408 | defer allocator.free(nfd_a); |
| 402 | const cf_nfd_a = try self.caseFold(allocator, nfd_a); | 409 | |
| 403 | defer allocator.free(cf_nfd_a); | 410 | var need_frr_cf_nfd_a = false; |
| 411 | var cf_nfd_a: []const u21 = nfd_a; | ||
| 412 | if (self.changesWhenCaseFolded(nfd_a)) { | ||
| 413 | cf_nfd_a = try self.caseFold(allocator, nfd_a); | ||
| 414 | need_frr_cf_nfd_a = true; | ||
| 415 | } | ||
| 416 | defer if (need_frr_cf_nfd_a) allocator.free(cf_nfd_a); | ||
| 417 | |||
| 404 | const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a); | 418 | const nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfd_a); |
| 405 | defer allocator.free(nfkd_cf_nfd_a); | 419 | defer allocator.free(nfkd_cf_nfd_a); |
| 406 | const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); | 420 | const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a); |
| @@ -408,10 +422,18 @@ pub fn eqlIgnoreCase( | |||
| 408 | const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); | 422 | const nfkd_cf_nfkd_cf_nfd_a = try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a); |
| 409 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); | 423 | defer allocator.free(nfkd_cf_nfkd_cf_nfd_a); |
| 410 | 424 | ||
| 425 | // Process b | ||
| 411 | const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd); | 426 | const nfd_b = try self.nfxdCodePoints(allocator, b, .nfd); |
| 412 | defer allocator.free(nfd_b); | 427 | defer allocator.free(nfd_b); |
| 413 | const cf_nfd_b = try self.caseFold(allocator, nfd_b); | 428 | |
| 414 | defer allocator.free(cf_nfd_b); | 429 | var need_frr_cf_nfd_b = false; |
| 430 | var cf_nfd_b: []const u21 = nfd_b; | ||
| 431 | if (self.changesWhenCaseFolded(nfd_b)) { | ||
| 432 | cf_nfd_b = try self.caseFold(allocator, nfd_b); | ||
| 433 | need_frr_cf_nfd_b = true; | ||
| 434 | } | ||
| 435 | defer if (need_frr_cf_nfd_b) allocator.free(cf_nfd_b); | ||
| 436 | |||
| 415 | const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b); | 437 | const nfkd_cf_nfd_b = try self.nfkdCodePoints(allocator, cf_nfd_b); |
| 416 | defer allocator.free(nfkd_cf_nfd_b); | 438 | defer allocator.free(nfkd_cf_nfd_b); |
| 417 | const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); | 439 | const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b); |