summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
committerGravatar Jose Colon Rodriguez2024-03-01 18:51:43 -0400
commit9a0fb96c0c28540493a205b85d1b89d2c9b50f2b (patch)
tree723760b45ef8ef604b235d10c3c60edfadd0bb70 /src
parentRemoved dupe tombstone check in Normalizer (diff)
downloadzg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.gz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.tar.xz
zg-9a0fb96c0c28540493a205b85d1b89d2c9b50f2b.zip
Normalizer.eqlIgnoreCase compatibility caseless matching
Diffstat (limited to 'src')
-rw-r--r--src/FoldData.zig48
-rw-r--r--src/NormData.zig4
-rw-r--r--src/Normalizer.zig103
-rw-r--r--src/main.zig17
4 files changed, 163 insertions, 9 deletions
diff --git a/src/FoldData.zig b/src/FoldData.zig
new file mode 100644
index 0000000..139c677
--- /dev/null
+++ b/src/FoldData.zig
@@ -0,0 +1,48 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5
6allocator: mem.Allocator,
7fold: [][]u21 = undefined,
8
9const Self = @This();
10
/// Builds the case folding table from the embedded, deflate-compressed
/// `fold` data.
///
/// The decompressed stream is read as a sequence of records: a `u8` length,
/// a `u24` code point, then `length - 1` `u24` values that are stored as that
/// code point's fold. A record whose length is zero ends the stream.
///
/// Returns an error if decompression, reading, or allocation fails; in that
/// case every allocation made here is released before returning. On success
/// the caller must eventually call `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("fold");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{
        .allocator = allocator,
        .fold = try allocator.alloc([]u21, 0x110000),
    };

    // Every entry starts as an empty slice, so the table can be walked and
    // freed safely even when only part of it has been populated.
    @memset(self.fold, &.{});

    // From here on any failure must release the outer table and whatever
    // per-code-point slices were already allocated.
    errdefer self.deinit();

    while (true) {
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;
        const cp = try reader.readInt(u24, endian);
        self.fold[cp] = try allocator.alloc(u21, len - 1);
        for (0..len - 1) |i| {
            self.fold[cp][i] = @intCast(try reader.readInt(u24, endian));
        }
    }

    return self;
}
39
/// Frees each per-code-point fold slice and then the outer table itself,
/// using the allocator stored at `init` time.
pub fn deinit(self: *Self) void {
    const allocator = self.allocator;

    var index: usize = 0;
    while (index < self.fold.len) : (index += 1) {
        allocator.free(self.fold[index]);
    }

    allocator.free(self.fold);
}
44
/// Looks up the stored case fold for `cp`.
///
/// The result is the table entry at index `cp`. Entries that were never
/// filled in by `init` are empty slices.
pub inline fn caseFold(self: Self, cp: u21) []const u21 {
    const mapping: []const u21 = self.fold[cp];
    return mapping;
}
diff --git a/src/NormData.zig b/src/NormData.zig
index 7c2a09b..3c2f614 100644
--- a/src/NormData.zig
+++ b/src/NormData.zig
@@ -4,6 +4,7 @@ const mem = std.mem;
4const CanonData = @import("CanonData"); 4const CanonData = @import("CanonData");
5const CccData = @import("CombiningData"); 5const CccData = @import("CombiningData");
6const CompatData = @import("CompatData"); 6const CompatData = @import("CompatData");
7const FoldData = @import("FoldData");
7const HangulData = @import("HangulData"); 8const HangulData = @import("HangulData");
8const NormPropsData = @import("NormPropsData"); 9const NormPropsData = @import("NormPropsData");
9 10
@@ -12,6 +13,7 @@ ccc_data: CccData,
12compat_data: CompatData, 13compat_data: CompatData,
13hangul_data: HangulData, 14hangul_data: HangulData,
14normp_data: NormPropsData, 15normp_data: NormPropsData,
16fold_data: FoldData,
15 17
16const Self = @This(); 18const Self = @This();
17 19
@@ -20,6 +22,7 @@ pub fn init(allocator: std.mem.Allocator) !Self {
20 .canon_data = try CanonData.init(allocator), 22 .canon_data = try CanonData.init(allocator),
21 .ccc_data = try CccData.init(allocator), 23 .ccc_data = try CccData.init(allocator),
22 .compat_data = try CompatData.init(allocator), 24 .compat_data = try CompatData.init(allocator),
25 .fold_data = try FoldData.init(allocator),
23 .hangul_data = try HangulData.init(allocator), 26 .hangul_data = try HangulData.init(allocator),
24 .normp_data = try NormPropsData.init(allocator), 27 .normp_data = try NormPropsData.init(allocator),
25 }; 28 };
@@ -30,5 +33,6 @@ pub fn deinit(self: *Self) void {
30 self.ccc_data.deinit(); 33 self.ccc_data.deinit();
31 self.compat_data.deinit(); 34 self.compat_data.deinit();
32 self.hangul_data.deinit(); 35 self.hangul_data.deinit();
36 self.fold_data.deinit();
33 self.normp_data.deinit(); 37 self.normp_data.deinit();
34} 38}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index abe35e5..c68b2ec 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -343,7 +343,102 @@ test "nfkd !ASCII / alloc" {
343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice); 343 try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
344} 344}
345 345
/// Case folds `cps` one code point at a time.
///
/// A code point whose fold table entry is empty is copied through unchanged;
/// otherwise its entry is appended in its place. The caller owns the returned
/// slice and must free it with `allocator`.
fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    errdefer folded.deinit();

    for (cps) |cp| {
        const mapping = self.norm_data.fold_data.caseFold(cp);

        if (mapping.len != 0) {
            try folded.appendSlice(mapping);
            continue;
        }

        try folded.append(cp);
    }

    return try folded.toOwnedSlice();
}
366
/// Decomposes every code point in `cps` with the `.nfkd` form and then runs
/// `canonicalSort` over the combined result.
///
/// The caller owns the returned slice and must free it with `allocator`.
fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]u21 {
    var decomposed = std.ArrayList(u21).init(allocator);
    errdefer decomposed.deinit();

    // Scratch space handed to `decompose` for a single code point.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfkd, &scratch);

        if (dc.form != .same) {
            try decomposed.appendSlice(dc.cps);
            continue;
        }

        // No decomposition reported for this code point: keep it as is.
        try decomposed.append(cp);
    }

    self.canonicalSort(decomposed.items);

    return try decomposed.toOwnedSlice();
}
391
/// Tests `a` and `b` for equality ignoring case, using compatibility
/// caseless matching.
///
/// When both inputs are ASCII-only the comparison is delegated to
/// `std.ascii.eqlIgnoreCase` and nothing is allocated. Otherwise each input
/// is converted with `compatCaselessKey` and the resulting code point
/// sequences are compared for exact equality.
///
/// `allocator` is used only for temporary buffers; all of them are freed
/// before returning. Returns an error if any intermediate step fails.
pub fn eqlIgnoreCase(
    self: Self,
    allocator: mem.Allocator,
    a: []const u8,
    b: []const u8,
) !bool {
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    const key_a = try self.compatCaselessKey(allocator, a);
    defer allocator.free(key_a);
    const key_b = try self.compatCaselessKey(allocator, b);
    defer allocator.free(key_b);

    return mem.eql(u21, key_a, key_b);
}

/// Computes NFKD(caseFold(NFKD(caseFold(NFD(str))))) as code points.
///
/// Intermediate buffers are freed before returning; the caller owns the
/// returned slice and must free it with `allocator`.
fn compatCaselessKey(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u21 {
    const nfd = try self.nfxdCodePoints(allocator, str, .nfd);
    defer allocator.free(nfd);
    const cf_nfd = try self.caseFold(allocator, nfd);
    defer allocator.free(cf_nfd);
    const nfkd_cf_nfd = try self.nfkdCodePoints(allocator, cf_nfd);
    defer allocator.free(nfkd_cf_nfd);
    const cf_nfkd_cf_nfd = try self.caseFold(allocator, nfkd_cf_nfd);
    defer allocator.free(cf_nfkd_cf_nfd);

    return try self.nfkdCodePoints(allocator, cf_nfkd_cf_nfd);
}
424
// Exercises both the ASCII-only path and the non-ASCII path of
// `eqlIgnoreCase`.
test "eqlIgnoreCase" {
    const allocator = testing.allocator;
    var data = try NormData.init(allocator);
    defer data.deinit();
    // `n` is only read from, so it is bound as a constant; newer Zig
    // compilers reject a `var` that is never mutated.
    const n = Self{ .norm_data = &data };

    // Both inputs are ASCII-only.
    try testing.expect(try n.eqlIgnoreCase(allocator, "ascii only!", "ASCII Only!"));

    // Non-ASCII inputs written with different code point sequences.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(try n.eqlIgnoreCase(allocator, a, b));

    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try n.eqlIgnoreCase(allocator, a, c));
}
440
441// Composition (NFC, NFKC)
347 442
348fn isHangul(self: Self, cp: u21) bool { 443fn isHangul(self: Self, cp: u21) bool {
349 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; 444 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
@@ -504,11 +599,11 @@ test "nfkc" {
504 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice); 599 try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
505} 600}
506 601
507/// Tests for equality of `a` and `b` after normalizing to NFD. 602/// Tests for equality of `a` and `b` after normalizing to NFC.
508pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { 603pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
509 var norm_result_a = try self.nfd(allocator, a); 604 var norm_result_a = try self.nfc(allocator, a);
510 defer norm_result_a.deinit(); 605 defer norm_result_a.deinit();
511 var norm_result_b = try self.nfd(allocator, b); 606 var norm_result_b = try self.nfc(allocator, b);
512 defer norm_result_b.deinit(); 607 defer norm_result_b.deinit();
513 608
514 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice); 609 return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
diff --git a/src/main.zig b/src/main.zig
index 59a0fbc..a5afa66 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -49,21 +49,28 @@ pub fn main() !void {
49 var iter = std.mem.splitScalar(u8, input, '\n'); 49 var iter = std.mem.splitScalar(u8, input, '\n');
50 50
51 var result: usize = 0; 51 var result: usize = 0;
52 var prev_line: []const u8 = "";
52 // var result: isize = 0; 53 // var result: isize = 0;
53 var timer = try std.time.Timer.start(); 54 var timer = try std.time.Timer.start();
54 55
55 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code)); 56 // while (iter.next()) |cp| result += codePointWidth(@intCast(cp.code));
56 // while (iter.next()) |_| result += 1; 57 // while (iter.next()) |_| result += 1;
57 // while (iter.next()) |line| result += strWidth(line, &data); 58 // while (iter.next()) |line| result += strWidth(line, &data);
58 while (iter.next()) |line| { 59 // while (iter.next()) |line| {
59 const nfc = try n.nfkc(allocator, line); 60 // const nfc = try n.nfkc(allocator, line);
60 result += nfc.slice.len; 61 // result += nfc.slice.len;
61 // nfc.deinit(); 62 // // nfc.deinit();
62 } 63 // }
63 // while (iter.next()) |cp| { 64 // while (iter.next()) |cp| {
64 // if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) }); 65 // if (cp.code == 'É') std.debug.print("`{u}` Gc: {s}\n", .{ cp.code, @tagName(gencat_data.gc(cp.code)) });
65 // result += 1; 66 // result += 1;
66 // } 67 // }
68 while (iter.next()) |line| {
69 if (try n.eqlIgnoreCase(allocator, prev_line, line)) {
70 result += line.len;
71 }
72 prev_line = line;
73 }
67 74
68 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms }); 75 std.debug.print("result: {}, took: {}\n", .{ result, timer.lap() / std.time.ns_per_ms });
69} 76}