diff options
| author | 2025-04-30 12:58:26 -0400 | |
|---|---|---|
| committer | 2025-04-30 13:01:37 -0400 | |
| commit | 3c2c30bfbe861c6c48acd8d7507886787197a788 (patch) | |
| tree | 875ba35c1954b201207452b18a189ebd70c0b596 /src | |
| parent | grapheme now Graphemes, Data files gone (diff) | |
| download | zg-3c2c30bfbe861c6c48acd8d7507886787197a788.tar.gz zg-3c2c30bfbe861c6c48acd8d7507886787197a788.tar.xz zg-3c2c30bfbe861c6c48acd8d7507886787197a788.zip | |
Merge NormData with Normalize
Diffstat (limited to 'src')
| -rw-r--r-- | src/CanonData.zig | 50 | ||||
| -rw-r--r-- | src/CaseFold.zig | 12 | ||||
| -rw-r--r-- | src/CombiningData.zig | 44 | ||||
| -rw-r--r-- | src/CompatData.zig | 36 | ||||
| -rw-r--r-- | src/FoldData.zig | 78 | ||||
| -rw-r--r-- | src/HangulData.zig | 42 | ||||
| -rw-r--r-- | src/NormData.zig | 37 | ||||
| -rw-r--r-- | src/NormPropsData.zig | 50 | ||||
| -rw-r--r-- | src/Normalize.zig | 193 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 5 |
10 files changed, 269 insertions, 278 deletions
diff --git a/src/CanonData.zig b/src/CanonData.zig index 794748c..c67d1d6 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig | |||
| @@ -1,14 +1,11 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Canonicalization Data |
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | 2 | ||
| 6 | nfc: std.AutoHashMapUnmanaged([2]u21, u21), | 3 | nfc: std.AutoHashMapUnmanaged([2]u21, u21), |
| 7 | nfd: [][]u21 = undefined, | 4 | nfd: [][]u21 = undefined, |
| 8 | 5 | ||
| 9 | const Self = @This(); | 6 | const CanonData = @This(); |
| 10 | 7 | ||
| 11 | pub fn init(allocator: mem.Allocator) !Self { | 8 | pub fn init(allocator: mem.Allocator) !CanonData { |
| 12 | const decompressor = compress.flate.inflate.decompressor; | 9 | const decompressor = compress.flate.inflate.decompressor; |
| 13 | const in_bytes = @embedFile("canon"); | 10 | const in_bytes = @embedFile("canon"); |
| 14 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 11 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -16,49 +13,54 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 16 | var reader = in_decomp.reader(); | 13 | var reader = in_decomp.reader(); |
| 17 | 14 | ||
| 18 | const endian = builtin.cpu.arch.endian(); | 15 | const endian = builtin.cpu.arch.endian(); |
| 19 | var self = Self{ | 16 | var cdata = CanonData{ |
| 20 | .nfc = .{}, | 17 | .nfc = .empty, |
| 21 | .nfd = try allocator.alloc([]u21, 0x110000), | 18 | .nfd = try allocator.alloc([]u21, 0x110000), |
| 22 | }; | 19 | }; |
| 23 | 20 | ||
| 24 | var slices: usize = 0; | 21 | var slices: usize = 0; |
| 25 | errdefer { | 22 | errdefer { |
| 26 | self.nfc.deinit(allocator); | 23 | cdata.nfc.deinit(allocator); |
| 27 | for (self.nfd[0..slices]) |slice| allocator.free(slice); | 24 | for (cdata.nfd[0..slices]) |slice| allocator.free(slice); |
| 28 | allocator.free(self.nfd); | 25 | allocator.free(cdata.nfd); |
| 29 | } | 26 | } |
| 30 | 27 | ||
| 31 | @memset(self.nfd, &.{}); | 28 | @memset(cdata.nfd, &.{}); |
| 32 | 29 | ||
| 33 | while (true) { | 30 | while (true) { |
| 34 | const len: u8 = try reader.readInt(u8, endian); | 31 | const len: u8 = try reader.readInt(u8, endian); |
| 35 | if (len == 0) break; | 32 | if (len == 0) break; |
| 36 | const cp = try reader.readInt(u24, endian); | 33 | const cp = try reader.readInt(u24, endian); |
| 37 | self.nfd[cp] = try allocator.alloc(u21, len - 1); | 34 | cdata.nfd[cp] = try allocator.alloc(u21, len - 1); |
| 38 | slices += 1; | 35 | slices += 1; |
| 39 | for (0..len - 1) |i| { | 36 | for (0..len - 1) |i| { |
| 40 | self.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); | 37 | cdata.nfd[cp][i] = @intCast(try reader.readInt(u24, endian)); |
| 41 | } | 38 | } |
| 42 | if (len == 3) { | 39 | if (len == 3) { |
| 43 | try self.nfc.put(allocator, self.nfd[cp][0..2].*, @intCast(cp)); | 40 | try cdata.nfc.put(allocator, cdata.nfd[cp][0..2].*, @intCast(cp)); |
| 44 | } | 41 | } |
| 45 | } | 42 | } |
| 46 | 43 | ||
| 47 | return self; | 44 | return cdata; |
| 48 | } | 45 | } |
| 49 | 46 | ||
| 50 | pub fn deinit(self: *Self, allocator: mem.Allocator) void { | 47 | pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { |
| 51 | self.nfc.deinit(allocator); | 48 | cdata.nfc.deinit(allocator); |
| 52 | for (self.nfd) |slice| allocator.free(slice); | 49 | for (cdata.nfd) |slice| allocator.free(slice); |
| 53 | allocator.free(self.nfd); | 50 | allocator.free(cdata.nfd); |
| 54 | } | 51 | } |
| 55 | 52 | ||
| 56 | /// Returns canonical decomposition for `cp`. | 53 | /// Returns canonical decomposition for `cp`. |
| 57 | pub fn toNfd(self: Self, cp: u21) []const u21 { | 54 | pub fn toNfd(cdata: *const CanonData, cp: u21) []const u21 { |
| 58 | return self.nfd[cp]; | 55 | return cdata.nfd[cp]; |
| 59 | } | 56 | } |
| 60 | 57 | ||
| 61 | // Returns the primary composite for the codepoints in `cps`. | 58 | // Returns the primary composite for the codepoints in `cps`. |
| 62 | pub fn toNfc(self: Self, cps: [2]u21) ?u21 { | 59 | pub fn toNfc(cdata: *const CanonData, cps: [2]u21) ?u21 { |
| 63 | return self.nfc.get(cps); | 60 | return cdata.nfc.get(cps); |
| 64 | } | 61 | } |
| 62 | |||
| 63 | const std = @import("std"); | ||
| 64 | const builtin = @import("builtin"); | ||
| 65 | const compress = std.compress; | ||
| 66 | const mem = std.mem; | ||
diff --git a/src/CaseFold.zig b/src/CaseFold.zig index c84a420..6490aea 100644 --- a/src/CaseFold.zig +++ b/src/CaseFold.zig | |||
| @@ -95,10 +95,8 @@ pub fn compatCaselessMatch( | |||
| 95 | test "compatCaselessMatch" { | 95 | test "compatCaselessMatch" { |
| 96 | const allocator = testing.allocator; | 96 | const allocator = testing.allocator; |
| 97 | 97 | ||
| 98 | var norm_data = Normalize.NormData{}; | 98 | const n = try Normalize.init(allocator); |
| 99 | try norm_data.init(allocator); | 99 | defer n.deinit(allocator); |
| 100 | defer norm_data.deinit(allocator); | ||
| 101 | const n = Normalize{ .norm_data = &norm_data }; | ||
| 102 | 100 | ||
| 103 | const fold_data = try FoldData.init(allocator); | 101 | const fold_data = try FoldData.init(allocator); |
| 104 | defer fold_data.deinit(allocator); | 102 | defer fold_data.deinit(allocator); |
| @@ -171,10 +169,8 @@ pub fn canonCaselessMatch( | |||
| 171 | test "canonCaselessMatch" { | 169 | test "canonCaselessMatch" { |
| 172 | const allocator = testing.allocator; | 170 | const allocator = testing.allocator; |
| 173 | 171 | ||
| 174 | var norm_data = Normalize.NormData{}; | 172 | const n = try Normalize.init(allocator); |
| 175 | try norm_data.init(allocator); | 173 | defer n.deinit(allocator); |
| 176 | defer norm_data.deinit(allocator); | ||
| 177 | const n = Normalize{ .norm_data = &norm_data }; | ||
| 178 | 174 | ||
| 179 | const fold_data = try FoldData.init(allocator); | 175 | const fold_data = try FoldData.init(allocator); |
| 180 | defer fold_data.deinit(allocator); | 176 | defer fold_data.deinit(allocator); |
diff --git a/src/CombiningData.zig b/src/CombiningData.zig index b5e227a..fd64a3b 100644 --- a/src/CombiningData.zig +++ b/src/CombiningData.zig | |||
| @@ -1,14 +1,11 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Combining Class Data |
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | 2 | ||
| 6 | s1: []u16 = undefined, | 3 | s1: []u16 = undefined, |
| 7 | s2: []u8 = undefined, | 4 | s2: []u8 = undefined, |
| 8 | 5 | ||
| 9 | const Self = @This(); | 6 | const CombiningData = @This(); |
| 10 | 7 | ||
| 11 | pub fn init(allocator: mem.Allocator) !Self { | 8 | pub fn init(allocator: mem.Allocator) !CombiningData { |
| 12 | const decompressor = compress.flate.inflate.decompressor; | 9 | const decompressor = compress.flate.inflate.decompressor; |
| 13 | const in_bytes = @embedFile("ccc"); | 10 | const in_bytes = @embedFile("ccc"); |
| 14 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 11 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -17,32 +14,37 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 17 | 14 | ||
| 18 | const endian = builtin.cpu.arch.endian(); | 15 | const endian = builtin.cpu.arch.endian(); |
| 19 | 16 | ||
| 20 | var self = Self{}; | 17 | var cbdata = CombiningData{}; |
| 21 | 18 | ||
| 22 | const stage_1_len: u16 = try reader.readInt(u16, endian); | 19 | const stage_1_len: u16 = try reader.readInt(u16, endian); |
| 23 | self.s1 = try allocator.alloc(u16, stage_1_len); | 20 | cbdata.s1 = try allocator.alloc(u16, stage_1_len); |
| 24 | errdefer allocator.free(self.s1); | 21 | errdefer allocator.free(cbdata.s1); |
| 25 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | 22 | for (0..stage_1_len) |i| cbdata.s1[i] = try reader.readInt(u16, endian); |
| 26 | 23 | ||
| 27 | const stage_2_len: u16 = try reader.readInt(u16, endian); | 24 | const stage_2_len: u16 = try reader.readInt(u16, endian); |
| 28 | self.s2 = try allocator.alloc(u8, stage_2_len); | 25 | cbdata.s2 = try allocator.alloc(u8, stage_2_len); |
| 29 | errdefer allocator.free(self.s2); | 26 | errdefer allocator.free(cbdata.s2); |
| 30 | _ = try reader.readAll(self.s2); | 27 | _ = try reader.readAll(cbdata.s2); |
| 31 | 28 | ||
| 32 | return self; | 29 | return cbdata; |
| 33 | } | 30 | } |
| 34 | 31 | ||
| 35 | pub fn deinit(self: *const Self, allocator: mem.Allocator) void { | 32 | pub fn deinit(cbdata: *const CombiningData, allocator: mem.Allocator) void { |
| 36 | allocator.free(self.s1); | 33 | allocator.free(cbdata.s1); |
| 37 | allocator.free(self.s2); | 34 | allocator.free(cbdata.s2); |
| 38 | } | 35 | } |
| 39 | 36 | ||
| 40 | /// Returns the canonical combining class for a code point. | 37 | /// Returns the canonical combining class for a code point. |
| 41 | pub fn ccc(self: Self, cp: u21) u8 { | 38 | pub fn ccc(cbdata: CombiningData, cp: u21) u8 { |
| 42 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)]; | 39 | return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)]; |
| 43 | } | 40 | } |
| 44 | 41 | ||
| 45 | /// True if `cp` is a starter code point, not a combining character. | 42 | /// True if `cp` is a starter code point, not a combining character. |
| 46 | pub fn isStarter(self: Self, cp: u21) bool { | 43 | pub fn isStarter(cbdata: CombiningData, cp: u21) bool { |
| 47 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0; | 44 | return cbdata.s2[cbdata.s1[cp >> 8] + (cp & 0xff)] == 0; |
| 48 | } | 45 | } |
| 46 | |||
| 47 | const std = @import("std"); | ||
| 48 | const builtin = @import("builtin"); | ||
| 49 | const compress = std.compress; | ||
| 50 | const mem = std.mem; | ||
diff --git a/src/CompatData.zig b/src/CompatData.zig index ac08048..d787103 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig | |||
| @@ -1,13 +1,10 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Compatibility Data |
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | 2 | ||
| 6 | nfkd: [][]u21 = undefined, | 3 | nfkd: [][]u21 = undefined, |
| 7 | 4 | ||
| 8 | const Self = @This(); | 5 | const CompatData = @This(); |
| 9 | 6 | ||
| 10 | pub fn init(allocator: mem.Allocator) !Self { | 7 | pub fn init(allocator: mem.Allocator) !CompatData { |
| 11 | const decompressor = compress.flate.inflate.decompressor; | 8 | const decompressor = compress.flate.inflate.decompressor; |
| 12 | const in_bytes = @embedFile("compat"); | 9 | const in_bytes = @embedFile("compat"); |
| 13 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 10 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -15,34 +12,39 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 15 | var reader = in_decomp.reader(); | 12 | var reader = in_decomp.reader(); |
| 16 | 13 | ||
| 17 | const endian = builtin.cpu.arch.endian(); | 14 | const endian = builtin.cpu.arch.endian(); |
| 18 | var self = Self{ | 15 | var cpdata = CompatData{ |
| 19 | .nfkd = try allocator.alloc([]u21, 0x110000), | 16 | .nfkd = try allocator.alloc([]u21, 0x110000), |
| 20 | }; | 17 | }; |
| 21 | errdefer self.deinit(allocator); | 18 | errdefer cpdata.deinit(allocator); |
| 22 | 19 | ||
| 23 | @memset(self.nfkd, &.{}); | 20 | @memset(cpdata.nfkd, &.{}); |
| 24 | 21 | ||
| 25 | while (true) { | 22 | while (true) { |
| 26 | const len: u8 = try reader.readInt(u8, endian); | 23 | const len: u8 = try reader.readInt(u8, endian); |
| 27 | if (len == 0) break; | 24 | if (len == 0) break; |
| 28 | const cp = try reader.readInt(u24, endian); | 25 | const cp = try reader.readInt(u24, endian); |
| 29 | self.nfkd[cp] = try allocator.alloc(u21, len - 1); | 26 | cpdata.nfkd[cp] = try allocator.alloc(u21, len - 1); |
| 30 | for (0..len - 1) |i| { | 27 | for (0..len - 1) |i| { |
| 31 | self.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian)); | 28 | cpdata.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian)); |
| 32 | } | 29 | } |
| 33 | } | 30 | } |
| 34 | 31 | ||
| 35 | return self; | 32 | return cpdata; |
| 36 | } | 33 | } |
| 37 | 34 | ||
| 38 | pub fn deinit(self: *const Self, allocator: mem.Allocator) void { | 35 | pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void { |
| 39 | for (self.nfkd) |slice| { | 36 | for (cpdata.nfkd) |slice| { |
| 40 | if (slice.len != 0) allocator.free(slice); | 37 | if (slice.len != 0) allocator.free(slice); |
| 41 | } | 38 | } |
| 42 | allocator.free(self.nfkd); | 39 | allocator.free(cpdata.nfkd); |
| 43 | } | 40 | } |
| 44 | 41 | ||
| 45 | /// Returns compatibility decomposition for `cp`. | 42 | /// Returns compatibility decomposition for `cp`. |
| 46 | pub fn toNfkd(self: Self, cp: u21) []u21 { | 43 | pub fn toNfkd(cpdata: *const CompatData, cp: u21) []u21 { |
| 47 | return self.nfkd[cp]; | 44 | return cpdata.nfkd[cp]; |
| 48 | } | 45 | } |
| 46 | |||
| 47 | const std = @import("std"); | ||
| 48 | const builtin = @import("builtin"); | ||
| 49 | const compress = std.compress; | ||
| 50 | const mem = std.mem; | ||
diff --git a/src/FoldData.zig b/src/FoldData.zig index e44e714..b7fdceb 100644 --- a/src/FoldData.zig +++ b/src/FoldData.zig | |||
| @@ -12,9 +12,9 @@ stage1: []u8 = undefined, | |||
| 12 | stage2: []u8 = undefined, | 12 | stage2: []u8 = undefined, |
| 13 | stage3: []i24 = undefined, | 13 | stage3: []i24 = undefined, |
| 14 | 14 | ||
| 15 | const Self = @This(); | 15 | const FoldData = @This(); |
| 16 | 16 | ||
| 17 | pub fn init(allocator: mem.Allocator) !Self { | 17 | pub fn init(allocator: mem.Allocator) !FoldData { |
| 18 | const decompressor = compress.flate.inflate.decompressor; | 18 | const decompressor = compress.flate.inflate.decompressor; |
| 19 | const in_bytes = @embedFile("fold"); | 19 | const in_bytes = @embedFile("fold"); |
| 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -23,61 +23,61 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 23 | 23 | ||
| 24 | const endian = builtin.cpu.arch.endian(); | 24 | const endian = builtin.cpu.arch.endian(); |
| 25 | 25 | ||
| 26 | var self = Self{}; | 26 | var fdata = FoldData{}; |
| 27 | self.cutoff = @intCast(try reader.readInt(u24, endian)); | 27 | fdata.cutoff = @intCast(try reader.readInt(u24, endian)); |
| 28 | self.multiple_start = @intCast(try reader.readInt(u24, endian)); | 28 | fdata.multiple_start = @intCast(try reader.readInt(u24, endian)); |
| 29 | 29 | ||
| 30 | var len = try reader.readInt(u16, endian); | 30 | var len = try reader.readInt(u16, endian); |
| 31 | self.stage1 = try allocator.alloc(u8, len); | 31 | fdata.stage1 = try allocator.alloc(u8, len); |
| 32 | errdefer allocator.free(self.stage1); | 32 | errdefer allocator.free(fdata.stage1); |
| 33 | for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian); | 33 | for (0..len) |i| fdata.stage1[i] = try reader.readInt(u8, endian); |
| 34 | 34 | ||
| 35 | len = try reader.readInt(u16, endian); | 35 | len = try reader.readInt(u16, endian); |
| 36 | self.stage2 = try allocator.alloc(u8, len); | 36 | fdata.stage2 = try allocator.alloc(u8, len); |
| 37 | errdefer allocator.free(self.stage2); | 37 | errdefer allocator.free(fdata.stage2); |
| 38 | for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian); | 38 | for (0..len) |i| fdata.stage2[i] = try reader.readInt(u8, endian); |
| 39 | 39 | ||
| 40 | len = try reader.readInt(u16, endian); | 40 | len = try reader.readInt(u16, endian); |
| 41 | self.stage3 = try allocator.alloc(i24, len); | 41 | fdata.stage3 = try allocator.alloc(i24, len); |
| 42 | errdefer allocator.free(self.stage3); | 42 | errdefer allocator.free(fdata.stage3); |
| 43 | for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian); | 43 | for (0..len) |i| fdata.stage3[i] = try reader.readInt(i24, endian); |
| 44 | 44 | ||
| 45 | self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); | 45 | fdata.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian)); |
| 46 | self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); | 46 | fdata.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian)); |
| 47 | len = try reader.readInt(u16, endian); | 47 | len = try reader.readInt(u16, endian); |
| 48 | self.cwcf_exceptions = try allocator.alloc(u21, len); | 48 | fdata.cwcf_exceptions = try allocator.alloc(u21, len); |
| 49 | errdefer allocator.free(self.cwcf_exceptions); | 49 | errdefer allocator.free(fdata.cwcf_exceptions); |
| 50 | for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); | 50 | for (0..len) |i| fdata.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian)); |
| 51 | 51 | ||
| 52 | return self; | 52 | return fdata; |
| 53 | } | 53 | } |
| 54 | 54 | ||
| 55 | pub fn deinit(self: *const Self, allocator: mem.Allocator) void { | 55 | pub fn deinit(fdata: *const FoldData, allocator: mem.Allocator) void { |
| 56 | allocator.free(self.stage1); | 56 | allocator.free(fdata.stage1); |
| 57 | allocator.free(self.stage2); | 57 | allocator.free(fdata.stage2); |
| 58 | allocator.free(self.stage3); | 58 | allocator.free(fdata.stage3); |
| 59 | allocator.free(self.cwcf_exceptions); | 59 | allocator.free(fdata.cwcf_exceptions); |
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | /// Returns the case fold for `cp`. | 62 | /// Returns the case fold for `cp`. |
| 63 | pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { | 63 | pub fn caseFold(fdata: *const FoldData, cp: u21, buf: []u21) []const u21 { |
| 64 | if (cp >= self.cutoff) return &.{}; | 64 | if (cp >= fdata.cutoff) return &.{}; |
| 65 | 65 | ||
| 66 | const stage1_val = self.stage1[cp >> 8]; | 66 | const stage1_val = fdata.stage1[cp >> 8]; |
| 67 | if (stage1_val == 0) return &.{}; | 67 | if (stage1_val == 0) return &.{}; |
| 68 | 68 | ||
| 69 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); | 69 | const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF); |
| 70 | const stage3_index = self.stage2[stage2_index]; | 70 | const stage3_index = fdata.stage2[stage2_index]; |
| 71 | 71 | ||
| 72 | if (stage3_index & 0x80 != 0) { | 72 | if (stage3_index & 0x80 != 0) { |
| 73 | const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3; | 73 | const real_index = @as(usize, fdata.multiple_start) + (stage3_index ^ 0x80) * 3; |
| 74 | const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0); | 74 | const mapping = mem.sliceTo(fdata.stage3[real_index..][0..3], 0); |
| 75 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); | 75 | for (mapping, 0..) |c, i| buf[i] = @intCast(c); |
| 76 | 76 | ||
| 77 | return buf[0..mapping.len]; | 77 | return buf[0..mapping.len]; |
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | const offset = self.stage3[stage3_index]; | 80 | const offset = fdata.stage3[stage3_index]; |
| 81 | if (offset == 0) return &.{}; | 81 | if (offset == 0) return &.{}; |
| 82 | 82 | ||
| 83 | buf[0] = @intCast(@as(i32, cp) + offset); | 83 | buf[0] = @intCast(@as(i32, cp) + offset); |
| @@ -86,14 +86,14 @@ pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 { | |||
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). | 88 | /// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`). |
| 89 | pub fn changesWhenCaseFolded(self: Self, cp: u21) bool { | 89 | pub fn changesWhenCaseFolded(fdata: *const FoldData, cp: u21) bool { |
| 90 | var buf: [3]u21 = undefined; | 90 | var buf: [3]u21 = undefined; |
| 91 | const has_mapping = self.caseFold(cp, &buf).len != 0; | 91 | const has_mapping = fdata.caseFold(cp, &buf).len != 0; |
| 92 | return has_mapping and !self.isCwcfException(cp); | 92 | return has_mapping and !fdata.isCwcfException(cp); |
| 93 | } | 93 | } |
| 94 | 94 | ||
| 95 | fn isCwcfException(self: Self, cp: u21) bool { | 95 | fn isCwcfException(fdata: *const FoldData, cp: u21) bool { |
| 96 | return cp >= self.cwcf_exceptions_min and | 96 | return cp >= fdata.cwcf_exceptions_min and |
| 97 | cp <= self.cwcf_exceptions_max and | 97 | cp <= fdata.cwcf_exceptions_max and |
| 98 | std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null; | 98 | std.mem.indexOfScalar(u21, fdata.cwcf_exceptions, cp) != null; |
| 99 | } | 99 | } |
diff --git a/src/HangulData.zig b/src/HangulData.zig index 4bccbe6..8c5f3ad 100644 --- a/src/HangulData.zig +++ b/src/HangulData.zig | |||
| @@ -1,8 +1,4 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Hangul Data |
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | const testing = std.testing; | ||
| 6 | 2 | ||
| 7 | pub const Syllable = enum { | 3 | pub const Syllable = enum { |
| 8 | none, | 4 | none, |
| @@ -16,9 +12,9 @@ pub const Syllable = enum { | |||
| 16 | s1: []u16 = undefined, | 12 | s1: []u16 = undefined, |
| 17 | s2: []u3 = undefined, | 13 | s2: []u3 = undefined, |
| 18 | 14 | ||
| 19 | const Self = @This(); | 15 | const Hangul = @This(); |
| 20 | 16 | ||
| 21 | pub fn init(allocator: mem.Allocator) !Self { | 17 | pub fn init(allocator: mem.Allocator) !Hangul { |
| 22 | const decompressor = compress.flate.inflate.decompressor; | 18 | const decompressor = compress.flate.inflate.decompressor; |
| 23 | const in_bytes = @embedFile("hangul"); | 19 | const in_bytes = @embedFile("hangul"); |
| 24 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -26,27 +22,33 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 26 | var reader = in_decomp.reader(); | 22 | var reader = in_decomp.reader(); |
| 27 | 23 | ||
| 28 | const endian = builtin.cpu.arch.endian(); | 24 | const endian = builtin.cpu.arch.endian(); |
| 29 | var self = Self{}; | 25 | var hangul = Hangul{}; |
| 30 | 26 | ||
| 31 | const stage_1_len: u16 = try reader.readInt(u16, endian); | 27 | const stage_1_len: u16 = try reader.readInt(u16, endian); |
| 32 | self.s1 = try allocator.alloc(u16, stage_1_len); | 28 | hangul.s1 = try allocator.alloc(u16, stage_1_len); |
| 33 | errdefer allocator.free(self.s1); | 29 | errdefer allocator.free(hangul.s1); |
| 34 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | 30 | for (0..stage_1_len) |i| hangul.s1[i] = try reader.readInt(u16, endian); |
| 35 | 31 | ||
| 36 | const stage_2_len: u16 = try reader.readInt(u16, endian); | 32 | const stage_2_len: u16 = try reader.readInt(u16, endian); |
| 37 | self.s2 = try allocator.alloc(u3, stage_2_len); | 33 | hangul.s2 = try allocator.alloc(u3, stage_2_len); |
| 38 | errdefer allocator.free(self.s2); | 34 | errdefer allocator.free(hangul.s2); |
| 39 | for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); | 35 | for (0..stage_2_len) |i| hangul.s2[i] = @intCast(try reader.readInt(u8, endian)); |
| 40 | 36 | ||
| 41 | return self; | 37 | return hangul; |
| 42 | } | 38 | } |
| 43 | 39 | ||
| 44 | pub fn deinit(self: *const Self, allocator: mem.Allocator) void { | 40 | pub fn deinit(hangul: *const Hangul, allocator: mem.Allocator) void { |
| 45 | allocator.free(self.s1); | 41 | allocator.free(hangul.s1); |
| 46 | allocator.free(self.s2); | 42 | allocator.free(hangul.s2); |
| 47 | } | 43 | } |
| 48 | 44 | ||
| 49 | /// Returns the Hangul syllable type for `cp`. | 45 | /// Returns the Hangul syllable type for `cp`. |
| 50 | pub fn syllable(self: Self, cp: u21) Syllable { | 46 | pub fn syllable(hangul: *const Hangul, cp: u21) Syllable { |
| 51 | return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]); | 47 | return @enumFromInt(hangul.s2[hangul.s1[cp >> 8] + (cp & 0xff)]); |
| 52 | } | 48 | } |
| 49 | |||
| 50 | const std = @import("std"); | ||
| 51 | const builtin = @import("builtin"); | ||
| 52 | const compress = std.compress; | ||
| 53 | const mem = std.mem; | ||
| 54 | const testing = std.testing; | ||
diff --git a/src/NormData.zig b/src/NormData.zig deleted file mode 100644 index a123860..0000000 --- a/src/NormData.zig +++ /dev/null | |||
| @@ -1,37 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const mem = std.mem; | ||
| 3 | |||
| 4 | const CanonData = @import("CanonData"); | ||
| 5 | const CccData = @import("CombiningData"); | ||
| 6 | const CompatData = @import("CompatData"); | ||
| 7 | const FoldData = @import("FoldData"); | ||
| 8 | const HangulData = @import("HangulData"); | ||
| 9 | const NormPropsData = @import("NormPropsData"); | ||
| 10 | |||
| 11 | canon_data: CanonData = undefined, | ||
| 12 | ccc_data: CccData = undefined, | ||
| 13 | compat_data: CompatData = undefined, | ||
| 14 | hangul_data: HangulData = undefined, | ||
| 15 | normp_data: NormPropsData = undefined, | ||
| 16 | |||
| 17 | const Self = @This(); | ||
| 18 | |||
| 19 | pub fn init(self: *Self, allocator: std.mem.Allocator) !void { | ||
| 20 | self.canon_data = try CanonData.init(allocator); | ||
| 21 | errdefer self.canon_data.deinit(allocator); | ||
| 22 | self.ccc_data = try CccData.init(allocator); | ||
| 23 | errdefer self.ccc_data.deinit(allocator); | ||
| 24 | self.compat_data = try CompatData.init(allocator); | ||
| 25 | errdefer self.compat_data.deinit(allocator); | ||
| 26 | self.hangul_data = try HangulData.init(allocator); | ||
| 27 | errdefer self.hangul_data.deinit(allocator); | ||
| 28 | self.normp_data = try NormPropsData.init(allocator); | ||
| 29 | } | ||
| 30 | |||
| 31 | pub fn deinit(self: *Self, allocator: mem.Allocator) void { | ||
| 32 | self.canon_data.deinit(allocator); | ||
| 33 | self.ccc_data.deinit(allocator); | ||
| 34 | self.compat_data.deinit(allocator); | ||
| 35 | self.hangul_data.deinit(allocator); | ||
| 36 | self.normp_data.deinit(allocator); | ||
| 37 | } | ||
diff --git a/src/NormPropsData.zig b/src/NormPropsData.zig index e79ae91..ca69569 100644 --- a/src/NormPropsData.zig +++ b/src/NormPropsData.zig | |||
| @@ -1,15 +1,11 @@ | |||
| 1 | const std = @import("std"); | 1 | //! Normalization Properties Data |
| 2 | const builtin = @import("builtin"); | ||
| 3 | const compress = std.compress; | ||
| 4 | const mem = std.mem; | ||
| 5 | const testing = std.testing; | ||
| 6 | 2 | ||
| 7 | s1: []u16 = undefined, | 3 | s1: []u16 = undefined, |
| 8 | s2: []u4 = undefined, | 4 | s2: []u4 = undefined, |
| 9 | 5 | ||
| 10 | const Self = @This(); | 6 | const NormProps = @This(); |
| 11 | 7 | ||
| 12 | pub fn init(allocator: mem.Allocator) !Self { | 8 | pub fn init(allocator: mem.Allocator) !NormProps { |
| 13 | const decompressor = compress.flate.inflate.decompressor; | 9 | const decompressor = compress.flate.inflate.decompressor; |
| 14 | const in_bytes = @embedFile("normp"); | 10 | const in_bytes = @embedFile("normp"); |
| 15 | var in_fbs = std.io.fixedBufferStream(in_bytes); | 11 | var in_fbs = std.io.fixedBufferStream(in_bytes); |
| @@ -17,37 +13,43 @@ pub fn init(allocator: mem.Allocator) !Self { | |||
| 17 | var reader = in_decomp.reader(); | 13 | var reader = in_decomp.reader(); |
| 18 | 14 | ||
| 19 | const endian = builtin.cpu.arch.endian(); | 15 | const endian = builtin.cpu.arch.endian(); |
| 20 | var self = Self{}; | 16 | var norms = NormProps{}; |
| 21 | 17 | ||
| 22 | const stage_1_len: u16 = try reader.readInt(u16, endian); | 18 | const stage_1_len: u16 = try reader.readInt(u16, endian); |
| 23 | self.s1 = try allocator.alloc(u16, stage_1_len); | 19 | norms.s1 = try allocator.alloc(u16, stage_1_len); |
| 24 | errdefer allocator.free(self.s1); | 20 | errdefer allocator.free(norms.s1); |
| 25 | for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian); | 21 | for (0..stage_1_len) |i| norms.s1[i] = try reader.readInt(u16, endian); |
| 26 | 22 | ||
| 27 | const stage_2_len: u16 = try reader.readInt(u16, endian); | 23 | const stage_2_len: u16 = try reader.readInt(u16, endian); |
| 28 | self.s2 = try allocator.alloc(u4, stage_2_len); | 24 | norms.s2 = try allocator.alloc(u4, stage_2_len); |
| 29 | errdefer allocator.free(self.s2); | 25 | errdefer allocator.free(norms.s2); |
| 30 | for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian)); | 26 | for (0..stage_2_len) |i| norms.s2[i] = @intCast(try reader.readInt(u8, endian)); |
| 31 | 27 | ||
| 32 | return self; | 28 | return norms; |
| 33 | } | 29 | } |
| 34 | 30 | ||
| 35 | pub fn deinit(self: *const Self, allocator: mem.Allocator) void { | 31 | pub fn deinit(norms: *const NormProps, allocator: mem.Allocator) void { |
| 36 | allocator.free(self.s1); | 32 | allocator.free(norms.s1); |
| 37 | allocator.free(self.s2); | 33 | allocator.free(norms.s2); |
| 38 | } | 34 | } |
| 39 | 35 | ||
| 40 | /// Returns true if `cp` is already in NFD form. | 36 | /// Returns true if `cp` is already in NFD form. |
| 41 | pub fn isNfd(self: Self, cp: u21) bool { | 37 | pub fn isNfd(norms: *const NormProps, cp: u21) bool { |
| 42 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; | 38 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 1 == 0; |
| 43 | } | 39 | } |
| 44 | 40 | ||
| 45 | /// Returns true if `cp` is already in NFKD form. | 41 | /// Returns true if `cp` is already in NFKD form. |
| 46 | pub fn isNfkd(self: Self, cp: u21) bool { | 42 | pub fn isNfkd(norms: *const NormProps, cp: u21) bool { |
| 47 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; | 43 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 2 == 0; |
| 48 | } | 44 | } |
| 49 | 45 | ||
| 50 | /// Returns true if `cp` is not allowed in any normalized form. | 46 | /// Returns true if `cp` is not allowed in any normalized form. |
| 51 | pub fn isFcx(self: Self, cp: u21) bool { | 47 | pub fn isFcx(norms: *const NormProps, cp: u21) bool { |
| 52 | return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; | 48 | return norms.s2[norms.s1[cp >> 8] + (cp & 0xff)] & 4 == 4; |
| 53 | } | 49 | } |
| 50 | |||
| 51 | const std = @import("std"); | ||
| 52 | const builtin = @import("builtin"); | ||
| 53 | const compress = std.compress; | ||
| 54 | const mem = std.mem; | ||
| 55 | const testing = std.testing; | ||
diff --git a/src/Normalize.zig b/src/Normalize.zig index b738b27..4f014cf 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig | |||
| @@ -2,23 +2,41 @@ | |||
| 2 | //! Unicode Normalization. You can normalize strings into NFC, | 2 | //! Unicode Normalization. You can normalize strings into NFC, |
| 3 | //! NFKC, NFD, and NFKD normalization forms. | 3 | //! NFKC, NFD, and NFKD normalization forms. |
| 4 | 4 | ||
| 5 | const std = @import("std"); | 5 | canon_data: CanonData = undefined, |
| 6 | const debug = std.debug; | 6 | ccc_data: CccData = undefined, |
| 7 | const assert = debug.assert; | 7 | compat_data: CompatData = undefined, |
| 8 | const fmt = std.fmt; | 8 | hangul_data: HangulData = undefined, |
| 9 | const heap = std.heap; | 9 | normp_data: NormPropsData = undefined, |
| 10 | const mem = std.mem; | 10 | |
| 11 | const simd = std.simd; | 11 | const Normalize = @This(); |
| 12 | const testing = std.testing; | 12 | |
| 13 | const unicode = std.unicode; | 13 | pub fn init(allocator: Allocator) !Normalize { |
| 14 | 14 | var norm: Normalize = undefined; | |
| 15 | const ascii = @import("ascii"); | 15 | try norm.setup(allocator); |
| 16 | const CodePointIterator = @import("code_point").Iterator; | 16 | return norm; |
| 17 | pub const NormData = @import("NormData"); | 17 | } |
| 18 | 18 | ||
| 19 | norm_data: *const NormData, | 19 | pub fn setup(self: *Normalize, allocator: Allocator) !void { |
| 20 | self.canon_data = try CanonData.init(allocator); | ||
| 21 | errdefer self.canon_data.deinit(allocator); | ||
| 22 | self.ccc_data = try CccData.init(allocator); | ||
| 23 | errdefer self.ccc_data.deinit(allocator); | ||
| 24 | self.compat_data = try CompatData.init(allocator); | ||
| 25 | errdefer self.compat_data.deinit(allocator); | ||
| 26 | self.hangul_data = try HangulData.init(allocator); | ||
| 27 | errdefer self.hangul_data.deinit(allocator); | ||
| 28 | self.normp_data = try NormPropsData.init(allocator); | ||
| 29 | } | ||
| 20 | 30 | ||
| 21 | const Self = @This(); | 31 | pub fn deinit(norm: *const Normalize, allocator: Allocator) void { |
| 32 | // Reasonably safe (?) | ||
| 33 | var mut_norm = @constCast(norm); | ||
| 34 | mut_norm.canon_data.deinit(allocator); | ||
| 35 | mut_norm.ccc_data.deinit(allocator); | ||
| 36 | mut_norm.compat_data.deinit(allocator); | ||
| 37 | mut_norm.hangul_data.deinit(allocator); | ||
| 38 | mut_norm.normp_data.deinit(allocator); | ||
| 39 | } | ||
| 22 | 40 | ||
| 23 | const SBase: u21 = 0xAC00; | 41 | const SBase: u21 = 0xAC00; |
| 24 | const LBase: u21 = 0x1100; | 42 | const LBase: u21 = 0x1100; |
| @@ -30,8 +48,8 @@ const TCount: u21 = 28; | |||
| 30 | const NCount: u21 = 588; // VCount * TCount | 48 | const NCount: u21 = 588; // VCount * TCount |
| 31 | const SCount: u21 = 11172; // LCount * NCount | 49 | const SCount: u21 = 11172; // LCount * NCount |
| 32 | 50 | ||
| 33 | fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { | 51 | fn decomposeHangul(self: Normalize, cp: u21, buf: []u21) ?Decomp { |
| 34 | const kind = self.norm_data.hangul_data.syllable(cp); | 52 | const kind = self.hangul_data.syllable(cp); |
| 35 | if (kind != .LV and kind != .LVT) return null; | 53 | if (kind != .LV and kind != .LVT) return null; |
| 36 | 54 | ||
| 37 | const SIndex: u21 = cp - SBase; | 55 | const SIndex: u21 = cp - SBase; |
| @@ -90,21 +108,21 @@ const Decomp = struct { | |||
| 90 | }; | 108 | }; |
| 91 | 109 | ||
| 92 | // `mapping` retrieves the decomposition mapping for a code point as per the UCD. | 110 | // `mapping` retrieves the decomposition mapping for a code point as per the UCD. |
| 93 | fn mapping(self: Self, cp: u21, form: Form) Decomp { | 111 | fn mapping(self: Normalize, cp: u21, form: Form) Decomp { |
| 94 | var dc = Decomp{}; | 112 | var dc = Decomp{}; |
| 95 | 113 | ||
| 96 | switch (form) { | 114 | switch (form) { |
| 97 | .nfd => { | 115 | .nfd => { |
| 98 | dc.cps = self.norm_data.canon_data.toNfd(cp); | 116 | dc.cps = self.canon_data.toNfd(cp); |
| 99 | if (dc.cps.len != 0) dc.form = .nfd; | 117 | if (dc.cps.len != 0) dc.form = .nfd; |
| 100 | }, | 118 | }, |
| 101 | 119 | ||
| 102 | .nfkd => { | 120 | .nfkd => { |
| 103 | dc.cps = self.norm_data.compat_data.toNfkd(cp); | 121 | dc.cps = self.compat_data.toNfkd(cp); |
| 104 | if (dc.cps.len != 0) { | 122 | if (dc.cps.len != 0) { |
| 105 | dc.form = .nfkd; | 123 | dc.form = .nfkd; |
| 106 | } else { | 124 | } else { |
| 107 | dc.cps = self.norm_data.canon_data.toNfd(cp); | 125 | dc.cps = self.canon_data.toNfd(cp); |
| 108 | if (dc.cps.len != 0) dc.form = .nfkd; | 126 | if (dc.cps.len != 0) dc.form = .nfkd; |
| 109 | } | 127 | } |
| 110 | }, | 128 | }, |
| @@ -117,7 +135,7 @@ fn mapping(self: Self, cp: u21, form: Form) Decomp { | |||
| 117 | 135 | ||
| 118 | // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. | 136 | // `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. |
| 119 | fn decompose( | 137 | fn decompose( |
| 120 | self: Self, | 138 | self: Normalize, |
| 121 | cp: u21, | 139 | cp: u21, |
| 122 | form: Form, | 140 | form: Form, |
| 123 | buf: []u21, | 141 | buf: []u21, |
| @@ -127,8 +145,8 @@ fn decompose( | |||
| 127 | 145 | ||
| 128 | // NFD / NFKD quick checks. | 146 | // NFD / NFKD quick checks. |
| 129 | switch (form) { | 147 | switch (form) { |
| 130 | .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{}, | 148 | .nfd => if (self.normp_data.isNfd(cp)) return .{}, |
| 131 | .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{}, | 149 | .nfkd => if (self.normp_data.isNfkd(cp)) return .{}, |
| 132 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), | 150 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), |
| 133 | } | 151 | } |
| 134 | 152 | ||
| @@ -175,10 +193,8 @@ fn decompose( | |||
| 175 | 193 | ||
| 176 | test "decompose" { | 194 | test "decompose" { |
| 177 | const allocator = testing.allocator; | 195 | const allocator = testing.allocator; |
| 178 | var data: NormData = undefined; | 196 | const n = try Normalize.init(allocator); |
| 179 | try NormData.init(&data, allocator); | 197 | defer n.deinit(allocator); |
| 180 | defer data.deinit(allocator); | ||
| 181 | var n = Self{ .norm_data = &data }; | ||
| 182 | 198 | ||
| 183 | var buf: [18]u21 = undefined; | 199 | var buf: [18]u21 = undefined; |
| 184 | 200 | ||
| @@ -228,42 +244,42 @@ pub const Result = struct { | |||
| 228 | slice: []const u8, | 244 | slice: []const u8, |
| 229 | 245 | ||
| 230 | /// Ensures that the slice result is a copy of the input, by making a copy if it was not. | 246 | /// Ensures that the slice result is a copy of the input, by making a copy if it was not. |
| 231 | pub fn toOwned(result: Result, allocator: mem.Allocator) error{OutOfMemory}!Result { | 247 | pub fn toOwned(result: Result, allocator: Allocator) error{OutOfMemory}!Result { |
| 232 | if (result.allocated) return result; | 248 | if (result.allocated) return result; |
| 233 | return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; | 249 | return .{ .allocated = true, .slice = try allocator.dupe(u8, result.slice) }; |
| 234 | } | 250 | } |
| 235 | 251 | ||
| 236 | pub fn deinit(self: *const Result, allocator: mem.Allocator) void { | 252 | pub fn deinit(self: *const Result, allocator: Allocator) void { |
| 237 | if (self.allocated) allocator.free(self.slice); | 253 | if (self.allocated) allocator.free(self.slice); |
| 238 | } | 254 | } |
| 239 | }; | 255 | }; |
| 240 | 256 | ||
| 241 | // Compares code points by Canonical Combining Class order. | 257 | // Compares code points by Canonical Combining Class order. |
| 242 | fn cccLess(self: Self, lhs: u21, rhs: u21) bool { | 258 | fn cccLess(self: Normalize, lhs: u21, rhs: u21) bool { |
| 243 | return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs); | 259 | return self.ccc_data.ccc(lhs) < self.ccc_data.ccc(rhs); |
| 244 | } | 260 | } |
| 245 | 261 | ||
| 246 | // Applies the Canonical Sorting Algorithm. | 262 | // Applies the Canonical Sorting Algorithm. |
| 247 | fn canonicalSort(self: Self, cps: []u21) void { | 263 | fn canonicalSort(self: Normalize, cps: []u21) void { |
| 248 | var i: usize = 0; | 264 | var i: usize = 0; |
| 249 | while (i < cps.len) : (i += 1) { | 265 | while (i < cps.len) : (i += 1) { |
| 250 | const start: usize = i; | 266 | const start: usize = i; |
| 251 | while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} | 267 | while (i < cps.len and self.ccc_data.ccc(cps[i]) != 0) : (i += 1) {} |
| 252 | mem.sort(u21, cps[start..i], self, cccLess); | 268 | mem.sort(u21, cps[start..i], self, cccLess); |
| 253 | } | 269 | } |
| 254 | } | 270 | } |
| 255 | 271 | ||
| 256 | /// Normalize `str` to NFD. | 272 | /// Normalize `str` to NFD. |
| 257 | pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 273 | pub fn nfd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 258 | return self.nfxd(allocator, str, .nfd); | 274 | return self.nfxd(allocator, str, .nfd); |
| 259 | } | 275 | } |
| 260 | 276 | ||
| 261 | /// Normalize `str` to NFKD. | 277 | /// Normalize `str` to NFKD. |
| 262 | pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 278 | pub fn nfkd(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 263 | return self.nfxd(allocator, str, .nfkd); | 279 | return self.nfxd(allocator, str, .nfkd); |
| 264 | } | 280 | } |
| 265 | 281 | ||
| 266 | pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 { | 282 | pub fn nfxdCodePoints(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error![]u21 { |
| 267 | var dcp_list = std.ArrayList(u21).init(allocator); | 283 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 268 | defer dcp_list.deinit(); | 284 | defer dcp_list.deinit(); |
| 269 | 285 | ||
| @@ -284,7 +300,7 @@ pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, for | |||
| 284 | return try dcp_list.toOwnedSlice(); | 300 | return try dcp_list.toOwnedSlice(); |
| 285 | } | 301 | } |
| 286 | 302 | ||
| 287 | fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { | 303 | fn nfxd(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { |
| 288 | // Quick checks. | 304 | // Quick checks. |
| 289 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 305 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 290 | 306 | ||
| @@ -305,10 +321,8 @@ fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 305 | 321 | ||
| 306 | test "nfd ASCII / no-alloc" { | 322 | test "nfd ASCII / no-alloc" { |
| 307 | const allocator = testing.allocator; | 323 | const allocator = testing.allocator; |
| 308 | var data: NormData = undefined; | 324 | const n = try Normalize.init(allocator); |
| 309 | try NormData.init(&data, allocator); | 325 | defer n.deinit(allocator); |
| 310 | defer data.deinit(allocator); | ||
| 311 | const n = Self{ .norm_data = &data }; | ||
| 312 | 326 | ||
| 313 | const result = try n.nfd(allocator, "Hello World!"); | 327 | const result = try n.nfd(allocator, "Hello World!"); |
| 314 | defer result.deinit(allocator); | 328 | defer result.deinit(allocator); |
| @@ -318,10 +332,8 @@ test "nfd ASCII / no-alloc" { | |||
| 318 | 332 | ||
| 319 | test "nfd !ASCII / alloc" { | 333 | test "nfd !ASCII / alloc" { |
| 320 | const allocator = testing.allocator; | 334 | const allocator = testing.allocator; |
| 321 | var data: NormData = undefined; | 335 | const n = try Normalize.init(allocator); |
| 322 | try NormData.init(&data, allocator); | 336 | defer n.deinit(allocator); |
| 323 | defer data.deinit(allocator); | ||
| 324 | const n = Self{ .norm_data = &data }; | ||
| 325 | 337 | ||
| 326 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); | 338 | const result = try n.nfd(allocator, "Héllo World! \u{3d3}"); |
| 327 | defer result.deinit(allocator); | 339 | defer result.deinit(allocator); |
| @@ -331,10 +343,8 @@ test "nfd !ASCII / alloc" { | |||
| 331 | 343 | ||
| 332 | test "nfkd ASCII / no-alloc" { | 344 | test "nfkd ASCII / no-alloc" { |
| 333 | const allocator = testing.allocator; | 345 | const allocator = testing.allocator; |
| 334 | var data: NormData = undefined; | 346 | const n = try Normalize.init(allocator); |
| 335 | try NormData.init(&data, allocator); | 347 | defer n.deinit(allocator); |
| 336 | defer data.deinit(allocator); | ||
| 337 | const n = Self{ .norm_data = &data }; | ||
| 338 | 348 | ||
| 339 | const result = try n.nfkd(allocator, "Hello World!"); | 349 | const result = try n.nfkd(allocator, "Hello World!"); |
| 340 | defer result.deinit(allocator); | 350 | defer result.deinit(allocator); |
| @@ -344,10 +354,8 @@ test "nfkd ASCII / no-alloc" { | |||
| 344 | 354 | ||
| 345 | test "nfkd !ASCII / alloc" { | 355 | test "nfkd !ASCII / alloc" { |
| 346 | const allocator = testing.allocator; | 356 | const allocator = testing.allocator; |
| 347 | var data: NormData = undefined; | 357 | const n = try Normalize.init(allocator); |
| 348 | try NormData.init(&data, allocator); | 358 | defer n.deinit(allocator); |
| 349 | defer data.deinit(allocator); | ||
| 350 | const n = Self{ .norm_data = &data }; | ||
| 351 | 359 | ||
| 352 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); | 360 | const result = try n.nfkd(allocator, "Héllo World! \u{3d3}"); |
| 353 | defer result.deinit(allocator); | 361 | defer result.deinit(allocator); |
| @@ -356,10 +364,10 @@ test "nfkd !ASCII / alloc" { | |||
| 356 | } | 364 | } |
| 357 | 365 | ||
| 358 | pub fn nfdCodePoints( | 366 | pub fn nfdCodePoints( |
| 359 | self: Self, | 367 | self: Normalize, |
| 360 | allocator: mem.Allocator, | 368 | allocator: Allocator, |
| 361 | cps: []const u21, | 369 | cps: []const u21, |
| 362 | ) mem.Allocator.Error![]u21 { | 370 | ) Allocator.Error![]u21 { |
| 363 | var dcp_list = std.ArrayList(u21).init(allocator); | 371 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 364 | defer dcp_list.deinit(); | 372 | defer dcp_list.deinit(); |
| 365 | 373 | ||
| @@ -381,10 +389,10 @@ pub fn nfdCodePoints( | |||
| 381 | } | 389 | } |
| 382 | 390 | ||
| 383 | pub fn nfkdCodePoints( | 391 | pub fn nfkdCodePoints( |
| 384 | self: Self, | 392 | self: Normalize, |
| 385 | allocator: mem.Allocator, | 393 | allocator: Allocator, |
| 386 | cps: []const u21, | 394 | cps: []const u21, |
| 387 | ) mem.Allocator.Error![]u21 { | 395 | ) Allocator.Error![]u21 { |
| 388 | var dcp_list = std.ArrayList(u21).init(allocator); | 396 | var dcp_list = std.ArrayList(u21).init(allocator); |
| 389 | defer dcp_list.deinit(); | 397 | defer dcp_list.deinit(); |
| 390 | 398 | ||
| @@ -407,21 +415,21 @@ pub fn nfkdCodePoints( | |||
| 407 | 415 | ||
| 408 | // Composition (NFC, NFKC) | 416 | // Composition (NFC, NFKC) |
| 409 | 417 | ||
| 410 | fn isHangul(self: Self, cp: u21) bool { | 418 | fn isHangul(self: Normalize, cp: u21) bool { |
| 411 | return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none; | 419 | return cp >= 0x1100 and self.hangul_data.syllable(cp) != .none; |
| 412 | } | 420 | } |
| 413 | 421 | ||
| 414 | /// Normalizes `str` to NFC. | 422 | /// Normalizes `str` to NFC. |
| 415 | pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 423 | pub fn nfc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 416 | return self.nfxc(allocator, str, .nfc); | 424 | return self.nfxc(allocator, str, .nfc); |
| 417 | } | 425 | } |
| 418 | 426 | ||
| 419 | /// Normalizes `str` to NFKC. | 427 | /// Normalizes `str` to NFKC. |
| 420 | pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result { | 428 | pub fn nfkc(self: Normalize, allocator: Allocator, str: []const u8) Allocator.Error!Result { |
| 421 | return self.nfxc(allocator, str, .nfkc); | 429 | return self.nfxc(allocator, str, .nfkc); |
| 422 | } | 430 | } |
| 423 | 431 | ||
| 424 | fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result { | 432 | fn nfxc(self: Normalize, allocator: Allocator, str: []const u8, form: Form) Allocator.Error!Result { |
| 425 | // Quick checks. | 433 | // Quick checks. |
| 426 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; | 434 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 427 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; | 435 | if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str }; |
| @@ -446,7 +454,7 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 446 | block_check: while (i < dcps.len) : (i += 1) { | 454 | block_check: while (i < dcps.len) : (i += 1) { |
| 447 | const C = dcps[i]; | 455 | const C = dcps[i]; |
| 448 | if (C == tombstone) continue :block_check; | 456 | if (C == tombstone) continue :block_check; |
| 449 | const cc_C = self.norm_data.ccc_data.ccc(C); | 457 | const cc_C = self.ccc_data.ccc(C); |
| 450 | var starter_index: ?usize = null; | 458 | var starter_index: ?usize = null; |
| 451 | var j: usize = i; | 459 | var j: usize = i; |
| 452 | 460 | ||
| @@ -456,11 +464,11 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 456 | if (dcps[j] == tombstone) continue; | 464 | if (dcps[j] == tombstone) continue; |
| 457 | 465 | ||
| 458 | // Check for starter. | 466 | // Check for starter. |
| 459 | if (self.norm_data.ccc_data.isStarter(dcps[j])) { | 467 | if (self.ccc_data.isStarter(dcps[j])) { |
| 460 | // Check for blocking conditions. | 468 | // Check for blocking conditions. |
| 461 | for (dcps[(j + 1)..i]) |B| { | 469 | for (dcps[(j + 1)..i]) |B| { |
| 462 | if (B == tombstone) continue; | 470 | if (B == tombstone) continue; |
| 463 | const cc_B = self.norm_data.ccc_data.ccc(B); | 471 | const cc_B = self.ccc_data.ccc(B); |
| 464 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; | 472 | if (cc_B != 0 and self.isHangul(C)) continue :block_check; |
| 465 | if (cc_B >= cc_C) continue :block_check; | 473 | if (cc_B >= cc_C) continue :block_check; |
| 466 | } | 474 | } |
| @@ -484,8 +492,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 484 | // them algorithmically if possible. | 492 | // them algorithmically if possible. |
| 485 | if (self.isHangul(L) and self.isHangul(C)) { | 493 | if (self.isHangul(L) and self.isHangul(C)) { |
| 486 | // Get Hangul syllable types. | 494 | // Get Hangul syllable types. |
| 487 | const l_stype = self.norm_data.hangul_data.syllable(L); | 495 | const l_stype = self.hangul_data.syllable(L); |
| 488 | const c_stype = self.norm_data.hangul_data.syllable(C); | 496 | const c_stype = self.hangul_data.syllable(C); |
| 489 | 497 | ||
| 490 | if (l_stype == .LV and c_stype == .T) { | 498 | if (l_stype == .LV and c_stype == .T) { |
| 491 | // LV, T canonical composition. | 499 | // LV, T canonical composition. |
| @@ -508,13 +516,13 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 508 | if (!processed_hangul) { | 516 | if (!processed_hangul) { |
| 509 | // L, C are not Hangul, so check for primary composite | 517 | // L, C are not Hangul, so check for primary composite |
| 510 | // in the Unicode Character Database. | 518 | // in the Unicode Character Database. |
| 511 | if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| { | 519 | if (self.canon_data.toNfc(.{ L, C })) |P| { |
| 512 | // We have a primary composite P for L, C. | 520 | // We have a primary composite P for L, C. |
| 513 | // We must check if P is not in the Full | 521 | // We must check if P is not in the Full |
| 514 | // Composition Exclusions (FCX) list, | 522 | // Composition Exclusions (FCX) list, |
| 515 | // preventing it from appearing in any | 523 | // preventing it from appearing in any |
| 516 | // composed form (NFC, NFKC). | 524 | // composed form (NFC, NFKC). |
| 517 | if (!self.norm_data.normp_data.isFcx(P)) { | 525 | if (!self.normp_data.isFcx(P)) { |
| 518 | dcps[sidx] = P; | 526 | dcps[sidx] = P; |
| 519 | dcps[i] = tombstone; // Mark for deletion. | 527 | dcps[i] = tombstone; // Mark for deletion. |
| 520 | deleted += 1; | 528 | deleted += 1; |
| @@ -544,10 +552,8 @@ fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.A | |||
| 544 | 552 | ||
| 545 | test "nfc" { | 553 | test "nfc" { |
| 546 | const allocator = testing.allocator; | 554 | const allocator = testing.allocator; |
| 547 | var data: NormData = undefined; | 555 | const n = try Normalize.init(allocator); |
| 548 | try NormData.init(&data, allocator); | 556 | defer n.deinit(allocator); |
| 549 | defer data.deinit(allocator); | ||
| 550 | const n = Self{ .norm_data = &data }; | ||
| 551 | 557 | ||
| 552 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); | 558 | const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}"); |
| 553 | defer result.deinit(allocator); | 559 | defer result.deinit(allocator); |
| @@ -557,10 +563,8 @@ test "nfc" { | |||
| 557 | 563 | ||
| 558 | test "nfkc" { | 564 | test "nfkc" { |
| 559 | const allocator = testing.allocator; | 565 | const allocator = testing.allocator; |
| 560 | var data: NormData = undefined; | 566 | const n = try Normalize.init(allocator); |
| 561 | try NormData.init(&data, allocator); | 567 | defer n.deinit(allocator); |
| 562 | defer data.deinit(allocator); | ||
| 563 | const n = Self{ .norm_data = &data }; | ||
| 564 | 568 | ||
| 565 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); | 569 | const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}"); |
| 566 | defer result.deinit(allocator); | 570 | defer result.deinit(allocator); |
| @@ -569,7 +573,7 @@ test "nfkc" { | |||
| 569 | } | 573 | } |
| 570 | 574 | ||
| 571 | /// Tests for equality of `a` and `b` after normalizing to NFC. | 575 | /// Tests for equality of `a` and `b` after normalizing to NFC. |
| 572 | pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool { | 576 | pub fn eql(self: Normalize, allocator: Allocator, a: []const u8, b: []const u8) !bool { |
| 573 | const norm_result_a = try self.nfc(allocator, a); | 577 | const norm_result_a = try self.nfc(allocator, a); |
| 574 | defer norm_result_a.deinit(allocator); | 578 | defer norm_result_a.deinit(allocator); |
| 575 | const norm_result_b = try self.nfc(allocator, b); | 579 | const norm_result_b = try self.nfc(allocator, b); |
| @@ -580,10 +584,8 @@ pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) ! | |||
| 580 | 584 | ||
| 581 | test "eql" { | 585 | test "eql" { |
| 582 | const allocator = testing.allocator; | 586 | const allocator = testing.allocator; |
| 583 | var data: NormData = undefined; | 587 | const n = try Normalize.init(allocator); |
| 584 | try NormData.init(&data, allocator); | 588 | defer n.deinit(allocator); |
| 585 | defer data.deinit(allocator); | ||
| 586 | const n = Self{ .norm_data = &data }; | ||
| 587 | 589 | ||
| 588 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); | 590 | try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}")); |
| 589 | try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); | 591 | try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}")); |
| @@ -629,3 +631,24 @@ test "isLatin1Only" { | |||
| 629 | const not_latin1_only = "Héllo, World! \u{3d3}"; | 631 | const not_latin1_only = "Héllo, World! \u{3d3}"; |
| 630 | try testing.expect(!isLatin1Only(not_latin1_only)); | 632 | try testing.expect(!isLatin1Only(not_latin1_only)); |
| 631 | } | 633 | } |
| 634 | |||
| 635 | const std = @import("std"); | ||
| 636 | const debug = std.debug; | ||
| 637 | const assert = debug.assert; | ||
| 638 | const fmt = std.fmt; | ||
| 639 | const heap = std.heap; | ||
| 640 | const mem = std.mem; | ||
| 641 | const simd = std.simd; | ||
| 642 | const testing = std.testing; | ||
| 643 | const unicode = std.unicode; | ||
| 644 | const Allocator = std.mem.Allocator; | ||
| 645 | |||
| 646 | const ascii = @import("ascii"); | ||
| 647 | const CodePointIterator = @import("code_point").Iterator; | ||
| 648 | |||
| 649 | const CanonData = @import("CanonData"); | ||
| 650 | const CccData = @import("CombiningData"); | ||
| 651 | const CompatData = @import("CompatData"); | ||
| 652 | const FoldData = @import("FoldData"); | ||
| 653 | const HangulData = @import("HangulData"); | ||
| 654 | const NormPropsData = @import("NormPropsData"); | ||
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 3cb5df5..8b9069a 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -43,9 +43,8 @@ test "Unicode normalization tests" { | |||
| 43 | defer arena.deinit(); | 43 | defer arena.deinit(); |
| 44 | var allocator = arena.allocator(); | 44 | var allocator = arena.allocator(); |
| 45 | 45 | ||
| 46 | var norm_data: Normalize.NormData = undefined; | 46 | const n = try Normalize.init(allocator); |
| 47 | try Normalize.NormData.init(&norm_data, allocator); | 47 | defer n.deinit(allocator); |
| 48 | const n = Normalize{ .norm_data = &norm_data }; | ||
| 49 | 48 | ||
| 50 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); | 49 | var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{}); |
| 51 | defer file.close(); | 50 | defer file.close(); |