diff options
| author | 2024-02-28 19:23:23 -0400 | |
|---|---|---|
| committer | 2024-02-28 19:23:23 -0400 | |
| commit | 7cad24f76a72f534084de64153f768699170cd05 (patch) | |
| tree | 0a9eb3b4609a246046952c379ea5e92540623ab7 /src/Normalizer.zig | |
| parent | General Category with GenCatData (diff) | |
| download | zg-7cad24f76a72f534084de64153f768699170cd05.tar.gz zg-7cad24f76a72f534084de64153f768699170cd05.tar.xz zg-7cad24f76a72f534084de64153f768699170cd05.zip | |
Using slices for decompositions in Normalizer
Diffstat (limited to 'src/Normalizer.zig')
| -rw-r--r-- | src/Normalizer.zig | 185 |
1 files changed, 89 insertions, 96 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig index 26177ac..89cc50c 100644 --- a/src/Normalizer.zig +++ b/src/Normalizer.zig | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | const std = @import("std"); | 5 | const std = @import("std"); |
| 6 | const testing = std.testing; | 6 | const testing = std.testing; |
| 7 | 7 | ||
| 8 | const ascii = @import("ascii"); | ||
| 8 | const CodePointIterator = @import("code_point").Iterator; | 9 | const CodePointIterator = @import("code_point").Iterator; |
| 9 | pub const NormData = @import("NormData"); | 10 | pub const NormData = @import("NormData"); |
| 10 | 11 | ||
| @@ -12,12 +13,6 @@ norm_data: *NormData, | |||
| 12 | 13 | ||
| 13 | const Self = @This(); | 14 | const Self = @This(); |
| 14 | 15 | ||
| 15 | // Hangul processing utilities. | ||
| 16 | fn isHangulPrecomposed(self: Self, cp: u21) bool { | ||
| 17 | const kind = self.norm_data.hangul_data.syllable(cp); | ||
| 18 | return kind == .LV or kind == .LVT; | ||
| 19 | } | ||
| 20 | |||
| 21 | const SBase: u21 = 0xAC00; | 16 | const SBase: u21 = 0xAC00; |
| 22 | const LBase: u21 = 0x1100; | 17 | const LBase: u21 = 0x1100; |
| 23 | const VBase: u21 = 0x1161; | 18 | const VBase: u21 = 0x1161; |
| @@ -28,17 +23,30 @@ const TCount: u21 = 28; | |||
| 28 | const NCount: u21 = 588; // VCount * TCount | 23 | const NCount: u21 = 588; // VCount * TCount |
| 29 | const SCount: u21 = 11172; // LCount * NCount | 24 | const SCount: u21 = 11172; // LCount * NCount |
| 30 | 25 | ||
| 31 | fn decomposeHangul(cp: u21) [3]u21 { | 26 | fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp { |
| 27 | const kind = self.norm_data.hangul_data.syllable(cp); | ||
| 28 | if (kind != .LV and kind != .LVT) return null; | ||
| 29 | |||
| 32 | const SIndex: u21 = cp - SBase; | 30 | const SIndex: u21 = cp - SBase; |
| 33 | const LIndex: u21 = SIndex / NCount; | 31 | const LIndex: u21 = SIndex / NCount; |
| 34 | const VIndex: u21 = (SIndex % NCount) / TCount; | 32 | const VIndex: u21 = (SIndex % NCount) / TCount; |
| 35 | const TIndex: u21 = SIndex % TCount; | 33 | const TIndex: u21 = SIndex % TCount; |
| 36 | const LPart: u21 = LBase + LIndex; | 34 | const LPart: u21 = LBase + LIndex; |
| 37 | const VPart: u21 = VBase + VIndex; | 35 | const VPart: u21 = VBase + VIndex; |
| 38 | var TPart: u21 = 0; | ||
| 39 | if (TIndex != 0) TPart = TBase + TIndex; | ||
| 40 | 36 | ||
| 41 | return [3]u21{ LPart, VPart, TPart }; | 37 | var dc = Decomp{ .form = .nfd }; |
| 38 | buf[0] = LPart; | ||
| 39 | buf[1] = VPart; | ||
| 40 | |||
| 41 | if (TIndex == 0) { | ||
| 42 | dc.cps = buf[0..2]; | ||
| 43 | return dc; | ||
| 44 | } | ||
| 45 | |||
| 46 | // TPart | ||
| 47 | buf[2] = TBase + TIndex; | ||
| 48 | dc.cps = buf[0..3]; | ||
| 49 | return dc; | ||
| 42 | } | 50 | } |
| 43 | 51 | ||
| 44 | fn composeHangulCanon(lv: u21, t: u21) u21 { | 52 | fn composeHangulCanon(lv: u21, t: u21) u21 { |
| @@ -70,59 +78,59 @@ const Form = enum { | |||
| 70 | }; | 78 | }; |
| 71 | 79 | ||
| 72 | const Decomp = struct { | 80 | const Decomp = struct { |
| 73 | form: Form = .nfd, | 81 | form: Form = .same, |
| 74 | cps: [18]u21 = [_]u21{0} ** 18, | 82 | cps: []const u21 = &.{}, |
| 75 | }; | 83 | }; |
| 76 | 84 | ||
| 77 | /// `mapping` retrieves the decomposition mapping for a code point as per the UCD. | 85 | /// `mapping` retrieves the decomposition mapping for a code point as per the UCD. |
| 78 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { | 86 | pub fn mapping(self: Self, cp: u21, form: Form) Decomp { |
| 79 | std.debug.assert(form == .nfd or form == .nfkd); | 87 | var dc = Decomp{}; |
| 80 | 88 | ||
| 81 | var dc = Decomp{ .form = .nfd }; | 89 | switch (form) { |
| 82 | const canon_dc = self.norm_data.canon_data.toNfd(cp); | 90 | .nfd => { |
| 83 | const len: usize = if (canon_dc[1] == 0) 1 else 2; | 91 | dc.cps = self.norm_data.canon_data.toNfd(cp); |
| 84 | 92 | if (dc.cps.len != 0) dc.form = .nfd; | |
| 85 | if (len == 1 and canon_dc[0] == cp) { | 93 | }, |
| 86 | dc.form = .same; | 94 | |
| 87 | dc.cps[0] = cp; | 95 | .nfkd => { |
| 88 | } else { | 96 | dc.cps = self.norm_data.compat_data.toNfkd(cp); |
| 89 | @memcpy(dc.cps[0..len], canon_dc[0..len]); | 97 | if (dc.cps.len != 0) { |
| 90 | } | 98 | dc.form = .nfkd; |
| 99 | } else { | ||
| 100 | dc.cps = self.norm_data.canon_data.toNfd(cp); | ||
| 101 | if (dc.cps.len != 0) dc.form = .nfkd; | ||
| 102 | } | ||
| 103 | }, | ||
| 91 | 104 | ||
| 92 | const compat_dc = self.norm_data.compat_data.toNfkd(cp); | 105 | else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."), |
| 93 | if (compat_dc.len != 0) { | ||
| 94 | if (form != .nfd) { | ||
| 95 | dc.form = .nfkd; | ||
| 96 | @memcpy(dc.cps[0..compat_dc.len], compat_dc); | ||
| 97 | } | ||
| 98 | } | 106 | } |
| 99 | 107 | ||
| 100 | return dc; | 108 | return dc; |
| 101 | } | 109 | } |
| 102 | 110 | ||
| 103 | /// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. | 111 | /// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. |
| 104 | pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | 112 | pub fn decompose( |
| 105 | std.debug.assert(form == .nfd or form == .nfkd); | 113 | self: Self, |
| 106 | 114 | cp: u21, | |
| 107 | var dc = Decomp{ .form = form }; | 115 | form: Form, |
| 108 | 116 | buf: []u21, | |
| 109 | // ASCII or NFD / NFKD quick checks. | 117 | ) Decomp { |
| 110 | if (cp <= 127 or | 118 | // ASCII |
| 111 | (form == .nfd and self.norm_data.normp_data.isNfd(cp)) or | 119 | if (cp < 128) return .{}; |
| 112 | (form == .nfkd and self.norm_data.normp_data.isNfkd(cp))) | 120 | |
| 113 | { | 121 | // NFD / NFKD quick checks. |
| 114 | dc.cps[0] = cp; | 122 | switch (form) { |
| 115 | return dc; | 123 | .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{}, |
| 124 | .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{}, | ||
| 125 | else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."), | ||
| 116 | } | 126 | } |
| 117 | 127 | ||
| 118 | // Hangul precomposed syllable full decomposition. | 128 | // Hangul precomposed syllable full decomposition. |
| 119 | if (self.isHangulPrecomposed(cp)) { | 129 | if (self.decomposeHangul(cp, buf)) |dc| return dc; |
| 120 | const cps = decomposeHangul(cp); | ||
| 121 | @memcpy(dc.cps[0..cps.len], &cps); | ||
| 122 | return dc; | ||
| 123 | } | ||
| 124 | 130 | ||
| 125 | // Full decomposition. | 131 | // Full decomposition. |
| 132 | var dc = Decomp{ .form = form }; | ||
| 133 | |||
| 126 | var result_index: usize = 0; | 134 | var result_index: usize = 0; |
| 127 | var work_index: usize = 1; | 135 | var work_index: usize = 1; |
| 128 | 136 | ||
| @@ -137,27 +145,24 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp { | |||
| 137 | 145 | ||
| 138 | // No more of decompositions for this code point. | 146 | // No more of decompositions for this code point. |
| 139 | if (m.form == .same) { | 147 | if (m.form == .same) { |
| 140 | dc.cps[result_index] = m.cps[0]; | 148 | buf[result_index] = next; |
| 141 | result_index += 1; | 149 | result_index += 1; |
| 142 | continue; | 150 | continue; |
| 143 | } | 151 | } |
| 144 | 152 | ||
| 145 | // Find last index of decomposition. | ||
| 146 | const m_last = for (m.cps, 0..) |mcp, i| { | ||
| 147 | if (mcp == 0) break i; | ||
| 148 | } else m.cps.len; | ||
| 149 | |||
| 150 | // Work backwards through decomposition. | 153 | // Work backwards through decomposition. |
| 151 | // `i` starts at 1 because m_last is 1 past the last code point. | 154 | // `i` starts at 1 because m_last is 1 past the last code point. |
| 152 | var i: usize = 1; | 155 | var i: usize = 1; |
| 153 | while (i <= m_last) : ({ | 156 | while (i <= m.cps.len) : ({ |
| 154 | i += 1; | 157 | i += 1; |
| 155 | work_index += 1; | 158 | work_index += 1; |
| 156 | }) { | 159 | }) { |
| 157 | work[work_index] = m.cps[m_last - i]; | 160 | work[work_index] = m.cps[m.cps.len - i]; |
| 158 | } | 161 | } |
| 159 | } | 162 | } |
| 160 | 163 | ||
| 164 | dc.cps = buf[0..result_index]; | ||
| 165 | |||
| 161 | return dc; | 166 | return dc; |
| 162 | } | 167 | } |
| 163 | 168 | ||
| @@ -167,58 +172,45 @@ test "decompose" { | |||
| 167 | defer data.deinit(); | 172 | defer data.deinit(); |
| 168 | var n = Self{ .norm_data = &data }; | 173 | var n = Self{ .norm_data = &data }; |
| 169 | 174 | ||
| 170 | var dc = n.decompose('é', .nfd); | 175 | var buf: [18]u21 = undefined; |
| 176 | |||
| 177 | var dc = n.decompose('é', .nfd, &buf); | ||
| 171 | try std.testing.expect(dc.form == .nfd); | 178 | try std.testing.expect(dc.form == .nfd); |
| 172 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); | 179 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); |
| 173 | 180 | ||
| 174 | dc = n.decompose('\u{1e0a}', .nfd); | 181 | dc = n.decompose('\u{1e0a}', .nfd, &buf); |
| 175 | try std.testing.expect(dc.form == .nfd); | 182 | try std.testing.expect(dc.form == .nfd); |
| 176 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | 183 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); |
| 177 | 184 | ||
| 178 | dc = n.decompose('\u{1e0a}', .nfkd); | 185 | dc = n.decompose('\u{1e0a}', .nfkd, &buf); |
| 179 | try std.testing.expect(dc.form == .nfkd); | 186 | try std.testing.expect(dc.form == .nfkd); |
| 180 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); | 187 | try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); |
| 181 | 188 | ||
| 182 | dc = n.decompose('\u{3189}', .nfd); | 189 | dc = n.decompose('\u{3189}', .nfd, &buf); |
| 183 | try std.testing.expect(dc.form == .nfd); | 190 | try std.testing.expect(dc.form == .same); |
| 184 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]); | 191 | try std.testing.expect(dc.cps.len == 0); |
| 185 | 192 | ||
| 186 | dc = n.decompose('\u{3189}', .nfkd); | 193 | dc = n.decompose('\u{3189}', .nfkd, &buf); |
| 187 | try std.testing.expect(dc.form == .nfkd); | 194 | try std.testing.expect(dc.form == .nfkd); |
| 188 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); | 195 | try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); |
| 189 | 196 | ||
| 190 | dc = n.decompose('\u{ace1}', .nfd); | 197 | dc = n.decompose('\u{ace1}', .nfd, &buf); |
| 191 | try std.testing.expect(dc.form == .nfd); | 198 | try std.testing.expect(dc.form == .nfd); |
| 192 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | 199 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); |
| 193 | 200 | ||
| 194 | dc = n.decompose('\u{ace1}', .nfkd); | 201 | dc = n.decompose('\u{ace1}', .nfkd, &buf); |
| 195 | try std.testing.expect(dc.form == .nfkd); | 202 | try std.testing.expect(dc.form == .nfd); |
| 196 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); | 203 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); |
| 197 | 204 | ||
| 198 | dc = n.decompose('\u{3d3}', .nfd); | 205 | dc = n.decompose('\u{3d3}', .nfd, &buf); |
| 199 | try std.testing.expect(dc.form == .nfd); | 206 | try std.testing.expect(dc.form == .nfd); |
| 200 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); | 207 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); |
| 201 | 208 | ||
| 202 | dc = n.decompose('\u{3d3}', .nfkd); | 209 | dc = n.decompose('\u{3d3}', .nfkd, &buf); |
| 203 | try std.testing.expect(dc.form == .nfkd); | 210 | try std.testing.expect(dc.form == .nfkd); |
| 204 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); | 211 | try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); |
| 205 | } | 212 | } |
| 206 | 213 | ||
| 207 | // Some quick checks. | ||
| 208 | |||
| 209 | fn onlyAscii(str: []const u8) bool { | ||
| 210 | return for (str) |b| { | ||
| 211 | if (b > 127) break false; | ||
| 212 | } else true; | ||
| 213 | } | ||
| 214 | |||
| 215 | fn onlyLatin1(str: []const u8) bool { | ||
| 216 | var cp_iter = CodePointIterator{ .bytes = str }; | ||
| 217 | return while (cp_iter.next()) |cp| { | ||
| 218 | if (cp.code > 256) break false; | ||
| 219 | } else true; | ||
| 220 | } | ||
| 221 | |||
| 222 | /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. | 214 | /// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. |
| 223 | pub const Result = struct { | 215 | pub const Result = struct { |
| 224 | allocator: ?std.mem.Allocator = null, | 216 | allocator: ?std.mem.Allocator = null, |
| @@ -256,18 +248,21 @@ pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | |||
| 256 | 248 | ||
| 257 | fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | 249 | fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { |
| 258 | // Quick checks. | 250 | // Quick checks. |
| 259 | if (onlyAscii(str)) return Result{ .slice = str }; | 251 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 260 | 252 | ||
| 261 | var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2); | 253 | var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len * 3); |
| 262 | defer dcp_list.deinit(); | 254 | defer dcp_list.deinit(); |
| 263 | 255 | ||
| 264 | var cp_iter = CodePointIterator{ .bytes = str }; | 256 | var cp_iter = CodePointIterator{ .bytes = str }; |
| 257 | var dc_buf: [18]u21 = undefined; | ||
| 258 | |||
| 265 | while (cp_iter.next()) |cp| { | 259 | while (cp_iter.next()) |cp| { |
| 266 | const dc = self.decompose(cp.code, form); | 260 | const dc = self.decompose(cp.code, form, &dc_buf); |
| 267 | const slice = for (dc.cps, 0..) |dcp, i| { | 261 | if (dc.form == .same) { |
| 268 | if (dcp == 0) break dc.cps[0..i]; | 262 | try dcp_list.append(cp.code); |
| 269 | } else dc.cps[0..]; | 263 | } else { |
| 270 | try dcp_list.appendSlice(slice); | 264 | try dcp_list.appendSlice(dc.cps); |
| 265 | } | ||
| 271 | } | 266 | } |
| 272 | 267 | ||
| 273 | self.canonicalSort(dcp_list.items); | 268 | self.canonicalSort(dcp_list.items); |
| @@ -354,8 +349,7 @@ pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result { | |||
| 354 | 349 | ||
| 355 | fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { | 350 | fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { |
| 356 | // Quick checks. | 351 | // Quick checks. |
| 357 | if (onlyAscii(str)) return Result{ .slice = str }; | 352 | if (ascii.isAsciiOnly(str)) return Result{ .slice = str }; |
| 358 | if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str }; | ||
| 359 | 353 | ||
| 360 | // Decompose first. | 354 | // Decompose first. |
| 361 | var d_result = if (form == .nfc) | 355 | var d_result = if (form == .nfc) |
| @@ -522,15 +516,14 @@ test "eql" { | |||
| 522 | // FCD | 516 | // FCD |
| 523 | fn getLeadCcc(self: Self, cp: u21) u8 { | 517 | fn getLeadCcc(self: Self, cp: u21) u8 { |
| 524 | const dc = self.mapping(cp, .nfd); | 518 | const dc = self.mapping(cp, .nfd); |
| 525 | return self.norm_data.ccc_data.ccc(dc.cps[0]); | 519 | const dcp = if (dc.form == .same) cp else dc.cps[0]; |
| 520 | return self.norm_data.ccc_data.ccc(dcp); | ||
| 526 | } | 521 | } |
| 527 | 522 | ||
| 528 | fn getTrailCcc(self: Self, cp: u21) u8 { | 523 | fn getTrailCcc(self: Self, cp: u21) u8 { |
| 529 | const dc = self.mapping(cp, .nfd); | 524 | const dc = self.mapping(cp, .nfd); |
| 530 | const len = for (dc.cps, 0..) |dcp, i| { | 525 | const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1]; |
| 531 | if (dcp == 0) break i; | 526 | return self.norm_data.ccc_data.ccc(dcp); |
| 532 | } else dc.cps.len; | ||
| 533 | return self.norm_data.ccc_data.ccc(dc.cps[len - 1]); | ||
| 534 | } | 527 | } |
| 535 | 528 | ||
| 536 | /// Fast check to detect if a string is already in NFC or NFD form. | 529 | /// Fast check to detect if a string is already in NFC or NFD form. |