summaryrefslogtreecommitdiff
path: root/src/Normalizer.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-28 19:23:23 -0400
committerGravatar Jose Colon Rodriguez2024-02-28 19:23:23 -0400
commit7cad24f76a72f534084de64153f768699170cd05 (patch)
tree0a9eb3b4609a246046952c379ea5e92540623ab7 /src/Normalizer.zig
parentGeneral Category with GenCatData (diff)
downloadzg-7cad24f76a72f534084de64153f768699170cd05.tar.gz
zg-7cad24f76a72f534084de64153f768699170cd05.tar.xz
zg-7cad24f76a72f534084de64153f768699170cd05.zip
Using slices for decompositions in Normalizer
Diffstat (limited to 'src/Normalizer.zig')
-rw-r--r--src/Normalizer.zig185
1 files changed, 89 insertions, 96 deletions
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 26177ac..89cc50c 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -5,6 +5,7 @@
5const std = @import("std"); 5const std = @import("std");
6const testing = std.testing; 6const testing = std.testing;
7 7
8const ascii = @import("ascii");
8const CodePointIterator = @import("code_point").Iterator; 9const CodePointIterator = @import("code_point").Iterator;
9pub const NormData = @import("NormData"); 10pub const NormData = @import("NormData");
10 11
@@ -12,12 +13,6 @@ norm_data: *NormData,
12 13
13const Self = @This(); 14const Self = @This();
14 15
15// Hangul processing utilities.
16fn isHangulPrecomposed(self: Self, cp: u21) bool {
17 const kind = self.norm_data.hangul_data.syllable(cp);
18 return kind == .LV or kind == .LVT;
19}
20
21const SBase: u21 = 0xAC00; 16const SBase: u21 = 0xAC00;
22const LBase: u21 = 0x1100; 17const LBase: u21 = 0x1100;
23const VBase: u21 = 0x1161; 18const VBase: u21 = 0x1161;
@@ -28,17 +23,30 @@ const TCount: u21 = 28;
28const NCount: u21 = 588; // VCount * TCount 23const NCount: u21 = 588; // VCount * TCount
29const SCount: u21 = 11172; // LCount * NCount 24const SCount: u21 = 11172; // LCount * NCount
30 25
31fn decomposeHangul(cp: u21) [3]u21 { 26fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
27 const kind = self.norm_data.hangul_data.syllable(cp);
28 if (kind != .LV and kind != .LVT) return null;
29
32 const SIndex: u21 = cp - SBase; 30 const SIndex: u21 = cp - SBase;
33 const LIndex: u21 = SIndex / NCount; 31 const LIndex: u21 = SIndex / NCount;
34 const VIndex: u21 = (SIndex % NCount) / TCount; 32 const VIndex: u21 = (SIndex % NCount) / TCount;
35 const TIndex: u21 = SIndex % TCount; 33 const TIndex: u21 = SIndex % TCount;
36 const LPart: u21 = LBase + LIndex; 34 const LPart: u21 = LBase + LIndex;
37 const VPart: u21 = VBase + VIndex; 35 const VPart: u21 = VBase + VIndex;
38 var TPart: u21 = 0;
39 if (TIndex != 0) TPart = TBase + TIndex;
40 36
41 return [3]u21{ LPart, VPart, TPart }; 37 var dc = Decomp{ .form = .nfd };
38 buf[0] = LPart;
39 buf[1] = VPart;
40
41 if (TIndex == 0) {
42 dc.cps = buf[0..2];
43 return dc;
44 }
45
46 // TPart
47 buf[2] = TBase + TIndex;
48 dc.cps = buf[0..3];
49 return dc;
42} 50}
43 51
44fn composeHangulCanon(lv: u21, t: u21) u21 { 52fn composeHangulCanon(lv: u21, t: u21) u21 {
@@ -70,59 +78,59 @@ const Form = enum {
70}; 78};
71 79
72const Decomp = struct { 80const Decomp = struct {
73 form: Form = .nfd, 81 form: Form = .same,
74 cps: [18]u21 = [_]u21{0} ** 18, 82 cps: []const u21 = &.{},
75}; 83};
76 84
77/// `mapping` retrieves the decomposition mapping for a code point as per the UCD. 85/// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
78pub fn mapping(self: Self, cp: u21, form: Form) Decomp { 86pub fn mapping(self: Self, cp: u21, form: Form) Decomp {
79 std.debug.assert(form == .nfd or form == .nfkd); 87 var dc = Decomp{};
80 88
81 var dc = Decomp{ .form = .nfd }; 89 switch (form) {
82 const canon_dc = self.norm_data.canon_data.toNfd(cp); 90 .nfd => {
83 const len: usize = if (canon_dc[1] == 0) 1 else 2; 91 dc.cps = self.norm_data.canon_data.toNfd(cp);
84 92 if (dc.cps.len != 0) dc.form = .nfd;
85 if (len == 1 and canon_dc[0] == cp) { 93 },
86 dc.form = .same; 94
87 dc.cps[0] = cp; 95 .nfkd => {
88 } else { 96 dc.cps = self.norm_data.compat_data.toNfkd(cp);
89 @memcpy(dc.cps[0..len], canon_dc[0..len]); 97 if (dc.cps.len != 0) {
90 } 98 dc.form = .nfkd;
99 } else {
100 dc.cps = self.norm_data.canon_data.toNfd(cp);
101 if (dc.cps.len != 0) dc.form = .nfkd;
102 }
103 },
91 104
92 const compat_dc = self.norm_data.compat_data.toNfkd(cp); 105 else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."),
93 if (compat_dc.len != 0) {
94 if (form != .nfd) {
95 dc.form = .nfkd;
96 @memcpy(dc.cps[0..compat_dc.len], compat_dc);
97 }
98 } 106 }
99 107
100 return dc; 108 return dc;
101} 109}
102 110
103/// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`. 111/// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
104pub fn decompose(self: Self, cp: u21, form: Form) Decomp { 112pub fn decompose(
105 std.debug.assert(form == .nfd or form == .nfkd); 113 self: Self,
106 114 cp: u21,
107 var dc = Decomp{ .form = form }; 115 form: Form,
108 116 buf: []u21,
109 // ASCII or NFD / NFKD quick checks. 117) Decomp {
110 if (cp <= 127 or 118 // ASCII
111 (form == .nfd and self.norm_data.normp_data.isNfd(cp)) or 119 if (cp < 128) return .{};
112 (form == .nfkd and self.norm_data.normp_data.isNfkd(cp))) 120
113 { 121 // NFD / NFKD quick checks.
114 dc.cps[0] = cp; 122 switch (form) {
115 return dc; 123 .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
124 .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
125 else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
116 } 126 }
117 127
118 // Hangul precomposed syllable full decomposition. 128 // Hangul precomposed syllable full decomposition.
119 if (self.isHangulPrecomposed(cp)) { 129 if (self.decomposeHangul(cp, buf)) |dc| return dc;
120 const cps = decomposeHangul(cp);
121 @memcpy(dc.cps[0..cps.len], &cps);
122 return dc;
123 }
124 130
125 // Full decomposition. 131 // Full decomposition.
132 var dc = Decomp{ .form = form };
133
126 var result_index: usize = 0; 134 var result_index: usize = 0;
127 var work_index: usize = 1; 135 var work_index: usize = 1;
128 136
@@ -137,27 +145,24 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
137 145
138 // No more of decompositions for this code point. 146 // No more of decompositions for this code point.
139 if (m.form == .same) { 147 if (m.form == .same) {
140 dc.cps[result_index] = m.cps[0]; 148 buf[result_index] = next;
141 result_index += 1; 149 result_index += 1;
142 continue; 150 continue;
143 } 151 }
144 152
145 // Find last index of decomposition.
146 const m_last = for (m.cps, 0..) |mcp, i| {
147 if (mcp == 0) break i;
148 } else m.cps.len;
149
150 // Work backwards through decomposition. 153 // Work backwards through decomposition.
151 // `i` starts at 1 because m_last is 1 past the last code point. 154 // `i` starts at 1 because m_last is 1 past the last code point.
152 var i: usize = 1; 155 var i: usize = 1;
153 while (i <= m_last) : ({ 156 while (i <= m.cps.len) : ({
154 i += 1; 157 i += 1;
155 work_index += 1; 158 work_index += 1;
156 }) { 159 }) {
157 work[work_index] = m.cps[m_last - i]; 160 work[work_index] = m.cps[m.cps.len - i];
158 } 161 }
159 } 162 }
160 163
164 dc.cps = buf[0..result_index];
165
161 return dc; 166 return dc;
162} 167}
163 168
@@ -167,58 +172,45 @@ test "decompose" {
167 defer data.deinit(); 172 defer data.deinit();
168 var n = Self{ .norm_data = &data }; 173 var n = Self{ .norm_data = &data };
169 174
170 var dc = n.decompose('é', .nfd); 175 var buf: [18]u21 = undefined;
176
177 var dc = n.decompose('é', .nfd, &buf);
171 try std.testing.expect(dc.form == .nfd); 178 try std.testing.expect(dc.form == .nfd);
172 try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]); 179 try std.testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
173 180
174 dc = n.decompose('\u{1e0a}', .nfd); 181 dc = n.decompose('\u{1e0a}', .nfd, &buf);
175 try std.testing.expect(dc.form == .nfd); 182 try std.testing.expect(dc.form == .nfd);
176 try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); 183 try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
177 184
178 dc = n.decompose('\u{1e0a}', .nfkd); 185 dc = n.decompose('\u{1e0a}', .nfkd, &buf);
179 try std.testing.expect(dc.form == .nfkd); 186 try std.testing.expect(dc.form == .nfkd);
180 try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]); 187 try std.testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
181 188
182 dc = n.decompose('\u{3189}', .nfd); 189 dc = n.decompose('\u{3189}', .nfd, &buf);
183 try std.testing.expect(dc.form == .nfd); 190 try std.testing.expect(dc.form == .same);
184 try std.testing.expectEqualSlices(u21, &[_]u21{'\u{3189}'}, dc.cps[0..1]); 191 try std.testing.expect(dc.cps.len == 0);
185 192
186 dc = n.decompose('\u{3189}', .nfkd); 193 dc = n.decompose('\u{3189}', .nfkd, &buf);
187 try std.testing.expect(dc.form == .nfkd); 194 try std.testing.expect(dc.form == .nfkd);
188 try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]); 195 try std.testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
189 196
190 dc = n.decompose('\u{ace1}', .nfd); 197 dc = n.decompose('\u{ace1}', .nfd, &buf);
191 try std.testing.expect(dc.form == .nfd); 198 try std.testing.expect(dc.form == .nfd);
192 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); 199 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
193 200
194 dc = n.decompose('\u{ace1}', .nfkd); 201 dc = n.decompose('\u{ace1}', .nfkd, &buf);
195 try std.testing.expect(dc.form == .nfkd); 202 try std.testing.expect(dc.form == .nfd);
196 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]); 203 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
197 204
198 dc = n.decompose('\u{3d3}', .nfd); 205 dc = n.decompose('\u{3d3}', .nfd, &buf);
199 try std.testing.expect(dc.form == .nfd); 206 try std.testing.expect(dc.form == .nfd);
200 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]); 207 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
201 208
202 dc = n.decompose('\u{3d3}', .nfkd); 209 dc = n.decompose('\u{3d3}', .nfkd, &buf);
203 try std.testing.expect(dc.form == .nfkd); 210 try std.testing.expect(dc.form == .nfkd);
204 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]); 211 try std.testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
205} 212}
206 213
207// Some quick checks.
208
209fn onlyAscii(str: []const u8) bool {
210 return for (str) |b| {
211 if (b > 127) break false;
212 } else true;
213}
214
215fn onlyLatin1(str: []const u8) bool {
216 var cp_iter = CodePointIterator{ .bytes = str };
217 return while (cp_iter.next()) |cp| {
218 if (cp.code > 256) break false;
219 } else true;
220}
221
222/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory. 214/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
223pub const Result = struct { 215pub const Result = struct {
224 allocator: ?std.mem.Allocator = null, 216 allocator: ?std.mem.Allocator = null,
@@ -256,18 +248,21 @@ pub fn nfkd(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
256 248
257fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { 249fn nfxd(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result {
258 // Quick checks. 250 // Quick checks.
259 if (onlyAscii(str)) return Result{ .slice = str }; 251 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
260 252
261 var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len + str.len / 2); 253 var dcp_list = try std.ArrayList(u21).initCapacity(allocator, str.len * 3);
262 defer dcp_list.deinit(); 254 defer dcp_list.deinit();
263 255
264 var cp_iter = CodePointIterator{ .bytes = str }; 256 var cp_iter = CodePointIterator{ .bytes = str };
257 var dc_buf: [18]u21 = undefined;
258
265 while (cp_iter.next()) |cp| { 259 while (cp_iter.next()) |cp| {
266 const dc = self.decompose(cp.code, form); 260 const dc = self.decompose(cp.code, form, &dc_buf);
267 const slice = for (dc.cps, 0..) |dcp, i| { 261 if (dc.form == .same) {
268 if (dcp == 0) break dc.cps[0..i]; 262 try dcp_list.append(cp.code);
269 } else dc.cps[0..]; 263 } else {
270 try dcp_list.appendSlice(slice); 264 try dcp_list.appendSlice(dc.cps);
265 }
271 } 266 }
272 267
273 self.canonicalSort(dcp_list.items); 268 self.canonicalSort(dcp_list.items);
@@ -354,8 +349,7 @@ pub fn nfkc(self: Self, allocator: std.mem.Allocator, str: []const u8) !Result {
354 349
355fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result { 350fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !Result {
356 // Quick checks. 351 // Quick checks.
357 if (onlyAscii(str)) return Result{ .slice = str }; 352 if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
358 if (form == .nfc and onlyLatin1(str)) return Result{ .slice = str };
359 353
360 // Decompose first. 354 // Decompose first.
361 var d_result = if (form == .nfc) 355 var d_result = if (form == .nfc)
@@ -522,15 +516,14 @@ test "eql" {
522// FCD 516// FCD
523fn getLeadCcc(self: Self, cp: u21) u8 { 517fn getLeadCcc(self: Self, cp: u21) u8 {
524 const dc = self.mapping(cp, .nfd); 518 const dc = self.mapping(cp, .nfd);
525 return self.norm_data.ccc_data.ccc(dc.cps[0]); 519 const dcp = if (dc.form == .same) cp else dc.cps[0];
520 return self.norm_data.ccc_data.ccc(dcp);
526} 521}
527 522
528fn getTrailCcc(self: Self, cp: u21) u8 { 523fn getTrailCcc(self: Self, cp: u21) u8 {
529 const dc = self.mapping(cp, .nfd); 524 const dc = self.mapping(cp, .nfd);
530 const len = for (dc.cps, 0..) |dcp, i| { 525 const dcp = if (dc.form == .same) cp else dc.cps[dc.cps.len - 1];
531 if (dcp == 0) break i; 526 return self.norm_data.ccc_data.ccc(dcp);
532 } else dc.cps.len;
533 return self.norm_data.ccc_data.ccc(dc.cps[len - 1]);
534} 527}
535 528
536/// Fast check to detect if a string is already in NFC or NFD form. 529/// Fast check to detect if a string is already in NFC or NFD form.