summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Jose Colon Rodriguez, 2024-02-27 09:26:40 -0400
committer: Jose Colon Rodriguez, 2024-02-27 09:26:40 -0400
commit: 32c68059a05dde8a57a330db6d14a32506081516 (patch)
tree: c2b3b9bbbf48330db3570135d371cb92b552f1cb /src
parent: Using NormData nfkd (diff)
download: zg-32c68059a05dde8a57a330db6d14a32506081516.tar.gz
zg-32c68059a05dde8a57a330db6d14a32506081516.tar.xz
zg-32c68059a05dde8a57a330db6d14a32506081516.zip
Using HangulData in NormData
Diffstat (limited to 'src')
-rw-r--r-- src/HangulData.zig 52
-rw-r--r-- src/NormData.zig 4
-rw-r--r-- src/Normalizer.zig 23
3 files changed, 67 insertions, 12 deletions
diff --git a/src/HangulData.zig b/src/HangulData.zig
new file mode 100644
index 0000000..4d80c99
--- /dev/null
+++ b/src/HangulData.zig
@@ -0,0 +1,52 @@
1const std = @import("std");
2const builtin = @import("builtin");
3const compress = std.compress;
4const mem = std.mem;
5const testing = std.testing;
6
/// Hangul syllable type of a code point (the names follow the Unicode
/// `Hangul_Syllable_Type` property values).
///
/// The integer value of each tag is the byte stored in the compressed
/// `hangul` table and decoded by `init` with `@enumFromInt`, so the values
/// below must stay in sync with that data file.
pub const Syllable = enum(u3) {
    /// The code point has no Hangul syllable type.
    none = 0,
    /// Leading consonant jamo.
    L = 1,
    /// Precomposed syllable made of a leading consonant and a vowel.
    LV = 2,
    /// Precomposed syllable made of a leading consonant, vowel and trailing consonant.
    LVT = 3,
    /// Vowel jamo.
    V = 4,
    /// Trailing consonant jamo.
    T = 5,
};
15
/// Allocator that owns `s1` and `s2`; `deinit` uses it to release them.
allocator: mem.Allocator,
/// Stage-1 table: indexed by `cp >> 8`, each entry is a base offset into `s2`.
s1: []u16 = undefined,
/// Stage-2 table: syllable types, indexed by `s1[cp >> 8] + (cp & 0xff)`.
s2: []Syllable = undefined,

const Self = @This();
21
/// Builds the two-stage Hangul syllable-type lookup table by inflating the
/// embedded `hangul` data file.
///
/// Stream layout, read in the target CPU's native byte order:
///   - u16 stage-1 length, followed by that many u16 entries;
///   - u16 stage-2 length, followed by that many u8 entries, each of which
///     is the integer value of a `Syllable` tag.
///
/// The caller owns the returned value and must call `deinit` to free both
/// tables. Returns an error when an allocation fails or when reading the
/// decompressed stream fails; in that case nothing allocated here is left
/// behind.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.deflate.decompressor;
    const in_bytes = @embedFile("hangul");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = try decompressor(allocator, in_fbs.reader(), null);
    defer in_decomp.deinit();
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    // Every `try` below can still fail; release stage 1 instead of leaking it.
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(Syllable, stage_2_len);
    // Same for stage 2 if filling it fails part-way through.
    errdefer allocator.free(self.s2);
    for (self.s2) |*entry| entry.* = @enumFromInt(try reader.readInt(u8, endian));

    return self;
}
43
/// Frees both lookup tables using the allocator that was stored by `init`.
/// The tables must not be accessed after this call.
pub fn deinit(self: *Self) void {
    const owner = self.allocator;
    owner.free(self.s2);
    owner.free(self.s1);
}
48
/// Looks up the Hangul syllable type of `cp` in the two-stage table.
///
/// The high bits (`cp >> 8`) select a stage-1 entry, which is the start
/// offset of a block in stage 2; the low byte of `cp` selects the entry
/// within that block.
pub inline fn syllable(self: Self, cp: u21) Syllable {
    const block_start: usize = self.s1[cp >> 8];
    const offset_in_block: usize = cp & 0xff;
    return self.s2[block_start + offset_in_block];
}
diff --git a/src/NormData.zig b/src/NormData.zig
index 83110f0..8923382 100644
--- a/src/NormData.zig
+++ b/src/NormData.zig
@@ -4,10 +4,12 @@ const mem = std.mem;
4const CanonData = @import("CanonData"); 4const CanonData = @import("CanonData");
5const CccData = @import("CombiningData"); 5const CccData = @import("CombiningData");
6const CompatData = @import("CompatData"); 6const CompatData = @import("CompatData");
7const HangulData = @import("HangulData");
7 8
8canon_data: CanonData, 9canon_data: CanonData,
9ccc_data: CccData, 10ccc_data: CccData,
10compat_data: CompatData, 11compat_data: CompatData,
12hangul_data: HangulData,
11 13
12const Self = @This(); 14const Self = @This();
13 15
@@ -16,6 +18,7 @@ pub fn init(allocator: std.mem.Allocator) !Self {
16 .canon_data = try CanonData.init(allocator), 18 .canon_data = try CanonData.init(allocator),
17 .ccc_data = try CccData.init(allocator), 19 .ccc_data = try CccData.init(allocator),
18 .compat_data = try CompatData.init(allocator), 20 .compat_data = try CompatData.init(allocator),
21 .hangul_data = try HangulData.init(allocator),
19 }; 22 };
20} 23}
21 24
@@ -23,4 +26,5 @@ pub fn deinit(self: *Self) void {
23 self.canon_data.deinit(); 26 self.canon_data.deinit();
24 self.ccc_data.deinit(); 27 self.ccc_data.deinit();
25 self.compat_data.deinit(); 28 self.compat_data.deinit();
29 self.hangul_data.deinit();
26} 30}
diff --git a/src/Normalizer.zig b/src/Normalizer.zig
index 1434043..0670cae 100644
--- a/src/Normalizer.zig
+++ b/src/Normalizer.zig
@@ -7,7 +7,6 @@ const testing = std.testing;
7 7
8const CodePointIterator = @import("code_point").Iterator; 8const CodePointIterator = @import("code_point").Iterator;
9const case_fold_map = @import("ziglyph").case_folding; 9const case_fold_map = @import("ziglyph").case_folding;
10const hangul_map = @import("ziglyph").hangul;
11const norm_props = @import("ziglyph").normalization_props; 10const norm_props = @import("ziglyph").normalization_props;
12 11
13pub const NormData = @import("NormData"); 12pub const NormData = @import("NormData");
@@ -17,9 +16,9 @@ norm_data: *NormData,
17const Self = @This(); 16const Self = @This();
18 17
19// Hangul processing utilities. 18// Hangul processing utilities.
20fn isHangulPrecomposed(cp: u21) bool { 19fn isHangulPrecomposed(self: Self, cp: u21) bool {
21 if (hangul_map.syllableType(cp)) |kind| return kind == .LV or kind == .LVT; 20 const kind = self.norm_data.hangul_data.syllable(cp);
22 return false; 21 return kind == .LV or kind == .LVT;
23} 22}
24 23
25const SBase: u21 = 0xAC00; 24const SBase: u21 = 0xAC00;
@@ -117,7 +116,7 @@ pub fn decompose(self: Self, cp: u21, form: Form) Decomp {
117 } 116 }
118 117
119 // Hangul precomposed syllable full decomposition. 118 // Hangul precomposed syllable full decomposition.
120 if (isHangulPrecomposed(cp)) { 119 if (self.isHangulPrecomposed(cp)) {
121 const cps = decomposeHangul(cp); 120 const cps = decomposeHangul(cp);
122 @memcpy(dc.cps[0..cps.len], &cps); 121 @memcpy(dc.cps[0..cps.len], &cps);
123 return dc; 122 return dc;
@@ -335,12 +334,12 @@ test "nfkd !ASCII / alloc" {
335 334
336// Composition utilities. 335// Composition utilities.
337 336
338fn isHangul(cp: u21) bool { 337fn isHangul(self: Self, cp: u21) bool {
339 return cp >= 0x1100 and hangul_map.syllableType(cp) != null; 338 return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
340} 339}
341 340
342fn isNonHangulStarter(self: Self, cp: u21) bool { 341fn isNonHangulStarter(self: Self, cp: u21) bool {
343 return !isHangul(cp) and self.norm_data.ccc_data.isStarter(cp); 342 return !self.isHangul(cp) and self.norm_data.ccc_data.isStarter(cp);
344} 343}
345 344
346/// Normalizes `str` to NFC. 345/// Normalizes `str` to NFC.
@@ -395,7 +394,7 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
395 for (d_list.items[(j + 1)..i]) |B| { 394 for (d_list.items[(j + 1)..i]) |B| {
396 const cc_B = self.norm_data.ccc_data.ccc(B); 395 const cc_B = self.norm_data.ccc_data.ccc(B);
397 // Check for blocking conditions. 396 // Check for blocking conditions.
398 if (isHangul(C)) { 397 if (self.isHangul(C)) {
399 if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check; 398 if (cc_B != 0 or self.isNonHangulStarter(B)) continue :block_check;
400 } 399 }
401 if (cc_B >= cc_C) continue :block_check; 400 if (cc_B >= cc_C) continue :block_check;
@@ -414,9 +413,9 @@ fn nfxc(self: Self, allocator: std.mem.Allocator, str: []const u8, form: Form) !
414 const L = d_list.items[sidx]; 413 const L = d_list.items[sidx];
415 var processed_hangul = false; 414 var processed_hangul = false;
416 415
417 if (isHangul(L) and isHangul(C)) { 416 if (self.isHangul(L) and self.isHangul(C)) {
418 const l_stype = hangul_map.syllableType(L).?; 417 const l_stype = self.norm_data.hangul_data.syllable(L);
419 const c_stype = hangul_map.syllableType(C).?; 418 const c_stype = self.norm_data.hangul_data.syllable(C);
420 419
421 if (l_stype == .LV and c_stype == .T) { 420 if (l_stype == .LV and c_stype == .T) {
422 // LV, T 421 // LV, T