summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Jose Colon Rodriguez, 2024-02-14 18:46:33 -0400
committer: Jose Colon Rodriguez, 2024-02-14 18:46:33 -0400
commit: 25d843cb4285f4a0c99952792568f4faff5c533f (patch)
tree: de37db6fbd4a3e85633bed69555b22d03fc65503 /src
parent: Removed readCodePoint and StreamingGraphemeIterator (diff)
download: zg-25d843cb4285f4a0c99952792568f4faff5c533f.tar.gz
zg-25d843cb4285f4a0c99952792568f4faff5c533f.tar.xz
zg-25d843cb4285f4a0c99952792568f4faff5c533f.zip
Passing Unicode 15.1.0 Grapheme Break Tests
Diffstat (limited to 'src')
-rw-r--r-- src/Grapheme.zig 72
1 files changed, 51 insertions, 21 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 41ea545..a0ec900 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -10,6 +10,7 @@ const CodePointIterator = CodePoint.CodePointIterator;
10// const gbp = ziglyph.grapheme_break; 10// const gbp = ziglyph.grapheme_break;
11const gbp = @import("gbp"); 11const gbp = @import("gbp");
12const emoji = @import("emoji"); 12const emoji = @import("emoji");
13const indic = @import("indic");
13 14
14pub const Grapheme = @This(); 15pub const Grapheme = @This();
15 16
@@ -89,18 +90,18 @@ fn isIgnorable(cp: u21) bool {
89 return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; 90 return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}';
90} 91}
91 92
92test "Segmentation comptime GraphemeIterator" { 93// test "Segmentation comptime GraphemeIterator" {
93 const want = [_][]const u8{ "H", "é", "l", "l", "o" }; 94// const want = [_][]const u8{ "H", "é", "l", "l", "o" };
94 95//
95 comptime { 96// comptime {
96 const src = "Héllo"; 97// const src = "Héllo";
97 var ct_iter = GraphemeIterator.init(src); 98// var ct_iter = GraphemeIterator.init(src);
98 var i = 0; 99// var i = 0;
99 while (ct_iter.next()) |grapheme| : (i += 1) { 100// while (ct_iter.next()) |grapheme| : (i += 1) {
100 try std.testing.expect(grapheme.eql(src, want[i])); 101// try std.testing.expect(grapheme.eql(src, want[i]));
101 } 102// }
102 } 103// }
103} 104// }
104 105
105test "Segmentation ZWJ and ZWSP emoji sequences" { 106test "Segmentation ZWJ and ZWSP emoji sequences" {
106 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 107 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -148,7 +149,7 @@ inline fn unsetRegional(state: *u3) void {
148} 149}
149// Indic Conjunct 150// Indic Conjunct
150inline fn hasIndic(state: *const u3) bool { 151inline fn hasIndic(state: *const u3) bool {
151 return state.* & 2 == 4; 152 return state.* & 4 == 4;
152} 153}
153inline fn setIndic(state: *u3) void { 154inline fn setIndic(state: *u3) void {
154 state.* |= 4; 155 state.* |= 4;
@@ -171,7 +172,7 @@ pub fn graphemeBreak(
171 // GB11: Emoji Extend* ZWJ x Emoji 172 // GB11: Emoji Extend* ZWJ x Emoji
172 if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); 173 if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);
173 // GB9c: Indic Conjunct Break 174 // GB9c: Indic Conjunct Break
174 // if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); 175 if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state);
175 176
176 // GB3: CR x LF 177 // GB3: CR x LF
177 if (cp1 == '\r' and cp2 == '\n') return false; 178 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -228,13 +229,42 @@ pub fn graphemeBreak(
228 } 229 }
229 230
230 // GB9c: Indic Conjunct Break 231 // GB9c: Indic Conjunct Break
231 // if (hasIndic(state) and 232 if (hasIndic(state) and
232 // indic.isLinker(cp1) and 233 indic.isConsonant(cp1) and
233 // indic.isConsonant(cp2)) 234 indic.isExtend(cp2))
234 // { 235 {
235 // unsetIndic(state); 236 return false;
236 // return false; 237 }
237 // } 238
239 if (hasIndic(state) and
240 indic.isConsonant(cp1) and
241 indic.isLinker(cp2))
242 {
243 return false;
244 }
245
246 if (hasIndic(state) and
247 indic.isExtend(cp1) and
248 indic.isLinker(cp2))
249 {
250 return false;
251 }
252
253 if (hasIndic(state) and
254 indic.isLinker(cp1) and
255 indic.isConsonant(cp2))
256 {
257 unsetIndic(state);
258 return false;
259 }
260
261 if (hasIndic(state) and
262 gbp.isZwj(cp1) and
263 indic.isConsonant(cp2))
264 {
265 unsetIndic(state);
266 return false;
267 }
238 268
239 return true; 269 return true;
240} 270}