summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-14 21:39:37 -0400
committerGravatar Jose Colon Rodriguez2024-02-14 21:39:37 -0400
commitb2be2562b908d5563a8e71c44cbad577e4df4201 (patch)
treeeed831bdd2200247f9ffdb1adc8c2a051bd3f3e8 /src
parentPassing Unicode 15.1.0 Grapheme Break Tests (diff)
downloadzg-b2be2562b908d5563a8e71c44cbad577e4df4201.tar.gz
zg-b2be2562b908d5563a8e71c44cbad577e4df4201.tar.xz
zg-b2be2562b908d5563a8e71c44cbad577e4df4201.zip
gbp and indic direct array access
Diffstat (limited to 'src')
-rw-r--r--src/Grapheme.zig89
1 files changed, 47 insertions, 42 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index a0ec900..56eecbe 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -82,26 +82,27 @@ pub const GraphemeIterator = struct {
82}; 82};
83 83
84// Predicates 84// Predicates
85fn isBreaker(cp: u21) bool { 85inline fn isBreaker(cp: u21) bool {
86 return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); 86 return cp == '\x0d' or cp == '\x0a' or gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]] == .control;
87} 87}
88 88
89fn isIgnorable(cp: u21) bool { 89inline fn isIgnorable(cp: u21) bool {
90 return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; 90 const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
91 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
91} 92}
92 93
93// test "Segmentation comptime GraphemeIterator" { 94test "Segmentation comptime GraphemeIterator" {
94// const want = [_][]const u8{ "H", "é", "l", "l", "o" }; 95 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
95// 96
96// comptime { 97 comptime {
97// const src = "Héllo"; 98 const src = "Héllo";
98// var ct_iter = GraphemeIterator.init(src); 99 var ct_iter = GraphemeIterator.init(src);
99// var i = 0; 100 var i = 0;
100// while (ct_iter.next()) |grapheme| : (i += 1) { 101 while (ct_iter.next()) |grapheme| : (i += 1) {
101// try std.testing.expect(grapheme.eql(src, want[i])); 102 try std.testing.expect(grapheme.eql(src, want[i]));
102// } 103 }
103// } 104 }
104// } 105}
105 106
106test "Segmentation ZWJ and ZWSP emoji sequences" { 107test "Segmentation ZWJ and ZWSP emoji sequences" {
107 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 108 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -172,7 +173,9 @@ pub fn graphemeBreak(
172 // GB11: Emoji Extend* ZWJ x Emoji 173 // GB11: Emoji Extend* ZWJ x Emoji
173 if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); 174 if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);
174 // GB9c: Indic Conjunct Break 175 // GB9c: Indic Conjunct Break
175 if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); 176 const cp1_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
177 const cp2_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp2 >> 8] + (cp2 & 0xff)]];
178 if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state);
176 179
177 // GB3: CR x LF 180 // GB3: CR x LF
178 if (cp1 == '\r' and cp2 == '\n') return false; 181 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -181,35 +184,37 @@ pub fn graphemeBreak(
181 if (isBreaker(cp1)) return true; 184 if (isBreaker(cp1)) return true;
182 185
183 // GB6: Hangul L x (L|V|LV|LVT) 186 // GB6: Hangul L x (L|V|LV|LVT)
184 if (gbp.isL(cp1)) { 187 const cp1_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
185 if (gbp.isL(cp2) or 188 const cp2_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]];
186 gbp.isV(cp2) or 189 if (cp1_gbp_prop == .hangul_l) {
187 gbp.isLv(cp2) or 190 if (cp2_gbp_prop == .hangul_l or
188 gbp.isLvt(cp2)) return false; 191 cp2_gbp_prop == .hangul_v or
192 cp2_gbp_prop == .hangul_lv or
193 cp2_gbp_prop == .hangul_lvt) return false;
189 } 194 }
190 195
191 // GB7: Hangul (LV | V) x (V | T) 196 // GB7: Hangul (LV | V) x (V | T)
192 if (gbp.isLv(cp1) or gbp.isV(cp1)) { 197 if (cp1_gbp_prop == .hangul_lv or cp1_gbp_prop == .hangul_v) {
193 if (gbp.isV(cp2) or 198 if (cp2_gbp_prop == .hangul_v or
194 gbp.isT(cp2)) return false; 199 cp2_gbp_prop == .hangul_t) return false;
195 } 200 }
196 201
197 // GB8: Hangul (LVT | T) x T 202 // GB8: Hangul (LVT | T) x T
198 if (gbp.isLvt(cp1) or gbp.isT(cp1)) { 203 if (cp1_gbp_prop == .hangul_lvt or cp1_gbp_prop == .hangul_t) {
199 if (gbp.isT(cp2)) return false; 204 if (cp2_gbp_prop == .hangul_t) return false;
200 } 205 }
201 206
202 // GB9: x (Extend | ZWJ) 207 // GB9: x (Extend | ZWJ)
203 if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) return false; 208 if (cp2_gbp_prop == .extend or cp2_gbp_prop == .zwj) return false;
204 209
205 // GB9a: x Spacing 210 // GB9a: x Spacing
206 if (gbp.isSpacingmark(cp2)) return false; 211 if (cp2_gbp_prop == .spacing) return false;
207 212
208 // GB9b: Prepend x 213 // GB9b: Prepend x
209 if (gbp.isPrepend(cp1) and !isBreaker(cp2)) return false; 214 if (cp1_gbp_prop == .prepend and !isBreaker(cp2)) return false;
210 215
211 // GB12, GB13: RI x RI 216 // GB12, GB13: RI x RI
212 if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { 217 if (cp1_gbp_prop == .regional and cp2_gbp_prop == .regional) {
213 if (hasRegional(state)) { 218 if (hasRegional(state)) {
214 unsetRegional(state); 219 unsetRegional(state);
215 return true; 220 return true;
@@ -221,7 +226,7 @@ pub fn graphemeBreak(
221 226
222 // GB11: Emoji Extend* ZWJ x Emoji 227 // GB11: Emoji Extend* ZWJ x Emoji
223 if (hasXpic(state) and 228 if (hasXpic(state) and
224 gbp.isZwj(cp1) and 229 cp1_gbp_prop == .zwj and
225 emoji.isExtendedPictographic(cp2)) 230 emoji.isExtendedPictographic(cp2))
226 { 231 {
227 unsetXpic(state); 232 unsetXpic(state);
@@ -230,37 +235,37 @@ pub fn graphemeBreak(
230 235
231 // GB9c: Indic Conjunct Break 236 // GB9c: Indic Conjunct Break
232 if (hasIndic(state) and 237 if (hasIndic(state) and
233 indic.isConsonant(cp1) and 238 cp1_indic_prop == .Consonant and
234 indic.isExtend(cp2)) 239 cp2_indic_prop == .Extend)
235 { 240 {
236 return false; 241 return false;
237 } 242 }
238 243
239 if (hasIndic(state) and 244 if (hasIndic(state) and
240 indic.isConsonant(cp1) and 245 cp1_indic_prop == .Consonant and
241 indic.isLinker(cp2)) 246 cp2_indic_prop == .Linker)
242 { 247 {
243 return false; 248 return false;
244 } 249 }
245 250
246 if (hasIndic(state) and 251 if (hasIndic(state) and
247 indic.isExtend(cp1) and 252 cp1_indic_prop == .Extend and
248 indic.isLinker(cp2)) 253 cp2_indic_prop == .Linker)
249 { 254 {
250 return false; 255 return false;
251 } 256 }
252 257
253 if (hasIndic(state) and 258 if (hasIndic(state) and
254 indic.isLinker(cp1) and 259 cp1_indic_prop == .Linker and
255 indic.isConsonant(cp2)) 260 cp2_indic_prop == .Consonant)
256 { 261 {
257 unsetIndic(state); 262 unsetIndic(state);
258 return false; 263 return false;
259 } 264 }
260 265
261 if (hasIndic(state) and 266 if (hasIndic(state) and
262 gbp.isZwj(cp1) and 267 cp1_gbp_prop == .zwj and
263 indic.isConsonant(cp2)) 268 cp2_indic_prop == .Consonant)
264 { 269 {
265 unsetIndic(state); 270 unsetIndic(state);
266 return false; 271 return false;