diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 89 |
1 files changed, 47 insertions, 42 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a0ec900..56eecbe 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -82,26 +82,27 @@ pub const GraphemeIterator = struct { | |||
| 82 | }; | 82 | }; |
| 83 | 83 | ||
| 84 | // Predicates | 84 | // Predicates |
| 85 | fn isBreaker(cp: u21) bool { | 85 | inline fn isBreaker(cp: u21) bool { |
| 86 | return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); | 86 | return cp == '\x0d' or cp == '\x0a' or gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]] == .control; |
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | fn isIgnorable(cp: u21) bool { | 89 | inline fn isIgnorable(cp: u21) bool { |
| 90 | return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; | 90 | const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; |
| 91 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | ||
| 91 | } | 92 | } |
| 92 | 93 | ||
| 93 | // test "Segmentation comptime GraphemeIterator" { | 94 | test "Segmentation comptime GraphemeIterator" { |
| 94 | // const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | 95 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; |
| 95 | // | 96 | |
| 96 | // comptime { | 97 | comptime { |
| 97 | // const src = "Héllo"; | 98 | const src = "Héllo"; |
| 98 | // var ct_iter = GraphemeIterator.init(src); | 99 | var ct_iter = GraphemeIterator.init(src); |
| 99 | // var i = 0; | 100 | var i = 0; |
| 100 | // while (ct_iter.next()) |grapheme| : (i += 1) { | 101 | while (ct_iter.next()) |grapheme| : (i += 1) { |
| 101 | // try std.testing.expect(grapheme.eql(src, want[i])); | 102 | try std.testing.expect(grapheme.eql(src, want[i])); |
| 102 | // } | 103 | } |
| 103 | // } | 104 | } |
| 104 | // } | 105 | } |
| 105 | 106 | ||
| 106 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 107 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 107 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 108 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| @@ -172,7 +173,9 @@ pub fn graphemeBreak( | |||
| 172 | // GB11: Emoji Extend* ZWJ x Emoji | 173 | // GB11: Emoji Extend* ZWJ x Emoji |
| 173 | if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); | 174 | if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); |
| 174 | // GB9c: Indic Conjunct Break | 175 | // GB9c: Indic Conjunct Break |
| 175 | if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); | 176 | const cp1_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; |
| 177 | const cp2_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | ||
| 178 | if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); | ||
| 176 | 179 | ||
| 177 | // GB3: CR x LF | 180 | // GB3: CR x LF |
| 178 | if (cp1 == '\r' and cp2 == '\n') return false; | 181 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -181,35 +184,37 @@ pub fn graphemeBreak( | |||
| 181 | if (isBreaker(cp1)) return true; | 184 | if (isBreaker(cp1)) return true; |
| 182 | 185 | ||
| 183 | // GB6: Hangul L x (L|V|LV|LVT) | 186 | // GB6: Hangul L x (L|V|LV|LVT) |
| 184 | if (gbp.isL(cp1)) { | 187 | const cp1_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; |
| 185 | if (gbp.isL(cp2) or | 188 | const cp2_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; |
| 186 | gbp.isV(cp2) or | 189 | if (cp1_gbp_prop == .hangul_l) { |
| 187 | gbp.isLv(cp2) or | 190 | if (cp2_gbp_prop == .hangul_l or |
| 188 | gbp.isLvt(cp2)) return false; | 191 | cp2_gbp_prop == .hangul_v or |
| 192 | cp2_gbp_prop == .hangul_lv or | ||
| 193 | cp2_gbp_prop == .hangul_lvt) return false; | ||
| 189 | } | 194 | } |
| 190 | 195 | ||
| 191 | // GB7: Hangul (LV | V) x (V | T) | 196 | // GB7: Hangul (LV | V) x (V | T) |
| 192 | if (gbp.isLv(cp1) or gbp.isV(cp1)) { | 197 | if (cp1_gbp_prop == .hangul_lv or cp1_gbp_prop == .hangul_v) { |
| 193 | if (gbp.isV(cp2) or | 198 | if (cp2_gbp_prop == .hangul_v or |
| 194 | gbp.isT(cp2)) return false; | 199 | cp2_gbp_prop == .hangul_t) return false; |
| 195 | } | 200 | } |
| 196 | 201 | ||
| 197 | // GB8: Hangul (LVT | T) x T | 202 | // GB8: Hangul (LVT | T) x T |
| 198 | if (gbp.isLvt(cp1) or gbp.isT(cp1)) { | 203 | if (cp1_gbp_prop == .hangul_lvt or cp1_gbp_prop == .hangul_t) { |
| 199 | if (gbp.isT(cp2)) return false; | 204 | if (cp2_gbp_prop == .hangul_t) return false; |
| 200 | } | 205 | } |
| 201 | 206 | ||
| 202 | // GB9: x (Extend | ZWJ) | 207 | // GB9: x (Extend | ZWJ) |
| 203 | if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) return false; | 208 | if (cp2_gbp_prop == .extend or cp2_gbp_prop == .zwj) return false; |
| 204 | 209 | ||
| 205 | // GB9a: x Spacing | 210 | // GB9a: x Spacing |
| 206 | if (gbp.isSpacingmark(cp2)) return false; | 211 | if (cp2_gbp_prop == .spacing) return false; |
| 207 | 212 | ||
| 208 | // GB9b: Prepend x | 213 | // GB9b: Prepend x |
| 209 | if (gbp.isPrepend(cp1) and !isBreaker(cp2)) return false; | 214 | if (cp1_gbp_prop == .prepend and !isBreaker(cp2)) return false; |
| 210 | 215 | ||
| 211 | // GB12, GB13: RI x RI | 216 | // GB12, GB13: RI x RI |
| 212 | if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { | 217 | if (cp1_gbp_prop == .regional and cp2_gbp_prop == .regional) { |
| 213 | if (hasRegional(state)) { | 218 | if (hasRegional(state)) { |
| 214 | unsetRegional(state); | 219 | unsetRegional(state); |
| 215 | return true; | 220 | return true; |
| @@ -221,7 +226,7 @@ pub fn graphemeBreak( | |||
| 221 | 226 | ||
| 222 | // GB11: Emoji Extend* ZWJ x Emoji | 227 | // GB11: Emoji Extend* ZWJ x Emoji |
| 223 | if (hasXpic(state) and | 228 | if (hasXpic(state) and |
| 224 | gbp.isZwj(cp1) and | 229 | cp1_gbp_prop == .zwj and |
| 225 | emoji.isExtendedPictographic(cp2)) | 230 | emoji.isExtendedPictographic(cp2)) |
| 226 | { | 231 | { |
| 227 | unsetXpic(state); | 232 | unsetXpic(state); |
| @@ -230,37 +235,37 @@ pub fn graphemeBreak( | |||
| 230 | 235 | ||
| 231 | // GB9c: Indic Conjunct Break | 236 | // GB9c: Indic Conjunct Break |
| 232 | if (hasIndic(state) and | 237 | if (hasIndic(state) and |
| 233 | indic.isConsonant(cp1) and | 238 | cp1_indic_prop == .Consonant and |
| 234 | indic.isExtend(cp2)) | 239 | cp2_indic_prop == .Extend) |
| 235 | { | 240 | { |
| 236 | return false; | 241 | return false; |
| 237 | } | 242 | } |
| 238 | 243 | ||
| 239 | if (hasIndic(state) and | 244 | if (hasIndic(state) and |
| 240 | indic.isConsonant(cp1) and | 245 | cp1_indic_prop == .Consonant and |
| 241 | indic.isLinker(cp2)) | 246 | cp2_indic_prop == .Linker) |
| 242 | { | 247 | { |
| 243 | return false; | 248 | return false; |
| 244 | } | 249 | } |
| 245 | 250 | ||
| 246 | if (hasIndic(state) and | 251 | if (hasIndic(state) and |
| 247 | indic.isExtend(cp1) and | 252 | cp1_indic_prop == .Extend and |
| 248 | indic.isLinker(cp2)) | 253 | cp2_indic_prop == .Linker) |
| 249 | { | 254 | { |
| 250 | return false; | 255 | return false; |
| 251 | } | 256 | } |
| 252 | 257 | ||
| 253 | if (hasIndic(state) and | 258 | if (hasIndic(state) and |
| 254 | indic.isLinker(cp1) and | 259 | cp1_indic_prop == .Linker and |
| 255 | indic.isConsonant(cp2)) | 260 | cp2_indic_prop == .Consonant) |
| 256 | { | 261 | { |
| 257 | unsetIndic(state); | 262 | unsetIndic(state); |
| 258 | return false; | 263 | return false; |
| 259 | } | 264 | } |
| 260 | 265 | ||
| 261 | if (hasIndic(state) and | 266 | if (hasIndic(state) and |
| 262 | gbp.isZwj(cp1) and | 267 | cp1_gbp_prop == .zwj and |
| 263 | indic.isConsonant(cp2)) | 268 | cp2_indic_prop == .Consonant) |
| 264 | { | 269 | { |
| 265 | unsetIndic(state); | 270 | unsetIndic(state); |
| 266 | return false; | 271 | return false; |