diff options
| author | 2024-02-14 18:46:33 -0400 | |
|---|---|---|
| committer | 2024-02-14 18:46:33 -0400 | |
| commit | 25d843cb4285f4a0c99952792568f4faff5c533f (patch) | |
| tree | de37db6fbd4a3e85633bed69555b22d03fc65503 /src | |
| parent | Removed readCodePoint and StreamingGraphemeIterator (diff) | |
| download | zg-25d843cb4285f4a0c99952792568f4faff5c533f.tar.gz zg-25d843cb4285f4a0c99952792568f4faff5c533f.tar.xz zg-25d843cb4285f4a0c99952792568f4faff5c533f.zip | |
Passing Unicode 15.1.0 Grapheme Break Tests
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 72 |
1 files changed, 51 insertions, 21 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 41ea545..a0ec900 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -10,6 +10,7 @@ const CodePointIterator = CodePoint.CodePointIterator; | |||
| 10 | // const gbp = ziglyph.grapheme_break; | 10 | // const gbp = ziglyph.grapheme_break; |
| 11 | const gbp = @import("gbp"); | 11 | const gbp = @import("gbp"); |
| 12 | const emoji = @import("emoji"); | 12 | const emoji = @import("emoji"); |
| 13 | const indic = @import("indic"); | ||
| 13 | 14 | ||
| 14 | pub const Grapheme = @This(); | 15 | pub const Grapheme = @This(); |
| 15 | 16 | ||
| @@ -89,18 +90,18 @@ fn isIgnorable(cp: u21) bool { | |||
| 89 | return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; | 90 | return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; |
| 90 | } | 91 | } |
| 91 | 92 | ||
| 92 | test "Segmentation comptime GraphemeIterator" { | 93 | // test "Segmentation comptime GraphemeIterator" { |
| 93 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | 94 | // const want = [_][]const u8{ "H", "é", "l", "l", "o" }; |
| 94 | 95 | // | |
| 95 | comptime { | 96 | // comptime { |
| 96 | const src = "Héllo"; | 97 | // const src = "Héllo"; |
| 97 | var ct_iter = GraphemeIterator.init(src); | 98 | // var ct_iter = GraphemeIterator.init(src); |
| 98 | var i = 0; | 99 | // var i = 0; |
| 99 | while (ct_iter.next()) |grapheme| : (i += 1) { | 100 | // while (ct_iter.next()) |grapheme| : (i += 1) { |
| 100 | try std.testing.expect(grapheme.eql(src, want[i])); | 101 | // try std.testing.expect(grapheme.eql(src, want[i])); |
| 101 | } | 102 | // } |
| 102 | } | 103 | // } |
| 103 | } | 104 | // } |
| 104 | 105 | ||
| 105 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 106 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 106 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 107 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| @@ -148,7 +149,7 @@ inline fn unsetRegional(state: *u3) void { | |||
| 148 | } | 149 | } |
| 149 | // Indic Conjunct | 150 | // Indic Conjunct |
| 150 | inline fn hasIndic(state: *const u3) bool { | 151 | inline fn hasIndic(state: *const u3) bool { |
| 151 | return state.* & 2 == 4; | 152 | return state.* & 4 == 4; |
| 152 | } | 153 | } |
| 153 | inline fn setIndic(state: *u3) void { | 154 | inline fn setIndic(state: *u3) void { |
| 154 | state.* |= 4; | 155 | state.* |= 4; |
| @@ -171,7 +172,7 @@ pub fn graphemeBreak( | |||
| 171 | // GB11: Emoji Extend* ZWJ x Emoji | 172 | // GB11: Emoji Extend* ZWJ x Emoji |
| 172 | if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); | 173 | if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); |
| 173 | // GB9c: Indic Conjunct Break | 174 | // GB9c: Indic Conjunct Break |
| 174 | // if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); | 175 | if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); |
| 175 | 176 | ||
| 176 | // GB3: CR x LF | 177 | // GB3: CR x LF |
| 177 | if (cp1 == '\r' and cp2 == '\n') return false; | 178 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -228,13 +229,42 @@ pub fn graphemeBreak( | |||
| 228 | } | 229 | } |
| 229 | 230 | ||
| 230 | // GB9c: Indic Conjunct Break | 231 | // GB9c: Indic Conjunct Break |
| 231 | // if (hasIndic(state) and | 232 | if (hasIndic(state) and |
| 232 | // indic.isLinker(cp1) and | 233 | indic.isConsonant(cp1) and |
| 233 | // indic.isConsonant(cp2)) | 234 | indic.isExtend(cp2)) |
| 234 | // { | 235 | { |
| 235 | // unsetIndic(state); | 236 | return false; |
| 236 | // return false; | 237 | } |
| 237 | // } | 238 | |
| 239 | if (hasIndic(state) and | ||
| 240 | indic.isConsonant(cp1) and | ||
| 241 | indic.isLinker(cp2)) | ||
| 242 | { | ||
| 243 | return false; | ||
| 244 | } | ||
| 245 | |||
| 246 | if (hasIndic(state) and | ||
| 247 | indic.isExtend(cp1) and | ||
| 248 | indic.isLinker(cp2)) | ||
| 249 | { | ||
| 250 | return false; | ||
| 251 | } | ||
| 252 | |||
| 253 | if (hasIndic(state) and | ||
| 254 | indic.isLinker(cp1) and | ||
| 255 | indic.isConsonant(cp2)) | ||
| 256 | { | ||
| 257 | unsetIndic(state); | ||
| 258 | return false; | ||
| 259 | } | ||
| 260 | |||
| 261 | if (hasIndic(state) and | ||
| 262 | gbp.isZwj(cp1) and | ||
| 263 | indic.isConsonant(cp2)) | ||
| 264 | { | ||
| 265 | unsetIndic(state); | ||
| 266 | return false; | ||
| 267 | } | ||
| 238 | 268 | ||
| 239 | return true; | 269 | return true; |
| 240 | } | 270 | } |