diff options
| author | 2024-02-15 10:21:30 -0400 | |
|---|---|---|
| committer | 2024-02-15 10:21:30 -0400 | |
| commit | 5d9080f1eac3147754ade3d50e64f6def4b6eb25 (patch) | |
| tree | a6de7e9881a8743026ee5e2fd63b12ec0a3e1773 /src | |
| parent | Combinedd Indic ifs (diff) | |
| download | zg-5d9080f1eac3147754ade3d50e64f6def4b6eb25.tar.gz zg-5d9080f1eac3147754ade3d50e64f6def4b6eb25.tar.xz zg-5d9080f1eac3147754ade3d50e64f6def4b6eb25.zip | |
New single byte props table
Diffstat (limited to 'src')
| -rw-r--r-- | src/Grapheme.zig | 58 |
1 files changed, 31 insertions, 27 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index d868ef7..ba90b24 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -6,11 +6,7 @@ const unicode = std.unicode; | |||
| 6 | const ziglyph = @import("ziglyph"); | 6 | const ziglyph = @import("ziglyph"); |
| 7 | const CodePoint = @import("CodePoint.zig"); | 7 | const CodePoint = @import("CodePoint.zig"); |
| 8 | const CodePointIterator = CodePoint.CodePointIterator; | 8 | const CodePointIterator = CodePoint.CodePointIterator; |
| 9 | // const emoji = ziglyph.emoji; | ||
| 10 | // const gbp = ziglyph.grapheme_break; | ||
| 11 | const gbp = @import("gbp"); | 9 | const gbp = @import("gbp"); |
| 12 | const emoji = @import("emoji"); | ||
| 13 | const indic = @import("indic"); | ||
| 14 | 10 | ||
| 15 | pub const Grapheme = @This(); | 11 | pub const Grapheme = @This(); |
| 16 | 12 | ||
| @@ -83,7 +79,10 @@ pub const GraphemeIterator = struct { | |||
| 83 | 79 | ||
| 84 | // Predicates | 80 | // Predicates |
| 85 | inline fn isBreaker(cp: u21) bool { | 81 | inline fn isBreaker(cp: u21) bool { |
| 86 | return cp == '\x0d' or cp == '\x0a' or gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]] == .control; | 82 | // Extract relevant properties. |
| 83 | const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | ||
| 84 | const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4); | ||
| 85 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | ||
| 87 | } | 86 | } |
| 88 | 87 | ||
| 89 | inline fn isIgnorable(cp: u21) bool { | 88 | inline fn isIgnorable(cp: u21) bool { |
| @@ -170,13 +169,20 @@ pub fn graphemeBreak( | |||
| 170 | cp2: u21, | 169 | cp2: u21, |
| 171 | state: *u3, | 170 | state: *u3, |
| 172 | ) bool { | 171 | ) bool { |
| 172 | // Extract relevant properties. | ||
| 173 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | ||
| 174 | const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); | ||
| 175 | const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); | ||
| 176 | const cp1_is_emoji = cp1_props_byte & 1 == 1; | ||
| 177 | |||
| 178 | const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | ||
| 179 | const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); | ||
| 180 | const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); | ||
| 181 | const cp2_is_emoji = cp2_props_byte & 1 == 1; | ||
| 182 | |||
| 173 | // GB11: Emoji Extend* ZWJ x Emoji | 183 | // GB11: Emoji Extend* ZWJ x Emoji |
| 174 | const cp1_is_emoji = emoji.stage_2[emoji.stage_1[cp1 >> 8] + (cp1 & 0xff)]; | ||
| 175 | const cp2_is_emoji = emoji.stage_2[emoji.stage_1[cp2 >> 8] + (cp2 & 0xff)]; | ||
| 176 | if (!hasXpic(state) and cp1_is_emoji) setXpic(state); | 184 | if (!hasXpic(state) and cp1_is_emoji) setXpic(state); |
| 177 | // GB9c: Indic Conjunct Break | 185 | // GB9c: Indic Conjunct Break |
| 178 | const cp1_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | ||
| 179 | const cp2_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | ||
| 180 | if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); | 186 | if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); |
| 181 | 187 | ||
| 182 | // GB3: CR x LF | 188 | // GB3: CR x LF |
| @@ -186,37 +192,35 @@ pub fn graphemeBreak( | |||
| 186 | if (isBreaker(cp1)) return true; | 192 | if (isBreaker(cp1)) return true; |
| 187 | 193 | ||
| 188 | // GB6: Hangul L x (L|V|LV|LVT) | 194 | // GB6: Hangul L x (L|V|LV|LVT) |
| 189 | const cp1_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | 195 | if (cp1_gbp_prop == .L) { |
| 190 | const cp2_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | 196 | if (cp2_gbp_prop == .L or |
| 191 | if (cp1_gbp_prop == .hangul_l) { | 197 | cp2_gbp_prop == .V or |
| 192 | if (cp2_gbp_prop == .hangul_l or | 198 | cp2_gbp_prop == .LV or |
| 193 | cp2_gbp_prop == .hangul_v or | 199 | cp2_gbp_prop == .LVT) return false; |
| 194 | cp2_gbp_prop == .hangul_lv or | ||
| 195 | cp2_gbp_prop == .hangul_lvt) return false; | ||
| 196 | } | 200 | } |
| 197 | 201 | ||
| 198 | // GB7: Hangul (LV | V) x (V | T) | 202 | // GB7: Hangul (LV | V) x (V | T) |
| 199 | if (cp1_gbp_prop == .hangul_lv or cp1_gbp_prop == .hangul_v) { | 203 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { |
| 200 | if (cp2_gbp_prop == .hangul_v or | 204 | if (cp2_gbp_prop == .V or |
| 201 | cp2_gbp_prop == .hangul_t) return false; | 205 | cp2_gbp_prop == .T) return false; |
| 202 | } | 206 | } |
| 203 | 207 | ||
| 204 | // GB8: Hangul (LVT | T) x T | 208 | // GB8: Hangul (LVT | T) x T |
| 205 | if (cp1_gbp_prop == .hangul_lvt or cp1_gbp_prop == .hangul_t) { | 209 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { |
| 206 | if (cp2_gbp_prop == .hangul_t) return false; | 210 | if (cp2_gbp_prop == .T) return false; |
| 207 | } | 211 | } |
| 208 | 212 | ||
| 209 | // GB9: x (Extend | ZWJ) | 213 | // GB9: x (Extend | ZWJ) |
| 210 | if (cp2_gbp_prop == .extend or cp2_gbp_prop == .zwj) return false; | 214 | if (cp2_gbp_prop == .Extend or cp2_gbp_prop == .ZWJ) return false; |
| 211 | 215 | ||
| 212 | // GB9a: x SpacingMark | 216 | // GB9a: x SpacingMark |
| 213 | if (cp2_gbp_prop == .spacing) return false; | 217 | if (cp2_gbp_prop == .SpacingMark) return false; |
| 214 | 218 | ||
| 215 | // GB9b: Prepend x | 219 | // GB9b: Prepend x |
| 216 | if (cp1_gbp_prop == .prepend and !isBreaker(cp2)) return false; | 220 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; |
| 217 | 221 | ||
| 218 | // GB12, GB13: RI x RI | 222 | // GB12, GB13: RI x RI |
| 219 | if (cp1_gbp_prop == .regional and cp2_gbp_prop == .regional) { | 223 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| 220 | if (hasRegional(state)) { | 224 | if (hasRegional(state)) { |
| 221 | unsetRegional(state); | 225 | unsetRegional(state); |
| 222 | return true; | 226 | return true; |
| @@ -228,7 +232,7 @@ pub fn graphemeBreak( | |||
| 228 | 232 | ||
| 229 | // GB11: Emoji Extend* ZWJ x Emoji | 233 | // GB11: Emoji Extend* ZWJ x Emoji |
| 230 | if (hasXpic(state) and | 234 | if (hasXpic(state) and |
| 231 | cp1_gbp_prop == .zwj and | 235 | cp1_gbp_prop == .ZWJ and |
| 232 | cp2_is_emoji) | 236 | cp2_is_emoji) |
| 233 | { | 237 | { |
| 234 | unsetXpic(state); | 238 | unsetXpic(state); |
| @@ -251,7 +255,7 @@ pub fn graphemeBreak( | |||
| 251 | } | 255 | } |
| 252 | 256 | ||
| 253 | if (hasIndic(state) and | 257 | if (hasIndic(state) and |
| 254 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .zwj) and | 258 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and |
| 255 | cp2_indic_prop == .Consonant) | 259 | cp2_indic_prop == .Consonant) |
| 256 | { | 260 | { |
| 257 | unsetIndic(state); | 261 | unsetIndic(state); |