diff options
| author | 2024-02-17 11:31:52 -0400 | |
|---|---|---|
| committer | 2024-02-17 11:31:52 -0400 | |
| commit | 490fd008e29420e5b317fd5ef7526f3cc92ba2eb (patch) | |
| tree | 7864cae008cd64a881736fecedf12e5dd5611e83 /src/Grapheme.zig | |
| parent | GraphemeIterator ASCII optimization 3x faster (diff) | |
| download | zg-490fd008e29420e5b317fd5ef7526f3cc92ba2eb.tar.gz zg-490fd008e29420e5b317fd5ef7526f3cc92ba2eb.tar.xz zg-490fd008e29420e5b317fd5ef7526f3cc92ba2eb.zip | |
display_width tweaks
Diffstat (limited to 'src/Grapheme.zig')
| -rw-r--r-- | src/Grapheme.zig | 136 |
1 files changed, 71 insertions, 65 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index 6892a2a..910aec5 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -56,7 +56,7 @@ pub const GraphemeIterator = struct { | |||
| 56 | 56 | ||
| 57 | const gc_start = self.buf[0].?.offset; | 57 | const gc_start = self.buf[0].?.offset; |
| 58 | var gc_len: usize = self.buf[0].?.len; | 58 | var gc_len: usize = self.buf[0].?.len; |
| 59 | var state: u3 = 0; | 59 | var state = State{}; |
| 60 | 60 | ||
| 61 | if (graphemeBreak( | 61 | if (graphemeBreak( |
| 62 | self.buf[0].?.code, | 62 | self.buf[0].?.code, |
| @@ -95,36 +95,42 @@ fn isIgnorable(cp: u21) bool { | |||
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | // Grapheme break state. | 97 | // Grapheme break state. |
| 98 | // Extended Pictographic (emoji) | 98 | const State = struct { |
| 99 | fn hasXpic(state: *const u3) bool { | 99 | bits: u3 = 0, |
| 100 | return state.* & 1 == 1; | 100 | |
| 101 | } | 101 | // Extended Pictographic (emoji) |
| 102 | fn setXpic(state: *u3) void { | 102 | fn hasXpic(self: State) bool { |
| 103 | state.* |= 1; | 103 | return self.bits & 1 == 1; |
| 104 | } | 104 | } |
| 105 | fn unsetXpic(state: *u3) void { | 105 | fn setXpic(self: *State) void { |
| 106 | state.* ^= 1; | 106 | self.bits |= 1; |
| 107 | } | 107 | } |
| 108 | // Regional Indicator (flags) | 108 | fn unsetXpic(self: *State) void { |
| 109 | fn hasRegional(state: *const u3) bool { | 109 | self.bits ^= 1; |
| 110 | return state.* & 2 == 2; | 110 | } |
| 111 | } | 111 | |
| 112 | fn setRegional(state: *u3) void { | 112 | // Regional Indicator (flags) |
| 113 | state.* |= 2; | 113 | fn hasRegional(self: State) bool { |
| 114 | } | 114 | return self.bits & 2 == 2; |
| 115 | fn unsetRegional(state: *u3) void { | 115 | } |
| 116 | state.* ^= 2; | 116 | fn setRegional(self: *State) void { |
| 117 | } | 117 | self.bits |= 2; |
| 118 | // Indic Conjunct | 118 | } |
| 119 | fn hasIndic(state: *const u3) bool { | 119 | fn unsetRegional(self: *State) void { |
| 120 | return state.* & 4 == 4; | 120 | self.bits ^= 2; |
| 121 | } | 121 | } |
| 122 | fn setIndic(state: *u3) void { | 122 | |
| 123 | state.* |= 4; | 123 | // Indic Conjunct |
| 124 | } | 124 | fn hasIndic(self: State) bool { |
| 125 | fn unsetIndic(state: *u3) void { | 125 | return self.bits & 4 == 4; |
| 126 | state.* ^= 4; | 126 | } |
| 127 | } | 127 | fn setIndic(self: *State) void { |
| 128 | self.bits |= 4; | ||
| 129 | } | ||
| 130 | fn unsetIndic(self: *State) void { | ||
| 131 | self.bits ^= 4; | ||
| 132 | } | ||
| 133 | }; | ||
| 128 | 134 | ||
| 129 | /// `graphemeBreak` returns true only if a grapheme break point is required | 135 | /// `graphemeBreak` returns true only if a grapheme break point is required |
| 130 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | 136 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling |
| @@ -135,7 +141,7 @@ fn unsetIndic(state: *u3) void { | |||
| 135 | pub fn graphemeBreak( | 141 | pub fn graphemeBreak( |
| 136 | cp1: u21, | 142 | cp1: u21, |
| 137 | cp2: u21, | 143 | cp2: u21, |
| 138 | state: *u3, | 144 | state: *State, |
| 139 | ) bool { | 145 | ) bool { |
| 140 | // Extract relevant properties. | 146 | // Extract relevant properties. |
| 141 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | 147 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; |
| @@ -149,9 +155,9 @@ pub fn graphemeBreak( | |||
| 149 | const cp2_is_emoji = cp2_props_byte & 1 == 1; | 155 | const cp2_is_emoji = cp2_props_byte & 1 == 1; |
| 150 | 156 | ||
| 151 | // GB11: Emoji Extend* ZWJ x Emoji | 157 | // GB11: Emoji Extend* ZWJ x Emoji |
| 152 | if (!hasXpic(state) and cp1_is_emoji) setXpic(state); | 158 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); |
| 153 | // GB9c: Indic Conjunct Break | 159 | // GB9c: Indic Conjunct Break |
| 154 | if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); | 160 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); |
| 155 | 161 | ||
| 156 | // GB3: CR x LF | 162 | // GB3: CR x LF |
| 157 | if (cp1 == '\r' and cp2 == '\n') return false; | 163 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -159,23 +165,13 @@ pub fn graphemeBreak( | |||
| 159 | // GB4: Control | 165 | // GB4: Control |
| 160 | if (isBreaker(cp1)) return true; | 166 | if (isBreaker(cp1)) return true; |
| 161 | 167 | ||
| 162 | // GB6: Hangul L x (L|V|LV|LVT) | 168 | // GB11: Emoji Extend* ZWJ x Emoji |
| 163 | if (cp1_gbp_prop == .L) { | 169 | if (state.hasXpic() and |
| 164 | if (cp2_gbp_prop == .L or | 170 | cp1_gbp_prop == .ZWJ and |
| 165 | cp2_gbp_prop == .V or | 171 | cp2_is_emoji) |
| 166 | cp2_gbp_prop == .LV or | 172 | { |
| 167 | cp2_gbp_prop == .LVT) return false; | 173 | state.unsetXpic(); |
| 168 | } | 174 | return false; |
| 169 | |||
| 170 | // GB7: Hangul (LV | V) x (V | T) | ||
| 171 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { | ||
| 172 | if (cp2_gbp_prop == .V or | ||
| 173 | cp2_gbp_prop == .T) return false; | ||
| 174 | } | ||
| 175 | |||
| 176 | // GB8: Hangul (LVT | T) x T | ||
| 177 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { | ||
| 178 | if (cp2_gbp_prop == .T) return false; | ||
| 179 | } | 175 | } |
| 180 | 176 | ||
| 181 | // GB9b: x (Extend | ZWJ) | 177 | // GB9b: x (Extend | ZWJ) |
| @@ -189,44 +185,54 @@ pub fn graphemeBreak( | |||
| 189 | 185 | ||
| 190 | // GB12, GB13: RI x RI | 186 | // GB12, GB13: RI x RI |
| 191 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | 187 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { |
| 192 | if (hasRegional(state)) { | 188 | if (state.hasRegional()) { |
| 193 | unsetRegional(state); | 189 | state.unsetRegional(); |
| 194 | return true; | 190 | return true; |
| 195 | } else { | 191 | } else { |
| 196 | setRegional(state); | 192 | state.setRegional(); |
| 197 | return false; | 193 | return false; |
| 198 | } | 194 | } |
| 199 | } | 195 | } |
| 200 | 196 | ||
| 201 | // GB11: Emoji Extend* ZWJ x Emoji | 197 | // GB6: Hangul L x (L|V|LV|LVT) |
| 202 | if (hasXpic(state) and | 198 | if (cp1_gbp_prop == .L) { |
| 203 | cp1_gbp_prop == .ZWJ and | 199 | if (cp2_gbp_prop == .L or |
| 204 | cp2_is_emoji) | 200 | cp2_gbp_prop == .V or |
| 205 | { | 201 | cp2_gbp_prop == .LV or |
| 206 | unsetXpic(state); | 202 | cp2_gbp_prop == .LVT) return false; |
| 207 | return false; | 203 | } |
| 204 | |||
| 205 | // GB7: Hangul (LV | V) x (V | T) | ||
| 206 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { | ||
| 207 | if (cp2_gbp_prop == .V or | ||
| 208 | cp2_gbp_prop == .T) return false; | ||
| 209 | } | ||
| 210 | |||
| 211 | // GB8: Hangul (LVT | T) x T | ||
| 212 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { | ||
| 213 | if (cp2_gbp_prop == .T) return false; | ||
| 208 | } | 214 | } |
| 209 | 215 | ||
| 210 | // GB9c: Indic Conjunct Break | 216 | // GB9c: Indic Conjunct Break |
| 211 | if (hasIndic(state) and | 217 | if (state.hasIndic() and |
| 212 | cp1_indic_prop == .Consonant and | 218 | cp1_indic_prop == .Consonant and |
| 213 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | 219 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) |
| 214 | { | 220 | { |
| 215 | return false; | 221 | return false; |
| 216 | } | 222 | } |
| 217 | 223 | ||
| 218 | if (hasIndic(state) and | 224 | if (state.hasIndic() and |
| 219 | cp1_indic_prop == .Extend and | 225 | cp1_indic_prop == .Extend and |
| 220 | cp2_indic_prop == .Linker) | 226 | cp2_indic_prop == .Linker) |
| 221 | { | 227 | { |
| 222 | return false; | 228 | return false; |
| 223 | } | 229 | } |
| 224 | 230 | ||
| 225 | if (hasIndic(state) and | 231 | if (state.hasIndic() and |
| 226 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | 232 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and |
| 227 | cp2_indic_prop == .Consonant) | 233 | cp2_indic_prop == .Consonant) |
| 228 | { | 234 | { |
| 229 | unsetIndic(state); | 235 | state.unsetIndic(); |
| 230 | return false; | 236 | return false; |
| 231 | } | 237 | } |
| 232 | 238 | ||