summaryrefslogtreecommitdiff
path: root/src/Grapheme.zig
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-17 11:31:52 -0400
committerGravatar Jose Colon Rodriguez2024-02-17 11:31:52 -0400
commit490fd008e29420e5b317fd5ef7526f3cc92ba2eb (patch)
tree7864cae008cd64a881736fecedf12e5dd5611e83 /src/Grapheme.zig
parentGraphemeIterator ASCII optimization 3x faster (diff)
downloadzg-490fd008e29420e5b317fd5ef7526f3cc92ba2eb.tar.gz
zg-490fd008e29420e5b317fd5ef7526f3cc92ba2eb.tar.xz
zg-490fd008e29420e5b317fd5ef7526f3cc92ba2eb.zip
display_width tweaks
Diffstat (limited to 'src/Grapheme.zig')
-rw-r--r--src/Grapheme.zig136
1 files changed, 71 insertions, 65 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index 6892a2a..910aec5 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -56,7 +56,7 @@ pub const GraphemeIterator = struct {
56 56
57 const gc_start = self.buf[0].?.offset; 57 const gc_start = self.buf[0].?.offset;
58 var gc_len: usize = self.buf[0].?.len; 58 var gc_len: usize = self.buf[0].?.len;
59 var state: u3 = 0; 59 var state = State{};
60 60
61 if (graphemeBreak( 61 if (graphemeBreak(
62 self.buf[0].?.code, 62 self.buf[0].?.code,
@@ -95,36 +95,42 @@ fn isIgnorable(cp: u21) bool {
95} 95}
96 96
97// Grapheme break state. 97// Grapheme break state.
98// Extended Pictographic (emoji) 98const State = struct {
99fn hasXpic(state: *const u3) bool { 99 bits: u3 = 0,
100 return state.* & 1 == 1; 100
101} 101 // Extended Pictographic (emoji)
102fn setXpic(state: *u3) void { 102 fn hasXpic(self: State) bool {
103 state.* |= 1; 103 return self.bits & 1 == 1;
104} 104 }
105fn unsetXpic(state: *u3) void { 105 fn setXpic(self: *State) void {
106 state.* ^= 1; 106 self.bits |= 1;
107} 107 }
108// Regional Indicator (flags) 108 fn unsetXpic(self: *State) void {
109fn hasRegional(state: *const u3) bool { 109 self.bits ^= 1;
110 return state.* & 2 == 2; 110 }
111} 111
112fn setRegional(state: *u3) void { 112 // Regional Indicator (flags)
113 state.* |= 2; 113 fn hasRegional(self: State) bool {
114} 114 return self.bits & 2 == 2;
115fn unsetRegional(state: *u3) void { 115 }
116 state.* ^= 2; 116 fn setRegional(self: *State) void {
117} 117 self.bits |= 2;
118// Indic Conjunct 118 }
119fn hasIndic(state: *const u3) bool { 119 fn unsetRegional(self: *State) void {
120 return state.* & 4 == 4; 120 self.bits ^= 2;
121} 121 }
122fn setIndic(state: *u3) void { 122
123 state.* |= 4; 123 // Indic Conjunct
124} 124 fn hasIndic(self: State) bool {
125fn unsetIndic(state: *u3) void { 125 return self.bits & 4 == 4;
126 state.* ^= 4; 126 }
127} 127 fn setIndic(self: *State) void {
128 self.bits |= 4;
129 }
130 fn unsetIndic(self: *State) void {
131 self.bits ^= 4;
132 }
133};
128 134
129/// `graphemeBreak` returns true only if a grapheme break point is required 135/// `graphemeBreak` returns true only if a grapheme break point is required
130/// between `cp1` and `cp2`. `state` should start out as 0. If calling 136/// between `cp1` and `cp2`. `state` should start out as 0. If calling
@@ -135,7 +141,7 @@ fn unsetIndic(state: *u3) void {
135pub fn graphemeBreak( 141pub fn graphemeBreak(
136 cp1: u21, 142 cp1: u21,
137 cp2: u21, 143 cp2: u21,
138 state: *u3, 144 state: *State,
139) bool { 145) bool {
140 // Extract relevant properties. 146 // Extract relevant properties.
141 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; 147 const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
@@ -149,9 +155,9 @@ pub fn graphemeBreak(
149 const cp2_is_emoji = cp2_props_byte & 1 == 1; 155 const cp2_is_emoji = cp2_props_byte & 1 == 1;
150 156
151 // GB11: Emoji Extend* ZWJ x Emoji 157 // GB11: Emoji Extend* ZWJ x Emoji
152 if (!hasXpic(state) and cp1_is_emoji) setXpic(state); 158 if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
153 // GB9c: Indic Conjunct Break 159 // GB9c: Indic Conjunct Break
154 if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); 160 if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();
155 161
156 // GB3: CR x LF 162 // GB3: CR x LF
157 if (cp1 == '\r' and cp2 == '\n') return false; 163 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -159,23 +165,13 @@ pub fn graphemeBreak(
159 // GB4: Control 165 // GB4: Control
160 if (isBreaker(cp1)) return true; 166 if (isBreaker(cp1)) return true;
161 167
162 // GB6: Hangul L x (L|V|LV|LVT) 168 // GB11: Emoji Extend* ZWJ x Emoji
163 if (cp1_gbp_prop == .L) { 169 if (state.hasXpic() and
164 if (cp2_gbp_prop == .L or 170 cp1_gbp_prop == .ZWJ and
165 cp2_gbp_prop == .V or 171 cp2_is_emoji)
166 cp2_gbp_prop == .LV or 172 {
167 cp2_gbp_prop == .LVT) return false; 173 state.unsetXpic();
168 } 174 return false;
169
170 // GB7: Hangul (LV | V) x (V | T)
171 if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
172 if (cp2_gbp_prop == .V or
173 cp2_gbp_prop == .T) return false;
174 }
175
176 // GB8: Hangul (LVT | T) x T
177 if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
178 if (cp2_gbp_prop == .T) return false;
179 } 175 }
180 176
181 // GB9b: x (Extend | ZWJ) 177 // GB9b: x (Extend | ZWJ)
@@ -189,44 +185,54 @@ pub fn graphemeBreak(
189 185
190 // GB12, GB13: RI x RI 186 // GB12, GB13: RI x RI
191 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { 187 if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
192 if (hasRegional(state)) { 188 if (state.hasRegional()) {
193 unsetRegional(state); 189 state.unsetRegional();
194 return true; 190 return true;
195 } else { 191 } else {
196 setRegional(state); 192 state.setRegional();
197 return false; 193 return false;
198 } 194 }
199 } 195 }
200 196
201 // GB11: Emoji Extend* ZWJ x Emoji 197 // GB6: Hangul L x (L|V|LV|LVT)
202 if (hasXpic(state) and 198 if (cp1_gbp_prop == .L) {
203 cp1_gbp_prop == .ZWJ and 199 if (cp2_gbp_prop == .L or
204 cp2_is_emoji) 200 cp2_gbp_prop == .V or
205 { 201 cp2_gbp_prop == .LV or
206 unsetXpic(state); 202 cp2_gbp_prop == .LVT) return false;
207 return false; 203 }
204
205 // GB7: Hangul (LV | V) x (V | T)
206 if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
207 if (cp2_gbp_prop == .V or
208 cp2_gbp_prop == .T) return false;
209 }
210
211 // GB8: Hangul (LVT | T) x T
212 if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
213 if (cp2_gbp_prop == .T) return false;
208 } 214 }
209 215
210 // GB9c: Indic Conjunct Break 216 // GB9c: Indic Conjunct Break
211 if (hasIndic(state) and 217 if (state.hasIndic() and
212 cp1_indic_prop == .Consonant and 218 cp1_indic_prop == .Consonant and
213 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) 219 (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
214 { 220 {
215 return false; 221 return false;
216 } 222 }
217 223
218 if (hasIndic(state) and 224 if (state.hasIndic() and
219 cp1_indic_prop == .Extend and 225 cp1_indic_prop == .Extend and
220 cp2_indic_prop == .Linker) 226 cp2_indic_prop == .Linker)
221 { 227 {
222 return false; 228 return false;
223 } 229 }
224 230
225 if (hasIndic(state) and 231 if (state.hasIndic() and
226 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and 232 (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
227 cp2_indic_prop == .Consonant) 233 cp2_indic_prop == .Consonant)
228 { 234 {
229 unsetIndic(state); 235 state.unsetIndic();
230 return false; 236 return false;
231 } 237 }
232 238