summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jose Colon Rodriguez, 2024-02-14 21:39:37 -0400
committer: Jose Colon Rodriguez, 2024-02-14 21:39:37 -0400
commit: b2be2562b908d5563a8e71c44cbad577e4df4201 (patch)
tree: eed831bdd2200247f9ffdb1adc8c2a051bd3f3e8
parent: Passing Unicode 15.1.0 Grapheme Break Tests (diff)
download: zg-b2be2562b908d5563a8e71c44cbad577e4df4201.tar.gz
zg-b2be2562b908d5563a8e71c44cbad577e4df4201.tar.xz
zg-b2be2562b908d5563a8e71c44cbad577e4df4201.zip
gbp and indic direct array access
-rw-r--r-- codegen/grapheme_break.zig 60
-rw-r--r-- codegen/indic.zig 32
-rw-r--r-- src/Grapheme.zig 89
3 files changed, 55 insertions, 126 deletions
diff --git a/codegen/grapheme_break.zig b/codegen/grapheme_break.zig
index ace875c..95237f3 100644
--- a/codegen/grapheme_break.zig
+++ b/codegen/grapheme_break.zig
@@ -112,7 +112,7 @@ pub fn main() !void {
112 const writer = out_buf.writer(); 112 const writer = out_buf.writer();
113 113
114 const prop_code = 114 const prop_code =
115 \\const Prop = enum { 115 \\pub const Prop = enum {
116 \\ none, 116 \\ none,
117 \\ 117 \\
118 \\ control, 118 \\ control,
@@ -132,75 +132,23 @@ pub fn main() !void {
132 132
133 try writer.writeAll(prop_code); 133 try writer.writeAll(prop_code);
134 134
135 try writer.print("const stage_1 = [{}]u16{{", .{stage1.items.len}); 135 try writer.print("pub const stage_1 = [{}]u16{{", .{stage1.items.len});
136 for (stage1.items) |v| { 136 for (stage1.items) |v| {
137 _ = try writer.print("{},", .{v}); 137 _ = try writer.print("{},", .{v});
138 } 138 }
139 try writer.writeAll("};\n"); 139 try writer.writeAll("};\n");
140 140
141 try writer.print("const stage_2 = [{}]u4{{", .{stage2.items.len}); 141 try writer.print("pub const stage_2 = [{}]u4{{", .{stage2.items.len});
142 for (stage2.items) |v| { 142 for (stage2.items) |v| {
143 _ = try writer.print("{},", .{v}); 143 _ = try writer.print("{},", .{v});
144 } 144 }
145 try writer.writeAll("};\n"); 145 try writer.writeAll("};\n");
146 146
147 try writer.print("const stage_3 = [{}]Prop{{", .{stage3.items.len}); 147 try writer.print("pub const stage_3 = [{}]Prop{{", .{stage3.items.len});
148 for (stage3.items) |v| { 148 for (stage3.items) |v| {
149 _ = try writer.print(".{s},", .{@tagName(v)}); 149 _ = try writer.print(".{s},", .{@tagName(v)});
150 } 150 }
151 try writer.writeAll("};\n"); 151 try writer.writeAll("};\n");
152 152
153 const code =
154 \\inline fn getProp(cp: u21) Prop {
155 \\ const stage_1_index = cp >> 8;
156 \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff);
157 \\ const stage_3_index = stage_2[stage_2_index];
158 \\ return stage_3[stage_3_index];
159 \\}
160 \\
161 \\pub inline fn isControl(cp: u21) bool {
162 \\ return getProp(cp) == .control;
163 \\}
164 \\
165 \\pub inline fn isExtend(cp: u21) bool {
166 \\ return getProp(cp) == .extend;
167 \\}
168 \\
169 \\pub inline fn isL(cp: u21) bool {
170 \\ return getProp(cp) == .hangul_l;
171 \\}
172 \\pub inline fn isLv(cp: u21) bool {
173 \\ return getProp(cp) == .hangul_lv;
174 \\}
175 \\pub inline fn isLvt(cp: u21) bool {
176 \\ return getProp(cp) == .hangul_lvt;
177 \\}
178 \\pub inline fn isV(cp: u21) bool {
179 \\ return getProp(cp) == .hangul_v;
180 \\}
181 \\pub inline fn isT(cp: u21) bool {
182 \\ return getProp(cp) == .hangul_t;
183 \\}
184 \\
185 \\pub inline fn isPrepend(cp: u21) bool {
186 \\ return getProp(cp) == .prepend;
187 \\}
188 \\
189 \\pub inline fn isRegionalIndicator(cp: u21) bool {
190 \\ return getProp(cp) == .regional;
191 \\}
192 \\
193 \\pub inline fn isSpacingmark(cp: u21) bool {
194 \\ return getProp(cp) == .spacing;
195 \\}
196 \\
197 \\pub inline fn isZwj(cp: u21) bool {
198 \\ return getProp(cp) == .zwj;
199 \\}
200 \\
201 ;
202
203 try writer.writeAll(code);
204
205 try out_buf.flush(); 153 try out_buf.flush();
206} 154}
diff --git a/codegen/indic.zig b/codegen/indic.zig
index 871f1c5..07bcd92 100644
--- a/codegen/indic.zig
+++ b/codegen/indic.zig
@@ -130,7 +130,7 @@ pub fn main() !void {
130 const writer = out_buf.writer(); 130 const writer = out_buf.writer();
131 131
132 const prop_code = 132 const prop_code =
133 \\const Prop = enum { 133 \\pub const Prop = enum {
134 \\ none, 134 \\ none,
135 \\ 135 \\
136 \\ Consonant, 136 \\ Consonant,
@@ -142,47 +142,23 @@ pub fn main() !void {
142 142
143 try writer.writeAll(prop_code); 143 try writer.writeAll(prop_code);
144 144
145 try writer.print("const stage_1 = [{}]u16{{", .{stage1.items.len}); 145 try writer.print("pub const stage_1 = [{}]u16{{", .{stage1.items.len});
146 for (stage1.items) |v| { 146 for (stage1.items) |v| {
147 _ = try writer.print("{},", .{v}); 147 _ = try writer.print("{},", .{v});
148 } 148 }
149 try writer.writeAll("};\n"); 149 try writer.writeAll("};\n");
150 150
151 try writer.print("const stage_2 = [{}]u3{{", .{stage2.items.len}); 151 try writer.print("pub const stage_2 = [{}]u3{{", .{stage2.items.len});
152 for (stage2.items) |v| { 152 for (stage2.items) |v| {
153 _ = try writer.print("{},", .{v}); 153 _ = try writer.print("{},", .{v});
154 } 154 }
155 try writer.writeAll("};\n"); 155 try writer.writeAll("};\n");
156 156
157 try writer.print("const stage_3 = [{}]Prop{{", .{stage3.items.len}); 157 try writer.print("pub const stage_3 = [{}]Prop{{", .{stage3.items.len});
158 for (stage3.items) |v| { 158 for (stage3.items) |v| {
159 _ = try writer.print(".{s},", .{@tagName(v)}); 159 _ = try writer.print(".{s},", .{@tagName(v)});
160 } 160 }
161 try writer.writeAll("};\n"); 161 try writer.writeAll("};\n");
162 162
163 const code =
164 \\inline fn getProp(cp: u21) Prop {
165 \\ const stage_1_index = cp >> 8;
166 \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff);
167 \\ const stage_3_index = stage_2[stage_2_index];
168 \\ return stage_3[stage_3_index];
169 \\}
170 \\
171 \\pub inline fn isConsonant(cp: u21) bool {
172 \\ return getProp(cp) == .Consonant;
173 \\}
174 \\
175 \\pub inline fn isExtend(cp: u21) bool {
176 \\ return getProp(cp) == .Extend;
177 \\}
178 \\
179 \\pub inline fn isLinker(cp: u21) bool {
180 \\ return getProp(cp) == .Linker;
181 \\}
182 \\
183 ;
184
185 try writer.writeAll(code);
186
187 try out_buf.flush(); 163 try out_buf.flush();
188} 164}
diff --git a/src/Grapheme.zig b/src/Grapheme.zig
index a0ec900..56eecbe 100644
--- a/src/Grapheme.zig
+++ b/src/Grapheme.zig
@@ -82,26 +82,27 @@ pub const GraphemeIterator = struct {
82}; 82};
83 83
84// Predicates 84// Predicates
85fn isBreaker(cp: u21) bool { 85inline fn isBreaker(cp: u21) bool {
86 return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); 86 return cp == '\x0d' or cp == '\x0a' or gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]] == .control;
87} 87}
88 88
89fn isIgnorable(cp: u21) bool { 89inline fn isIgnorable(cp: u21) bool {
90 return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; 90 const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]];
91 return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}';
91} 92}
92 93
93// test "Segmentation comptime GraphemeIterator" { 94test "Segmentation comptime GraphemeIterator" {
94// const want = [_][]const u8{ "H", "é", "l", "l", "o" }; 95 const want = [_][]const u8{ "H", "é", "l", "l", "o" };
95// 96
96// comptime { 97 comptime {
97// const src = "Héllo"; 98 const src = "Héllo";
98// var ct_iter = GraphemeIterator.init(src); 99 var ct_iter = GraphemeIterator.init(src);
99// var i = 0; 100 var i = 0;
100// while (ct_iter.next()) |grapheme| : (i += 1) { 101 while (ct_iter.next()) |grapheme| : (i += 1) {
101// try std.testing.expect(grapheme.eql(src, want[i])); 102 try std.testing.expect(grapheme.eql(src, want[i]));
102// } 103 }
103// } 104 }
104// } 105}
105 106
106test "Segmentation ZWJ and ZWSP emoji sequences" { 107test "Segmentation ZWJ and ZWSP emoji sequences" {
107 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; 108 const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
@@ -172,7 +173,9 @@ pub fn graphemeBreak(
172 // GB11: Emoji Extend* ZWJ x Emoji 173 // GB11: Emoji Extend* ZWJ x Emoji
173 if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); 174 if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);
174 // GB9c: Indic Conjunct Break 175 // GB9c: Indic Conjunct Break
175 if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); 176 const cp1_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
177 const cp2_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp2 >> 8] + (cp2 & 0xff)]];
178 if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state);
176 179
177 // GB3: CR x LF 180 // GB3: CR x LF
178 if (cp1 == '\r' and cp2 == '\n') return false; 181 if (cp1 == '\r' and cp2 == '\n') return false;
@@ -181,35 +184,37 @@ pub fn graphemeBreak(
181 if (isBreaker(cp1)) return true; 184 if (isBreaker(cp1)) return true;
182 185
183 // GB6: Hangul L x (L|V|LV|VT) 186 // GB6: Hangul L x (L|V|LV|VT)
184 if (gbp.isL(cp1)) { 187 const cp1_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]];
185 if (gbp.isL(cp2) or 188 const cp2_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]];
186 gbp.isV(cp2) or 189 if (cp1_gbp_prop == .hangul_l) {
187 gbp.isLv(cp2) or 190 if (cp2_gbp_prop == .hangul_l or
188 gbp.isLvt(cp2)) return false; 191 cp2_gbp_prop == .hangul_v or
192 cp2_gbp_prop == .hangul_lv or
193 cp2_gbp_prop == .hangul_lvt) return false;
189 } 194 }
190 195
191 // GB7: Hangul (LV | V) x (V | T) 196 // GB7: Hangul (LV | V) x (V | T)
192 if (gbp.isLv(cp1) or gbp.isV(cp1)) { 197 if (cp1_gbp_prop == .hangul_lv or cp1_gbp_prop == .hangul_v) {
193 if (gbp.isV(cp2) or 198 if (cp2_gbp_prop == .hangul_v or
194 gbp.isT(cp2)) return false; 199 cp2_gbp_prop == .hangul_t) return false;
195 } 200 }
196 201
197 // GB8: Hangul (LVT | T) x T 202 // GB8: Hangul (LVT | T) x T
198 if (gbp.isLvt(cp1) or gbp.isT(cp1)) { 203 if (cp1_gbp_prop == .hangul_lvt or cp1_gbp_prop == .hangul_t) {
199 if (gbp.isT(cp2)) return false; 204 if (cp2_gbp_prop == .hangul_t) return false;
200 } 205 }
201 206
202 // GB9b: x (Extend | ZWJ) 207 // GB9b: x (Extend | ZWJ)
203 if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) return false; 208 if (cp2_gbp_prop == .extend or cp2_gbp_prop == .zwj) return false;
204 209
205 // GB9a: x Spacing 210 // GB9a: x Spacing
206 if (gbp.isSpacingmark(cp2)) return false; 211 if (cp2_gbp_prop == .spacing) return false;
207 212
208 // GB9b: Prepend x 213 // GB9b: Prepend x
209 if (gbp.isPrepend(cp1) and !isBreaker(cp2)) return false; 214 if (cp1_gbp_prop == .prepend and !isBreaker(cp2)) return false;
210 215
211 // GB12, GB13: RI x RI 216 // GB12, GB13: RI x RI
212 if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { 217 if (cp1_gbp_prop == .regional and cp2_gbp_prop == .regional) {
213 if (hasRegional(state)) { 218 if (hasRegional(state)) {
214 unsetRegional(state); 219 unsetRegional(state);
215 return true; 220 return true;
@@ -221,7 +226,7 @@ pub fn graphemeBreak(
221 226
222 // GB11: Emoji Extend* ZWJ x Emoji 227 // GB11: Emoji Extend* ZWJ x Emoji
223 if (hasXpic(state) and 228 if (hasXpic(state) and
224 gbp.isZwj(cp1) and 229 cp1_gbp_prop == .zwj and
225 emoji.isExtendedPictographic(cp2)) 230 emoji.isExtendedPictographic(cp2))
226 { 231 {
227 unsetXpic(state); 232 unsetXpic(state);
@@ -230,37 +235,37 @@ pub fn graphemeBreak(
230 235
231 // GB9c: Indic Conjunct Break 236 // GB9c: Indic Conjunct Break
232 if (hasIndic(state) and 237 if (hasIndic(state) and
233 indic.isConsonant(cp1) and 238 cp1_indic_prop == .Consonant and
234 indic.isExtend(cp2)) 239 cp2_indic_prop == .Extend)
235 { 240 {
236 return false; 241 return false;
237 } 242 }
238 243
239 if (hasIndic(state) and 244 if (hasIndic(state) and
240 indic.isConsonant(cp1) and 245 cp1_indic_prop == .Consonant and
241 indic.isLinker(cp2)) 246 cp2_indic_prop == .Linker)
242 { 247 {
243 return false; 248 return false;
244 } 249 }
245 250
246 if (hasIndic(state) and 251 if (hasIndic(state) and
247 indic.isExtend(cp1) and 252 cp1_indic_prop == .Extend and
248 indic.isLinker(cp2)) 253 cp2_indic_prop == .Linker)
249 { 254 {
250 return false; 255 return false;
251 } 256 }
252 257
253 if (hasIndic(state) and 258 if (hasIndic(state) and
254 indic.isLinker(cp1) and 259 cp1_indic_prop == .Linker and
255 indic.isConsonant(cp2)) 260 cp2_indic_prop == .Consonant)
256 { 261 {
257 unsetIndic(state); 262 unsetIndic(state);
258 return false; 263 return false;
259 } 264 }
260 265
261 if (hasIndic(state) and 266 if (hasIndic(state) and
262 gbp.isZwj(cp1) and 267 cp1_gbp_prop == .zwj and
263 indic.isConsonant(cp2)) 268 cp2_indic_prop == .Consonant)
264 { 269 {
265 unsetIndic(state); 270 unsetIndic(state);
266 return false; 271 return false;