diff options
| author | 2024-02-14 21:39:37 -0400 | |
|---|---|---|
| committer | 2024-02-14 21:39:37 -0400 | |
| commit | b2be2562b908d5563a8e71c44cbad577e4df4201 (patch) | |
| tree | eed831bdd2200247f9ffdb1adc8c2a051bd3f3e8 | |
| parent | Passing Unicode 15.1.0 Grapheme Break Tests (diff) | |
| download | zg-b2be2562b908d5563a8e71c44cbad577e4df4201.tar.gz zg-b2be2562b908d5563a8e71c44cbad577e4df4201.tar.xz zg-b2be2562b908d5563a8e71c44cbad577e4df4201.zip | |
gbp and indic direct array access
| -rw-r--r-- | codegen/grapheme_break.zig | 60 | ||||
| -rw-r--r-- | codegen/indic.zig | 32 | ||||
| -rw-r--r-- | src/Grapheme.zig | 89 |
3 files changed, 55 insertions, 126 deletions
diff --git a/codegen/grapheme_break.zig b/codegen/grapheme_break.zig index ace875c..95237f3 100644 --- a/codegen/grapheme_break.zig +++ b/codegen/grapheme_break.zig | |||
| @@ -112,7 +112,7 @@ pub fn main() !void { | |||
| 112 | const writer = out_buf.writer(); | 112 | const writer = out_buf.writer(); |
| 113 | 113 | ||
| 114 | const prop_code = | 114 | const prop_code = |
| 115 | \\const Prop = enum { | 115 | \\pub const Prop = enum { |
| 116 | \\ none, | 116 | \\ none, |
| 117 | \\ | 117 | \\ |
| 118 | \\ control, | 118 | \\ control, |
| @@ -132,75 +132,23 @@ pub fn main() !void { | |||
| 132 | 132 | ||
| 133 | try writer.writeAll(prop_code); | 133 | try writer.writeAll(prop_code); |
| 134 | 134 | ||
| 135 | try writer.print("const stage_1 = [{}]u16{{", .{stage1.items.len}); | 135 | try writer.print("pub const stage_1 = [{}]u16{{", .{stage1.items.len}); |
| 136 | for (stage1.items) |v| { | 136 | for (stage1.items) |v| { |
| 137 | _ = try writer.print("{},", .{v}); | 137 | _ = try writer.print("{},", .{v}); |
| 138 | } | 138 | } |
| 139 | try writer.writeAll("};\n"); | 139 | try writer.writeAll("};\n"); |
| 140 | 140 | ||
| 141 | try writer.print("const stage_2 = [{}]u4{{", .{stage2.items.len}); | 141 | try writer.print("pub const stage_2 = [{}]u4{{", .{stage2.items.len}); |
| 142 | for (stage2.items) |v| { | 142 | for (stage2.items) |v| { |
| 143 | _ = try writer.print("{},", .{v}); | 143 | _ = try writer.print("{},", .{v}); |
| 144 | } | 144 | } |
| 145 | try writer.writeAll("};\n"); | 145 | try writer.writeAll("};\n"); |
| 146 | 146 | ||
| 147 | try writer.print("const stage_3 = [{}]Prop{{", .{stage3.items.len}); | 147 | try writer.print("pub const stage_3 = [{}]Prop{{", .{stage3.items.len}); |
| 148 | for (stage3.items) |v| { | 148 | for (stage3.items) |v| { |
| 149 | _ = try writer.print(".{s},", .{@tagName(v)}); | 149 | _ = try writer.print(".{s},", .{@tagName(v)}); |
| 150 | } | 150 | } |
| 151 | try writer.writeAll("};\n"); | 151 | try writer.writeAll("};\n"); |
| 152 | 152 | ||
| 153 | const code = | ||
| 154 | \\inline fn getProp(cp: u21) Prop { | ||
| 155 | \\ const stage_1_index = cp >> 8; | ||
| 156 | \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); | ||
| 157 | \\ const stage_3_index = stage_2[stage_2_index]; | ||
| 158 | \\ return stage_3[stage_3_index]; | ||
| 159 | \\} | ||
| 160 | \\ | ||
| 161 | \\pub inline fn isControl(cp: u21) bool { | ||
| 162 | \\ return getProp(cp) == .control; | ||
| 163 | \\} | ||
| 164 | \\ | ||
| 165 | \\pub inline fn isExtend(cp: u21) bool { | ||
| 166 | \\ return getProp(cp) == .extend; | ||
| 167 | \\} | ||
| 168 | \\ | ||
| 169 | \\pub inline fn isL(cp: u21) bool { | ||
| 170 | \\ return getProp(cp) == .hangul_l; | ||
| 171 | \\} | ||
| 172 | \\pub inline fn isLv(cp: u21) bool { | ||
| 173 | \\ return getProp(cp) == .hangul_lv; | ||
| 174 | \\} | ||
| 175 | \\pub inline fn isLvt(cp: u21) bool { | ||
| 176 | \\ return getProp(cp) == .hangul_lvt; | ||
| 177 | \\} | ||
| 178 | \\pub inline fn isV(cp: u21) bool { | ||
| 179 | \\ return getProp(cp) == .hangul_v; | ||
| 180 | \\} | ||
| 181 | \\pub inline fn isT(cp: u21) bool { | ||
| 182 | \\ return getProp(cp) == .hangul_t; | ||
| 183 | \\} | ||
| 184 | \\ | ||
| 185 | \\pub inline fn isPrepend(cp: u21) bool { | ||
| 186 | \\ return getProp(cp) == .prepend; | ||
| 187 | \\} | ||
| 188 | \\ | ||
| 189 | \\pub inline fn isRegionalIndicator(cp: u21) bool { | ||
| 190 | \\ return getProp(cp) == .regional; | ||
| 191 | \\} | ||
| 192 | \\ | ||
| 193 | \\pub inline fn isSpacingmark(cp: u21) bool { | ||
| 194 | \\ return getProp(cp) == .spacing; | ||
| 195 | \\} | ||
| 196 | \\ | ||
| 197 | \\pub inline fn isZwj(cp: u21) bool { | ||
| 198 | \\ return getProp(cp) == .zwj; | ||
| 199 | \\} | ||
| 200 | \\ | ||
| 201 | ; | ||
| 202 | |||
| 203 | try writer.writeAll(code); | ||
| 204 | |||
| 205 | try out_buf.flush(); | 153 | try out_buf.flush(); |
| 206 | } | 154 | } |
diff --git a/codegen/indic.zig b/codegen/indic.zig index 871f1c5..07bcd92 100644 --- a/codegen/indic.zig +++ b/codegen/indic.zig | |||
| @@ -130,7 +130,7 @@ pub fn main() !void { | |||
| 130 | const writer = out_buf.writer(); | 130 | const writer = out_buf.writer(); |
| 131 | 131 | ||
| 132 | const prop_code = | 132 | const prop_code = |
| 133 | \\const Prop = enum { | 133 | \\pub const Prop = enum { |
| 134 | \\ none, | 134 | \\ none, |
| 135 | \\ | 135 | \\ |
| 136 | \\ Consonant, | 136 | \\ Consonant, |
| @@ -142,47 +142,23 @@ pub fn main() !void { | |||
| 142 | 142 | ||
| 143 | try writer.writeAll(prop_code); | 143 | try writer.writeAll(prop_code); |
| 144 | 144 | ||
| 145 | try writer.print("const stage_1 = [{}]u16{{", .{stage1.items.len}); | 145 | try writer.print("pub const stage_1 = [{}]u16{{", .{stage1.items.len}); |
| 146 | for (stage1.items) |v| { | 146 | for (stage1.items) |v| { |
| 147 | _ = try writer.print("{},", .{v}); | 147 | _ = try writer.print("{},", .{v}); |
| 148 | } | 148 | } |
| 149 | try writer.writeAll("};\n"); | 149 | try writer.writeAll("};\n"); |
| 150 | 150 | ||
| 151 | try writer.print("const stage_2 = [{}]u3{{", .{stage2.items.len}); | 151 | try writer.print("pub const stage_2 = [{}]u3{{", .{stage2.items.len}); |
| 152 | for (stage2.items) |v| { | 152 | for (stage2.items) |v| { |
| 153 | _ = try writer.print("{},", .{v}); | 153 | _ = try writer.print("{},", .{v}); |
| 154 | } | 154 | } |
| 155 | try writer.writeAll("};\n"); | 155 | try writer.writeAll("};\n"); |
| 156 | 156 | ||
| 157 | try writer.print("const stage_3 = [{}]Prop{{", .{stage3.items.len}); | 157 | try writer.print("pub const stage_3 = [{}]Prop{{", .{stage3.items.len}); |
| 158 | for (stage3.items) |v| { | 158 | for (stage3.items) |v| { |
| 159 | _ = try writer.print(".{s},", .{@tagName(v)}); | 159 | _ = try writer.print(".{s},", .{@tagName(v)}); |
| 160 | } | 160 | } |
| 161 | try writer.writeAll("};\n"); | 161 | try writer.writeAll("};\n"); |
| 162 | 162 | ||
| 163 | const code = | ||
| 164 | \\inline fn getProp(cp: u21) Prop { | ||
| 165 | \\ const stage_1_index = cp >> 8; | ||
| 166 | \\ const stage_2_index = stage_1[stage_1_index] + (cp & 0xff); | ||
| 167 | \\ const stage_3_index = stage_2[stage_2_index]; | ||
| 168 | \\ return stage_3[stage_3_index]; | ||
| 169 | \\} | ||
| 170 | \\ | ||
| 171 | \\pub inline fn isConsonant(cp: u21) bool { | ||
| 172 | \\ return getProp(cp) == .Consonant; | ||
| 173 | \\} | ||
| 174 | \\ | ||
| 175 | \\pub inline fn isExtend(cp: u21) bool { | ||
| 176 | \\ return getProp(cp) == .Extend; | ||
| 177 | \\} | ||
| 178 | \\ | ||
| 179 | \\pub inline fn isLinker(cp: u21) bool { | ||
| 180 | \\ return getProp(cp) == .Linker; | ||
| 181 | \\} | ||
| 182 | \\ | ||
| 183 | ; | ||
| 184 | |||
| 185 | try writer.writeAll(code); | ||
| 186 | |||
| 187 | try out_buf.flush(); | 163 | try out_buf.flush(); |
| 188 | } | 164 | } |
diff --git a/src/Grapheme.zig b/src/Grapheme.zig index a0ec900..56eecbe 100644 --- a/src/Grapheme.zig +++ b/src/Grapheme.zig | |||
| @@ -82,26 +82,27 @@ pub const GraphemeIterator = struct { | |||
| 82 | }; | 82 | }; |
| 83 | 83 | ||
| 84 | // Predicates | 84 | // Predicates |
| 85 | fn isBreaker(cp: u21) bool { | 85 | inline fn isBreaker(cp: u21) bool { |
| 86 | return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); | 86 | return cp == '\x0d' or cp == '\x0a' or gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]] == .control; |
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | fn isIgnorable(cp: u21) bool { | 89 | inline fn isIgnorable(cp: u21) bool { |
| 90 | return gbp.isExtend(cp) or gbp.isSpacingmark(cp) or cp == '\u{200d}'; | 90 | const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; |
| 91 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | ||
| 91 | } | 92 | } |
| 92 | 93 | ||
| 93 | // test "Segmentation comptime GraphemeIterator" { | 94 | test "Segmentation comptime GraphemeIterator" { |
| 94 | // const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | 95 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; |
| 95 | // | 96 | |
| 96 | // comptime { | 97 | comptime { |
| 97 | // const src = "Héllo"; | 98 | const src = "Héllo"; |
| 98 | // var ct_iter = GraphemeIterator.init(src); | 99 | var ct_iter = GraphemeIterator.init(src); |
| 99 | // var i = 0; | 100 | var i = 0; |
| 100 | // while (ct_iter.next()) |grapheme| : (i += 1) { | 101 | while (ct_iter.next()) |grapheme| : (i += 1) { |
| 101 | // try std.testing.expect(grapheme.eql(src, want[i])); | 102 | try std.testing.expect(grapheme.eql(src, want[i])); |
| 102 | // } | 103 | } |
| 103 | // } | 104 | } |
| 104 | // } | 105 | } |
| 105 | 106 | ||
| 106 | test "Segmentation ZWJ and ZWSP emoji sequences" { | 107 | test "Segmentation ZWJ and ZWSP emoji sequences" { |
| 107 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | 108 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; |
| @@ -172,7 +173,9 @@ pub fn graphemeBreak( | |||
| 172 | // GB11: Emoji Extend* ZWJ x Emoji | 173 | // GB11: Emoji Extend* ZWJ x Emoji |
| 173 | if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); | 174 | if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); |
| 174 | // GB9c: Indic Conjunct Break | 175 | // GB9c: Indic Conjunct Break |
| 175 | if (!hasIndic(state) and indic.isConsonant(cp1)) setIndic(state); | 176 | const cp1_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; |
| 177 | const cp2_indic_prop = indic.stage_3[indic.stage_2[indic.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | ||
| 178 | if (!hasIndic(state) and cp1_indic_prop == .Consonant) setIndic(state); | ||
| 176 | 179 | ||
| 177 | // GB3: CR x LF | 180 | // GB3: CR x LF |
| 178 | if (cp1 == '\r' and cp2 == '\n') return false; | 181 | if (cp1 == '\r' and cp2 == '\n') return false; |
| @@ -181,35 +184,37 @@ pub fn graphemeBreak( | |||
| 181 | if (isBreaker(cp1)) return true; | 184 | if (isBreaker(cp1)) return true; |
| 182 | 185 | ||
| 183 | // GB6: Hangul L x (L|V|LV|LVT) | 186 | // GB6: Hangul L x (L|V|LV|LVT) |
| 184 | if (gbp.isL(cp1)) { | 187 | const cp1_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; |
| 185 | if (gbp.isL(cp2) or | 188 | const cp2_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; |
| 186 | gbp.isV(cp2) or | 189 | if (cp1_gbp_prop == .hangul_l) { |
| 187 | gbp.isLv(cp2) or | 190 | if (cp2_gbp_prop == .hangul_l or |
| 188 | gbp.isLvt(cp2)) return false; | 191 | cp2_gbp_prop == .hangul_v or |
| 192 | cp2_gbp_prop == .hangul_lv or | ||
| 193 | cp2_gbp_prop == .hangul_lvt) return false; | ||
| 189 | } | 194 | } |
| 190 | 195 | ||
| 191 | // GB7: Hangul (LV | V) x (V | T) | 196 | // GB7: Hangul (LV | V) x (V | T) |
| 192 | if (gbp.isLv(cp1) or gbp.isV(cp1)) { | 197 | if (cp1_gbp_prop == .hangul_lv or cp1_gbp_prop == .hangul_v) { |
| 193 | if (gbp.isV(cp2) or | 198 | if (cp2_gbp_prop == .hangul_v or |
| 194 | gbp.isT(cp2)) return false; | 199 | cp2_gbp_prop == .hangul_t) return false; |
| 195 | } | 200 | } |
| 196 | 201 | ||
| 197 | // GB8: Hangul (LVT | T) x T | 202 | // GB8: Hangul (LVT | T) x T |
| 198 | if (gbp.isLvt(cp1) or gbp.isT(cp1)) { | 203 | if (cp1_gbp_prop == .hangul_lvt or cp1_gbp_prop == .hangul_t) { |
| 199 | if (gbp.isT(cp2)) return false; | 204 | if (cp2_gbp_prop == .hangul_t) return false; |
| 200 | } | 205 | } |
| 201 | 206 | ||
| 202 | // GB9: x (Extend | ZWJ) | 207 | // GB9: x (Extend | ZWJ) |
| 203 | if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) return false; | 208 | if (cp2_gbp_prop == .extend or cp2_gbp_prop == .zwj) return false; |
| 204 | 209 | ||
| 205 | // GB9a: x Spacing | 210 | // GB9a: x Spacing |
| 206 | if (gbp.isSpacingmark(cp2)) return false; | 211 | if (cp2_gbp_prop == .spacing) return false; |
| 207 | 212 | ||
| 208 | // GB9b: Prepend x | 213 | // GB9b: Prepend x |
| 209 | if (gbp.isPrepend(cp1) and !isBreaker(cp2)) return false; | 214 | if (cp1_gbp_prop == .prepend and !isBreaker(cp2)) return false; |
| 210 | 215 | ||
| 211 | // GB12, GB13: RI x RI | 216 | // GB12, GB13: RI x RI |
| 212 | if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { | 217 | if (cp1_gbp_prop == .regional and cp2_gbp_prop == .regional) { |
| 213 | if (hasRegional(state)) { | 218 | if (hasRegional(state)) { |
| 214 | unsetRegional(state); | 219 | unsetRegional(state); |
| 215 | return true; | 220 | return true; |
| @@ -221,7 +226,7 @@ pub fn graphemeBreak( | |||
| 221 | 226 | ||
| 222 | // GB11: Emoji Extend* ZWJ x Emoji | 227 | // GB11: Emoji Extend* ZWJ x Emoji |
| 223 | if (hasXpic(state) and | 228 | if (hasXpic(state) and |
| 224 | gbp.isZwj(cp1) and | 229 | cp1_gbp_prop == .zwj and |
| 225 | emoji.isExtendedPictographic(cp2)) | 230 | emoji.isExtendedPictographic(cp2)) |
| 226 | { | 231 | { |
| 227 | unsetXpic(state); | 232 | unsetXpic(state); |
| @@ -230,37 +235,37 @@ pub fn graphemeBreak( | |||
| 230 | 235 | ||
| 231 | // GB9c: Indic Conjunct Break | 236 | // GB9c: Indic Conjunct Break |
| 232 | if (hasIndic(state) and | 237 | if (hasIndic(state) and |
| 233 | indic.isConsonant(cp1) and | 238 | cp1_indic_prop == .Consonant and |
| 234 | indic.isExtend(cp2)) | 239 | cp2_indic_prop == .Extend) |
| 235 | { | 240 | { |
| 236 | return false; | 241 | return false; |
| 237 | } | 242 | } |
| 238 | 243 | ||
| 239 | if (hasIndic(state) and | 244 | if (hasIndic(state) and |
| 240 | indic.isConsonant(cp1) and | 245 | cp1_indic_prop == .Consonant and |
| 241 | indic.isLinker(cp2)) | 246 | cp2_indic_prop == .Linker) |
| 242 | { | 247 | { |
| 243 | return false; | 248 | return false; |
| 244 | } | 249 | } |
| 245 | 250 | ||
| 246 | if (hasIndic(state) and | 251 | if (hasIndic(state) and |
| 247 | indic.isExtend(cp1) and | 252 | cp1_indic_prop == .Extend and |
| 248 | indic.isLinker(cp2)) | 253 | cp2_indic_prop == .Linker) |
| 249 | { | 254 | { |
| 250 | return false; | 255 | return false; |
| 251 | } | 256 | } |
| 252 | 257 | ||
| 253 | if (hasIndic(state) and | 258 | if (hasIndic(state) and |
| 254 | indic.isLinker(cp1) and | 259 | cp1_indic_prop == .Linker and |
| 255 | indic.isConsonant(cp2)) | 260 | cp2_indic_prop == .Consonant) |
| 256 | { | 261 | { |
| 257 | unsetIndic(state); | 262 | unsetIndic(state); |
| 258 | return false; | 263 | return false; |
| 259 | } | 264 | } |
| 260 | 265 | ||
| 261 | if (hasIndic(state) and | 266 | if (hasIndic(state) and |
| 262 | gbp.isZwj(cp1) and | 267 | cp1_gbp_prop == .zwj and |
| 263 | indic.isConsonant(cp2)) | 268 | cp2_indic_prop == .Consonant) |
| 264 | { | 269 | { |
| 265 | unsetIndic(state); | 270 | unsetIndic(state); |
| 266 | return false; | 271 | return false; |