diff options
| author | 2024-02-18 11:26:00 -0400 | |
|---|---|---|
| committer | 2024-02-18 11:26:00 -0400 | |
| commit | 2c14444bc9054604b71ad5b0e8fb3c3e34877e58 (patch) | |
| tree | b6713a4413288445fd0ec2e7fead29653600b524 /src/Grapheme.zig | |
| parent | Back to zg code_point. 4ms faster than Ghostty's Utf8Decoder (diff) | |
| download | zg-2c14444bc9054604b71ad5b0e8fb3c3e34877e58.tar.gz zg-2c14444bc9054604b71ad5b0e8fb3c3e34877e58.tar.xz zg-2c14444bc9054604b71ad5b0e8fb3c3e34877e58.zip | |
Grapheme -> grapheme
Diffstat (limited to 'src/Grapheme.zig')
| -rw-r--r-- | src/Grapheme.zig | 332 |
1 files changed, 0 insertions, 332 deletions
diff --git a/src/Grapheme.zig b/src/Grapheme.zig deleted file mode 100644 index f013aba..0000000 --- a/src/Grapheme.zig +++ /dev/null | |||
| @@ -1,332 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const unicode = std.unicode; | ||
| 3 | |||
| 4 | const CodePoint = @import("code_point").CodePoint; | ||
| 5 | const CodePointIterator = @import("code_point").Iterator; | ||
| 6 | const gbp = @import("gbp"); | ||
| 7 | |||
| 8 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | ||
| 9 | pub const Grapheme = struct { | ||
| 10 | len: u8, | ||
| 11 | offset: u32, | ||
| 12 | |||
| 13 | /// `bytes` returns the slice of bytes that correspond to | ||
| 14 | /// this grapheme cluster in `src`. | ||
| 15 | pub fn bytes(self: Grapheme, src: []const u8) []const u8 { | ||
| 16 | return src[self.offset..][0..self.len]; | ||
| 17 | } | ||
| 18 | }; | ||
| 19 | |||
| 20 | /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. | ||
| 21 | pub const Iterator = struct { | ||
| 22 | buf: [2]?CodePoint = .{ null, null }, | ||
| 23 | cp_iter: CodePointIterator, | ||
| 24 | |||
| 25 | const Self = @This(); | ||
| 26 | |||
| 27 | /// Assumes `src` is valid UTF-8. | ||
| 28 | pub fn init(str: []const u8) Self { | ||
| 29 | var self = Self{ .cp_iter = CodePointIterator{ .bytes = str } }; | ||
| 30 | self.advance(); | ||
| 31 | return self; | ||
| 32 | } | ||
| 33 | |||
| 34 | fn advance(self: *Self) void { | ||
| 35 | self.buf[0] = self.buf[1]; | ||
| 36 | self.buf[1] = self.cp_iter.next(); | ||
| 37 | } | ||
| 38 | |||
| 39 | pub fn next(self: *Self) ?Grapheme { | ||
| 40 | self.advance(); | ||
| 41 | |||
| 42 | // If no more | ||
| 43 | if (self.buf[0] == null) return null; | ||
| 44 | // If last one | ||
| 45 | if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | ||
| 46 | // If ASCII | ||
| 47 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 48 | return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | ||
| 49 | } | ||
| 50 | |||
| 51 | const gc_start = self.buf[0].?.offset; | ||
| 52 | var gc_len: u8 = self.buf[0].?.len; | ||
| 53 | var state = State{}; | ||
| 54 | |||
| 55 | if (graphemeBreak( | ||
| 56 | self.buf[0].?.code, | ||
| 57 | self.buf[1].?.code, | ||
| 58 | &state, | ||
| 59 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 60 | |||
| 61 | while (true) { | ||
| 62 | self.advance(); | ||
| 63 | if (self.buf[0] == null) break; | ||
| 64 | |||
| 65 | gc_len += self.buf[0].?.len; | ||
| 66 | |||
| 67 | if (graphemeBreak( | ||
| 68 | self.buf[0].?.code, | ||
| 69 | if (self.buf[1]) |ncp| ncp.code else 0, | ||
| 70 | &state, | ||
| 71 | )) break; | ||
| 72 | } | ||
| 73 | |||
| 74 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 75 | } | ||
| 76 | }; | ||
| 77 | |||
| 78 | // Predicates | ||
| 79 | fn isBreaker(cp: u21) bool { | ||
| 80 | // Extract relevant properties. | ||
| 81 | const cp_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | ||
| 82 | const cp_gbp_prop: gbp.Gbp = @enumFromInt(cp_props_byte >> 4); | ||
| 83 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | ||
| 84 | } | ||
| 85 | |||
| 86 | fn isIgnorable(cp: u21) bool { | ||
| 87 | const cp_gbp_prop = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp >> 8] + (cp & 0xff)]]; | ||
| 88 | return cp_gbp_prop == .extend or cp_gbp_prop == .spacing or cp == '\u{200d}'; | ||
| 89 | } | ||
| 90 | |||
| 91 | // Grapheme break state. | ||
| 92 | const State = struct { | ||
| 93 | bits: u3 = 0, | ||
| 94 | |||
| 95 | // Extended Pictographic (emoji) | ||
| 96 | fn hasXpic(self: State) bool { | ||
| 97 | return self.bits & 1 == 1; | ||
| 98 | } | ||
| 99 | fn setXpic(self: *State) void { | ||
| 100 | self.bits |= 1; | ||
| 101 | } | ||
| 102 | fn unsetXpic(self: *State) void { | ||
| 103 | self.bits ^= 1; | ||
| 104 | } | ||
| 105 | |||
| 106 | // Regional Indicatior (flags) | ||
| 107 | fn hasRegional(self: State) bool { | ||
| 108 | return self.bits & 2 == 2; | ||
| 109 | } | ||
| 110 | fn setRegional(self: *State) void { | ||
| 111 | self.bits |= 2; | ||
| 112 | } | ||
| 113 | fn unsetRegional(self: *State) void { | ||
| 114 | self.bits ^= 2; | ||
| 115 | } | ||
| 116 | |||
| 117 | // Indic Conjunct | ||
| 118 | fn hasIndic(self: State) bool { | ||
| 119 | return self.bits & 4 == 4; | ||
| 120 | } | ||
| 121 | fn setIndic(self: *State) void { | ||
| 122 | self.bits |= 4; | ||
| 123 | } | ||
| 124 | fn unsetIndic(self: *State) void { | ||
| 125 | self.bits ^= 4; | ||
| 126 | } | ||
| 127 | }; | ||
| 128 | |||
| 129 | /// `graphemeBreak` returns true only if a grapheme break point is required | ||
| 130 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | ||
| 131 | /// iteratively over a sequence of code points, this function must be called | ||
| 132 | /// IN ORDER on ALL potential breaks in a string. | ||
| 133 | /// Modeled after the API of utf8proc's `utf8proc_grapheme_break_stateful`. | ||
| 134 | /// https://github.com/JuliaStrings/utf8proc/blob/2bbb1ba932f727aad1fab14fafdbc89ff9dc4604/utf8proc.h#L599-L617 | ||
| 135 | pub fn graphemeBreak( | ||
| 136 | cp1: u21, | ||
| 137 | cp2: u21, | ||
| 138 | state: *State, | ||
| 139 | ) bool { | ||
| 140 | // Extract relevant properties. | ||
| 141 | const cp1_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp1 >> 8] + (cp1 & 0xff)]]; | ||
| 142 | const cp1_gbp_prop: gbp.Gbp = @enumFromInt(cp1_props_byte >> 4); | ||
| 143 | const cp1_indic_prop: gbp.Indic = @enumFromInt((cp1_props_byte >> 1) & 0x7); | ||
| 144 | const cp1_is_emoji = cp1_props_byte & 1 == 1; | ||
| 145 | |||
| 146 | const cp2_props_byte = gbp.stage_3[gbp.stage_2[gbp.stage_1[cp2 >> 8] + (cp2 & 0xff)]]; | ||
| 147 | const cp2_gbp_prop: gbp.Gbp = @enumFromInt(cp2_props_byte >> 4); | ||
| 148 | const cp2_indic_prop: gbp.Indic = @enumFromInt((cp2_props_byte >> 1) & 0x7); | ||
| 149 | const cp2_is_emoji = cp2_props_byte & 1 == 1; | ||
| 150 | |||
| 151 | // GB11: Emoji Extend* ZWJ x Emoji | ||
| 152 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | ||
| 153 | // GB9c: Indic Conjunct Break | ||
| 154 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); | ||
| 155 | |||
| 156 | // GB3: CR x LF | ||
| 157 | if (cp1 == '\r' and cp2 == '\n') return false; | ||
| 158 | |||
| 159 | // GB4: Control | ||
| 160 | if (isBreaker(cp1)) return true; | ||
| 161 | |||
| 162 | // GB11: Emoji Extend* ZWJ x Emoji | ||
| 163 | if (state.hasXpic() and | ||
| 164 | cp1_gbp_prop == .ZWJ and | ||
| 165 | cp2_is_emoji) | ||
| 166 | { | ||
| 167 | state.unsetXpic(); | ||
| 168 | return false; | ||
| 169 | } | ||
| 170 | |||
| 171 | // GB9b: x (Extend | ZWJ) | ||
| 172 | if (cp2_gbp_prop == .Extend or cp2_gbp_prop == .ZWJ) return false; | ||
| 173 | |||
| 174 | // GB9a: x Spacing | ||
| 175 | if (cp2_gbp_prop == .SpacingMark) return false; | ||
| 176 | |||
| 177 | // GB9b: Prepend x | ||
| 178 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2)) return false; | ||
| 179 | |||
| 180 | // GB12, GB13: RI x RI | ||
| 181 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | ||
| 182 | if (state.hasRegional()) { | ||
| 183 | state.unsetRegional(); | ||
| 184 | return true; | ||
| 185 | } else { | ||
| 186 | state.setRegional(); | ||
| 187 | return false; | ||
| 188 | } | ||
| 189 | } | ||
| 190 | |||
| 191 | // GB6: Hangul L x (L|V|LV|VT) | ||
| 192 | if (cp1_gbp_prop == .L) { | ||
| 193 | if (cp2_gbp_prop == .L or | ||
| 194 | cp2_gbp_prop == .V or | ||
| 195 | cp2_gbp_prop == .LV or | ||
| 196 | cp2_gbp_prop == .LVT) return false; | ||
| 197 | } | ||
| 198 | |||
| 199 | // GB7: Hangul (LV | V) x (V | T) | ||
| 200 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { | ||
| 201 | if (cp2_gbp_prop == .V or | ||
| 202 | cp2_gbp_prop == .T) return false; | ||
| 203 | } | ||
| 204 | |||
| 205 | // GB8: Hangul (LVT | T) x T | ||
| 206 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { | ||
| 207 | if (cp2_gbp_prop == .T) return false; | ||
| 208 | } | ||
| 209 | |||
| 210 | // GB9c: Indic Conjunct Break | ||
| 211 | if (state.hasIndic() and | ||
| 212 | cp1_indic_prop == .Consonant and | ||
| 213 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | ||
| 214 | { | ||
| 215 | return false; | ||
| 216 | } | ||
| 217 | |||
| 218 | if (state.hasIndic() and | ||
| 219 | cp1_indic_prop == .Extend and | ||
| 220 | cp2_indic_prop == .Linker) | ||
| 221 | { | ||
| 222 | return false; | ||
| 223 | } | ||
| 224 | |||
| 225 | if (state.hasIndic() and | ||
| 226 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | ||
| 227 | cp2_indic_prop == .Consonant) | ||
| 228 | { | ||
| 229 | state.unsetIndic(); | ||
| 230 | return false; | ||
| 231 | } | ||
| 232 | |||
| 233 | return true; | ||
| 234 | } | ||
| 235 | |||
| 236 | test "Segmentation GraphemeIterator" { | ||
| 237 | const allocator = std.testing.allocator; | ||
| 238 | var file = try std.fs.cwd().openFile("GraphemeBreakTest.txt", .{}); | ||
| 239 | defer file.close(); | ||
| 240 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 241 | var input_stream = buf_reader.reader(); | ||
| 242 | |||
| 243 | var buf: [4096]u8 = undefined; | ||
| 244 | var line_no: usize = 1; | ||
| 245 | |||
| 246 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | ||
| 247 | // Skip comments or empty lines. | ||
| 248 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 249 | |||
| 250 | // Clean up. | ||
| 251 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 252 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | ||
| 253 | line = line[0..octo]; | ||
| 254 | } | ||
| 255 | // Iterate over fields. | ||
| 256 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 257 | defer want.deinit(); | ||
| 258 | |||
| 259 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 260 | defer all_bytes.deinit(); | ||
| 261 | |||
| 262 | var graphemes = std.mem.split(u8, line, " ÷ "); | ||
| 263 | var bytes_index: u32 = 0; | ||
| 264 | |||
| 265 | while (graphemes.next()) |field| { | ||
| 266 | var code_points = std.mem.split(u8, field, " "); | ||
| 267 | var cp_buf: [4]u8 = undefined; | ||
| 268 | var cp_index: u32 = 0; | ||
| 269 | var gc_len: u8 = 0; | ||
| 270 | |||
| 271 | while (code_points.next()) |code_point| { | ||
| 272 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 273 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 274 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 275 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 276 | cp_index += len; | ||
| 277 | gc_len += len; | ||
| 278 | } | ||
| 279 | |||
| 280 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 281 | bytes_index += cp_index; | ||
| 282 | } | ||
| 283 | |||
| 284 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | ||
| 285 | var iter = Iterator.init(all_bytes.items); | ||
| 286 | |||
| 287 | // Chaeck. | ||
| 288 | for (want.items) |want_gc| { | ||
| 289 | const got_gc = (iter.next()).?; | ||
| 290 | try std.testing.expectEqualStrings( | ||
| 291 | want_gc.bytes(all_bytes.items), | ||
| 292 | got_gc.bytes(all_bytes.items), | ||
| 293 | ); | ||
| 294 | } | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | test "Segmentation comptime GraphemeIterator" { | ||
| 299 | const want = [_][]const u8{ "H", "é", "l", "l", "o" }; | ||
| 300 | |||
| 301 | comptime { | ||
| 302 | const src = "Héllo"; | ||
| 303 | var ct_iter = Iterator.init(src); | ||
| 304 | var i = 0; | ||
| 305 | while (ct_iter.next()) |grapheme| : (i += 1) { | ||
| 306 | try std.testing.expectEqualStrings(grapheme.bytes(src), want[i]); | ||
| 307 | } | ||
| 308 | } | ||
| 309 | } | ||
| 310 | |||
| 311 | test "Segmentation ZWJ and ZWSP emoji sequences" { | ||
| 312 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 313 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 314 | const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; | ||
| 315 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | ||
| 316 | const no_joiner = seq_1 ++ seq_2; | ||
| 317 | |||
| 318 | var ct_iter = Iterator.init(with_zwj); | ||
| 319 | var i: usize = 0; | ||
| 320 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 321 | try std.testing.expectEqual(@as(usize, 1), i); | ||
| 322 | |||
| 323 | ct_iter = Iterator.init(with_zwsp); | ||
| 324 | i = 0; | ||
| 325 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 326 | try std.testing.expectEqual(@as(usize, 3), i); | ||
| 327 | |||
| 328 | ct_iter = Iterator.init(no_joiner); | ||
| 329 | i = 0; | ||
| 330 | while (ct_iter.next()) |_| : (i += 1) {} | ||
| 331 | try std.testing.expectEqual(@as(usize, 2), i); | ||
| 332 | } | ||