diff options
| author | 2025-04-30 12:02:17 -0400 | |
|---|---|---|
| committer | 2025-04-30 12:02:17 -0400 | |
| commit | 7a212f5ec5aabf016d17d3ed28649e7982b810ef (patch) | |
| tree | c6b06b0a0afb0ed2ba18f147d9ee200e5eee09a1 /src/grapheme.zig | |
| parent | Factor out 'Data' for grapheme and DisplayWidth (diff) | |
| download | zg-7a212f5ec5aabf016d17d3ed28649e7982b810ef.tar.gz zg-7a212f5ec5aabf016d17d3ed28649e7982b810ef.tar.xz zg-7a212f5ec5aabf016d17d3ed28649e7982b810ef.zip | |
grapheme now Graphemes, Data files gone
Diffstat (limited to 'src/grapheme.zig')
| -rw-r--r-- | src/grapheme.zig | 421 |
1 files changed, 0 insertions, 421 deletions
diff --git a/src/grapheme.zig b/src/grapheme.zig deleted file mode 100644 index 79cd2c6..0000000 --- a/src/grapheme.zig +++ /dev/null | |||
| @@ -1,421 +0,0 @@ | |||
| 1 | const std = @import("std"); | ||
| 2 | const builtin = @import("builtin"); | ||
| 3 | const mem = std.mem; | ||
| 4 | const Allocator = mem.Allocator; | ||
| 5 | const compress = std.compress; | ||
| 6 | const unicode = std.unicode; | ||
| 7 | |||
| 8 | const CodePoint = @import("code_point").CodePoint; | ||
| 9 | const CodePointIterator = @import("code_point").Iterator; | ||
| 10 | |||
| 11 | s1: []u16 = undefined, | ||
| 12 | s2: []u16 = undefined, | ||
| 13 | s3: []u8 = undefined, | ||
| 14 | |||
| 15 | const Graphemes = @This(); | ||
| 16 | |||
| 17 | pub inline fn init(allocator: mem.Allocator) mem.Allocator.Error!Graphemes { | ||
| 18 | const decompressor = compress.flate.inflate.decompressor; | ||
| 19 | const in_bytes = @embedFile("gbp"); | ||
| 20 | var in_fbs = std.io.fixedBufferStream(in_bytes); | ||
| 21 | var in_decomp = decompressor(.raw, in_fbs.reader()); | ||
| 22 | var reader = in_decomp.reader(); | ||
| 23 | |||
| 24 | const endian = builtin.cpu.arch.endian(); | ||
| 25 | |||
| 26 | var self = Graphemes{}; | ||
| 27 | |||
| 28 | const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 29 | self.s1 = try allocator.alloc(u16, s1_len); | ||
| 30 | errdefer allocator.free(self.s1); | ||
| 31 | for (0..s1_len) |i| self.s1[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 32 | |||
| 33 | const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 34 | self.s2 = try allocator.alloc(u16, s2_len); | ||
| 35 | errdefer allocator.free(self.s2); | ||
| 36 | for (0..s2_len) |i| self.s2[i] = reader.readInt(u16, endian) catch unreachable; | ||
| 37 | |||
| 38 | const s3_len: u16 = reader.readInt(u16, endian) catch unreachable; | ||
| 39 | self.s3 = try allocator.alloc(u8, s3_len); | ||
| 40 | errdefer allocator.free(self.s3); | ||
| 41 | _ = reader.readAll(self.s3) catch unreachable; | ||
| 42 | |||
| 43 | return self; | ||
| 44 | } | ||
| 45 | |||
| 46 | pub fn deinit(graphemes: *const Graphemes, allocator: mem.Allocator) void { | ||
| 47 | allocator.free(graphemes.s1); | ||
| 48 | allocator.free(graphemes.s2); | ||
| 49 | allocator.free(graphemes.s3); | ||
| 50 | } | ||
| 51 | |||
| 52 | /// Lookup the grapheme break property for a code point. | ||
| 53 | pub fn gbp(graphemes: Graphemes, cp: u21) Gbp { | ||
| 54 | return @enumFromInt(graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 4); | ||
| 55 | } | ||
| 56 | |||
| 57 | /// Lookup the indic syllable type for a code point. | ||
| 58 | pub fn indic(graphemes: Graphemes, cp: u21) Indic { | ||
| 59 | return @enumFromInt((graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7); | ||
| 60 | } | ||
| 61 | |||
| 62 | /// Lookup the emoji property for a code point. | ||
| 63 | pub fn isEmoji(graphemes: Graphemes, cp: u21) bool { | ||
| 64 | return graphemes.s3[graphemes.s2[graphemes.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1; | ||
| 65 | } | ||
| 66 | |||
| 67 | pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | ||
| 68 | return Iterator.init(string, graphemes); | ||
| 69 | } | ||
| 70 | |||
| 71 | /// Indic syllable type. | ||
| 72 | pub const Indic = enum { | ||
| 73 | none, | ||
| 74 | |||
| 75 | Consonant, | ||
| 76 | Extend, | ||
| 77 | Linker, | ||
| 78 | }; | ||
| 79 | |||
| 80 | /// Grapheme break property. | ||
| 81 | pub const Gbp = enum { | ||
| 82 | none, | ||
| 83 | Control, | ||
| 84 | CR, | ||
| 85 | Extend, | ||
| 86 | L, | ||
| 87 | LF, | ||
| 88 | LV, | ||
| 89 | LVT, | ||
| 90 | Prepend, | ||
| 91 | Regional_Indicator, | ||
| 92 | SpacingMark, | ||
| 93 | T, | ||
| 94 | V, | ||
| 95 | ZWJ, | ||
| 96 | }; | ||
| 97 | |||
| 98 | /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. | ||
| 99 | pub const Grapheme = struct { | ||
| 100 | len: u8, | ||
| 101 | offset: u32, | ||
| 102 | |||
| 103 | /// `bytes` returns the slice of bytes that correspond to | ||
| 104 | /// this grapheme cluster in `src`. | ||
| 105 | pub fn bytes(self: Grapheme, src: []const u8) []const u8 { | ||
| 106 | return src[self.offset..][0..self.len]; | ||
| 107 | } | ||
| 108 | }; | ||
| 109 | |||
| 110 | /// `Iterator` iterates a sting of UTF-8 encoded bytes one grapheme cluster at-a-time. | ||
| 111 | pub const Iterator = struct { | ||
| 112 | buf: [2]?CodePoint = .{ null, null }, | ||
| 113 | cp_iter: CodePointIterator, | ||
| 114 | data: *const Graphemes, | ||
| 115 | |||
| 116 | const Self = @This(); | ||
| 117 | |||
| 118 | /// Assumes `src` is valid UTF-8. | ||
| 119 | pub fn init(str: []const u8, data: *const Graphemes) Self { | ||
| 120 | var self = Self{ .cp_iter = .{ .bytes = str }, .data = data }; | ||
| 121 | self.advance(); | ||
| 122 | return self; | ||
| 123 | } | ||
| 124 | |||
| 125 | fn advance(self: *Self) void { | ||
| 126 | self.buf[0] = self.buf[1]; | ||
| 127 | self.buf[1] = self.cp_iter.next(); | ||
| 128 | } | ||
| 129 | |||
| 130 | pub fn next(self: *Self) ?Grapheme { | ||
| 131 | self.advance(); | ||
| 132 | |||
| 133 | // If no more | ||
| 134 | if (self.buf[0] == null) return null; | ||
| 135 | // If last one | ||
| 136 | if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | ||
| 137 | // If ASCII | ||
| 138 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 139 | return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset }; | ||
| 140 | } | ||
| 141 | |||
| 142 | const gc_start = self.buf[0].?.offset; | ||
| 143 | var gc_len: u8 = self.buf[0].?.len; | ||
| 144 | var state = State{}; | ||
| 145 | |||
| 146 | if (graphemeBreak( | ||
| 147 | self.buf[0].?.code, | ||
| 148 | self.buf[1].?.code, | ||
| 149 | self.data, | ||
| 150 | &state, | ||
| 151 | )) return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 152 | |||
| 153 | while (true) { | ||
| 154 | self.advance(); | ||
| 155 | if (self.buf[0] == null) break; | ||
| 156 | |||
| 157 | gc_len += self.buf[0].?.len; | ||
| 158 | |||
| 159 | if (graphemeBreak( | ||
| 160 | self.buf[0].?.code, | ||
| 161 | if (self.buf[1]) |ncp| ncp.code else 0, | ||
| 162 | self.data, | ||
| 163 | &state, | ||
| 164 | )) break; | ||
| 165 | } | ||
| 166 | |||
| 167 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 168 | } | ||
| 169 | |||
| 170 | pub fn peek(self: *Self) ?Grapheme { | ||
| 171 | const saved_cp_iter = self.cp_iter; | ||
| 172 | const s0 = self.buf[0]; | ||
| 173 | const s1 = self.buf[1]; | ||
| 174 | |||
| 175 | self.advance(); | ||
| 176 | |||
| 177 | // If no more | ||
| 178 | if (self.buf[0] == null) { | ||
| 179 | self.cp_iter = saved_cp_iter; | ||
| 180 | self.buf[0] = s0; | ||
| 181 | self.buf[1] = s1; | ||
| 182 | return null; | ||
| 183 | } | ||
| 184 | // If last one | ||
| 185 | if (self.buf[1] == null) { | ||
| 186 | const len = self.buf[0].?.len; | ||
| 187 | const offset = self.buf[0].?.offset; | ||
| 188 | self.cp_iter = saved_cp_iter; | ||
| 189 | self.buf[0] = s0; | ||
| 190 | self.buf[1] = s1; | ||
| 191 | return Grapheme{ .len = len, .offset = offset }; | ||
| 192 | } | ||
| 193 | // If ASCII | ||
| 194 | if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) { | ||
| 195 | const len = self.buf[0].?.len; | ||
| 196 | const offset = self.buf[0].?.offset; | ||
| 197 | self.cp_iter = saved_cp_iter; | ||
| 198 | self.buf[0] = s0; | ||
| 199 | self.buf[1] = s1; | ||
| 200 | return Grapheme{ .len = len, .offset = offset }; | ||
| 201 | } | ||
| 202 | |||
| 203 | const gc_start = self.buf[0].?.offset; | ||
| 204 | var gc_len: u8 = self.buf[0].?.len; | ||
| 205 | var state = State{}; | ||
| 206 | |||
| 207 | if (graphemeBreak( | ||
| 208 | self.buf[0].?.code, | ||
| 209 | self.buf[1].?.code, | ||
| 210 | self.data, | ||
| 211 | &state, | ||
| 212 | )) { | ||
| 213 | self.cp_iter = saved_cp_iter; | ||
| 214 | self.buf[0] = s0; | ||
| 215 | self.buf[1] = s1; | ||
| 216 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 217 | } | ||
| 218 | |||
| 219 | while (true) { | ||
| 220 | self.advance(); | ||
| 221 | if (self.buf[0] == null) break; | ||
| 222 | |||
| 223 | gc_len += self.buf[0].?.len; | ||
| 224 | |||
| 225 | if (graphemeBreak( | ||
| 226 | self.buf[0].?.code, | ||
| 227 | if (self.buf[1]) |ncp| ncp.code else 0, | ||
| 228 | self.data, | ||
| 229 | &state, | ||
| 230 | )) break; | ||
| 231 | } | ||
| 232 | self.cp_iter = saved_cp_iter; | ||
| 233 | self.buf[0] = s0; | ||
| 234 | self.buf[1] = s1; | ||
| 235 | |||
| 236 | return Grapheme{ .len = gc_len, .offset = gc_start }; | ||
| 237 | } | ||
| 238 | }; | ||
| 239 | |||
| 240 | // Predicates | ||
| 241 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | ||
| 242 | // Extract relevant properties. | ||
| 243 | const cp_gbp_prop = data.gbp(cp); | ||
| 244 | return cp == '\x0d' or cp == '\x0a' or cp_gbp_prop == .Control; | ||
| 245 | } | ||
| 246 | |||
| 247 | // Grapheme break state. | ||
| 248 | pub const State = struct { | ||
| 249 | bits: u3 = 0, | ||
| 250 | |||
| 251 | // Extended Pictographic (emoji) | ||
| 252 | fn hasXpic(self: State) bool { | ||
| 253 | return self.bits & 1 == 1; | ||
| 254 | } | ||
| 255 | fn setXpic(self: *State) void { | ||
| 256 | self.bits |= 1; | ||
| 257 | } | ||
| 258 | fn unsetXpic(self: *State) void { | ||
| 259 | self.bits ^= 1; | ||
| 260 | } | ||
| 261 | |||
| 262 | // Regional Indicatior (flags) | ||
| 263 | fn hasRegional(self: State) bool { | ||
| 264 | return self.bits & 2 == 2; | ||
| 265 | } | ||
| 266 | fn setRegional(self: *State) void { | ||
| 267 | self.bits |= 2; | ||
| 268 | } | ||
| 269 | fn unsetRegional(self: *State) void { | ||
| 270 | self.bits ^= 2; | ||
| 271 | } | ||
| 272 | |||
| 273 | // Indic Conjunct | ||
| 274 | fn hasIndic(self: State) bool { | ||
| 275 | return self.bits & 4 == 4; | ||
| 276 | } | ||
| 277 | fn setIndic(self: *State) void { | ||
| 278 | self.bits |= 4; | ||
| 279 | } | ||
| 280 | fn unsetIndic(self: *State) void { | ||
| 281 | self.bits ^= 4; | ||
| 282 | } | ||
| 283 | }; | ||
| 284 | |||
| 285 | /// `graphemeBreak` returns true only if a grapheme break point is required | ||
| 286 | /// between `cp1` and `cp2`. `state` should start out as 0. If calling | ||
| 287 | /// iteratively over a sequence of code points, this function must be called | ||
| 288 | /// IN ORDER on ALL potential breaks in a string. | ||
| 289 | /// Modeled after the API of utf8proc's `utf8proc_grapheme_break_stateful`. | ||
| 290 | /// https://github.com/JuliaStrings/utf8proc/blob/2bbb1ba932f727aad1fab14fafdbc89ff9dc4604/utf8proc.h#L599-L617 | ||
| 291 | pub fn graphemeBreak( | ||
| 292 | cp1: u21, | ||
| 293 | cp2: u21, | ||
| 294 | data: *const Graphemes, | ||
| 295 | state: *State, | ||
| 296 | ) bool { | ||
| 297 | // Extract relevant properties. | ||
| 298 | const cp1_gbp_prop = data.gbp(cp1); | ||
| 299 | const cp1_indic_prop = data.indic(cp1); | ||
| 300 | const cp1_is_emoji = data.isEmoji(cp1); | ||
| 301 | |||
| 302 | const cp2_gbp_prop = data.gbp(cp2); | ||
| 303 | const cp2_indic_prop = data.indic(cp2); | ||
| 304 | const cp2_is_emoji = data.isEmoji(cp2); | ||
| 305 | |||
| 306 | // GB11: Emoji Extend* ZWJ x Emoji | ||
| 307 | if (!state.hasXpic() and cp1_is_emoji) state.setXpic(); | ||
| 308 | // GB9c: Indic Conjunct Break | ||
| 309 | if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic(); | ||
| 310 | |||
| 311 | // GB3: CR x LF | ||
| 312 | if (cp1 == '\r' and cp2 == '\n') return false; | ||
| 313 | |||
| 314 | // GB4: Control | ||
| 315 | if (isBreaker(cp1, data)) return true; | ||
| 316 | |||
| 317 | // GB11: Emoji Extend* ZWJ x Emoji | ||
| 318 | if (state.hasXpic() and | ||
| 319 | cp1_gbp_prop == .ZWJ and | ||
| 320 | cp2_is_emoji) | ||
| 321 | { | ||
| 322 | state.unsetXpic(); | ||
| 323 | return false; | ||
| 324 | } | ||
| 325 | |||
| 326 | // GB9b: x (Extend | ZWJ) | ||
| 327 | if (cp2_gbp_prop == .Extend or cp2_gbp_prop == .ZWJ) return false; | ||
| 328 | |||
| 329 | // GB9a: x Spacing | ||
| 330 | if (cp2_gbp_prop == .SpacingMark) return false; | ||
| 331 | |||
| 332 | // GB9b: Prepend x | ||
| 333 | if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false; | ||
| 334 | |||
| 335 | // GB12, GB13: RI x RI | ||
| 336 | if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) { | ||
| 337 | if (state.hasRegional()) { | ||
| 338 | state.unsetRegional(); | ||
| 339 | return true; | ||
| 340 | } else { | ||
| 341 | state.setRegional(); | ||
| 342 | return false; | ||
| 343 | } | ||
| 344 | } | ||
| 345 | |||
| 346 | // GB6: Hangul L x (L|V|LV|VT) | ||
| 347 | if (cp1_gbp_prop == .L) { | ||
| 348 | if (cp2_gbp_prop == .L or | ||
| 349 | cp2_gbp_prop == .V or | ||
| 350 | cp2_gbp_prop == .LV or | ||
| 351 | cp2_gbp_prop == .LVT) return false; | ||
| 352 | } | ||
| 353 | |||
| 354 | // GB7: Hangul (LV | V) x (V | T) | ||
| 355 | if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) { | ||
| 356 | if (cp2_gbp_prop == .V or | ||
| 357 | cp2_gbp_prop == .T) return false; | ||
| 358 | } | ||
| 359 | |||
| 360 | // GB8: Hangul (LVT | T) x T | ||
| 361 | if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) { | ||
| 362 | if (cp2_gbp_prop == .T) return false; | ||
| 363 | } | ||
| 364 | |||
| 365 | // GB9c: Indic Conjunct Break | ||
| 366 | if (state.hasIndic() and | ||
| 367 | cp1_indic_prop == .Consonant and | ||
| 368 | (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker)) | ||
| 369 | { | ||
| 370 | return false; | ||
| 371 | } | ||
| 372 | |||
| 373 | if (state.hasIndic() and | ||
| 374 | cp1_indic_prop == .Extend and | ||
| 375 | cp2_indic_prop == .Linker) | ||
| 376 | { | ||
| 377 | return false; | ||
| 378 | } | ||
| 379 | |||
| 380 | if (state.hasIndic() and | ||
| 381 | (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and | ||
| 382 | cp2_indic_prop == .Consonant) | ||
| 383 | { | ||
| 384 | state.unsetIndic(); | ||
| 385 | return false; | ||
| 386 | } | ||
| 387 | |||
| 388 | return true; | ||
| 389 | } | ||
| 390 | |||
| 391 | test "Segmentation ZWJ and ZWSP emoji sequences" { | ||
| 392 | const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 393 | const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}"; | ||
| 394 | const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2; | ||
| 395 | const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2; | ||
| 396 | const no_joiner = seq_1 ++ seq_2; | ||
| 397 | |||
| 398 | const graphemes = try Graphemes.init(std.testing.allocator); | ||
| 399 | defer graphemes.deinit(std.testing.allocator); | ||
| 400 | |||
| 401 | { | ||
| 402 | var iter = graphemes.iterator(with_zwj); | ||
| 403 | var i: usize = 0; | ||
| 404 | while (iter.next()) |_| : (i += 1) {} | ||
| 405 | try std.testing.expectEqual(@as(usize, 1), i); | ||
| 406 | } | ||
| 407 | |||
| 408 | { | ||
| 409 | var iter = graphemes.iterator(with_zwsp); | ||
| 410 | var i: usize = 0; | ||
| 411 | while (iter.next()) |_| : (i += 1) {} | ||
| 412 | try std.testing.expectEqual(@as(usize, 3), i); | ||
| 413 | } | ||
| 414 | |||
| 415 | { | ||
| 416 | var iter = graphemes.iterator(no_joiner); | ||
| 417 | var i: usize = 0; | ||
| 418 | while (iter.next()) |_| : (i += 1) {} | ||
| 419 | try std.testing.expectEqual(@as(usize, 2), i); | ||
| 420 | } | ||
| 421 | } | ||