diff options
| author | 2025-05-15 16:06:35 -0400 | |
|---|---|---|
| committer | 2025-05-15 16:06:35 -0400 | |
| commit | 6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3 (patch) | |
| tree | 61f83982aaab67f862524d46b073fcfee01d56cf | |
| parent | wordAtIndex passes conformance (diff) | |
| parent | Merge branch 'work-branch' into HEAD (diff) | |
| download | zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.gz zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.tar.xz zg-6ae1e4aabf3b8d3a0c3c1154c297979d0277e6b3.zip | |
Merge commit 'b5d955f' into develop-next
| -rw-r--r-- | CONTRIBUTORS.md | 2 | ||||
| -rw-r--r-- | src/Graphemes.zig | 226 | ||||
| -rw-r--r-- | src/unicode_tests.zig | 74 |
3 files changed, 299 insertions, 3 deletions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c42cb8b..530c921 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md | |||
| @@ -18,3 +18,5 @@ Jonathan Raphaelson <jon@accidental.cc> | |||
| 18 | Sam Atman <atmanistan@gmail.com> | 18 | Sam Atman <atmanistan@gmail.com> |
| 19 | 19 | ||
| 20 | lch361 <author@lch361.net> | 20 | lch361 <author@lch361.net> |
| 21 | |||
| 22 | Matteo Romano <mr.matteo.romano.02@gmail.com> | ||
diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 1f67fc6..0338c04 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig | |||
| @@ -7,6 +7,7 @@ const unicode = std.unicode; | |||
| 7 | 7 | ||
| 8 | const CodePoint = @import("code_point").CodePoint; | 8 | const CodePoint = @import("code_point").CodePoint; |
| 9 | const CodePointIterator = @import("code_point").Iterator; | 9 | const CodePointIterator = @import("code_point").Iterator; |
| 10 | const CodePointReverseIterator = @import("code_point").ReverseIterator; | ||
| 10 | 11 | ||
| 11 | s1: []u16 = undefined, | 12 | s1: []u16 = undefined, |
| 12 | s2: []u16 = undefined, | 13 | s2: []u16 = undefined, |
| @@ -70,6 +71,10 @@ pub fn iterator(graphemes: *const Graphemes, string: []const u8) Iterator { | |||
| 70 | return Iterator.init(string, graphemes); | 71 | return Iterator.init(string, graphemes); |
| 71 | } | 72 | } |
| 72 | 73 | ||
| 74 | pub fn reverseIterator(graphemes: *const Graphemes, string: []const u8) ReverseIterator { | ||
| 75 | return ReverseIterator.init(string, graphemes); | ||
| 76 | } | ||
| 77 | |||
| 73 | /// Indic syllable type. | 78 | /// Indic syllable type. |
| 74 | pub const Indic = enum { | 79 | pub const Indic = enum { |
| 75 | none, | 80 | none, |
| @@ -182,6 +187,221 @@ pub const Iterator = struct { | |||
| 182 | } | 187 | } |
| 183 | }; | 188 | }; |
| 184 | 189 | ||
| 190 | pub const ReverseIterator = struct { | ||
| 191 | buf: [2]?CodePoint = .{ null, null }, | ||
| 192 | cp_iter: CodePointReverseIterator, | ||
| 193 | data: *const Graphemes, | ||
| 194 | /// Codepoint read from `cp_iter` but not returned by `previous` | ||
| 195 | pending: Pending = .{ .none = {} }, | ||
| 196 | |||
| 197 | const Pending = union(enum) { | ||
| 198 | none: void, | ||
| 199 | /// Count of pending RI codepoints, it is an even number | ||
| 200 | ri_count: usize, | ||
| 201 | /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji | ||
| 202 | extend_end: u32, | ||
| 203 | }; | ||
| 204 | |||
| 205 | const Self = @This(); | ||
| 206 | |||
| 207 | pub fn init(str: []const u8, data: *const Graphemes) Self { | ||
| 208 | var self: Self = .{ .cp_iter = .init(str), .data = data }; | ||
| 209 | self.advance(); | ||
| 210 | self.advance(); | ||
| 211 | return self; | ||
| 212 | } | ||
| 213 | |||
| 214 | fn advance(self: *Self) void { | ||
| 215 | self.buf[1] = self.buf[0]; | ||
| 216 | self.buf[0] = self.cp_iter.prev(); | ||
| 217 | } | ||
| 218 | |||
| 219 | pub fn prev(self: *Self) ?Grapheme { | ||
| 220 | if (self.buf[1] == null) return null; | ||
| 221 | |||
| 222 | const grapheme_end: u32 = end: { | ||
| 223 | const codepoint = self.buf[1].?; | ||
| 224 | |||
| 225 | switch (self.pending) { | ||
| 226 | // BUF: [?Any, Any] | ||
| 227 | .none => break :end codepoint.offset + codepoint.len, | ||
| 228 | .ri_count => |ri_count| { | ||
| 229 | std.debug.assert(ri_count > 0); | ||
| 230 | std.debug.assert(ri_count % 2 == 0); | ||
| 231 | |||
| 232 | if (ri_count > 2) { | ||
| 233 | self.pending.ri_count -= 2; | ||
| 234 | |||
| 235 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 236 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 237 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 238 | return Grapheme{ | ||
| 239 | .len = 8, | ||
| 240 | .offset = @intCast(codepoint.offset + self.pending.ri_count * 4), | ||
| 241 | }; | ||
| 242 | } else { | ||
| 243 | self.pending = .{ .none = {} }; | ||
| 244 | break :end codepoint.offset + codepoint.len + 4; | ||
| 245 | } | ||
| 246 | }, | ||
| 247 | // BUF: [?Any, Extend] Extend* ZWJ | ||
| 248 | .extend_end => |extend_end| { | ||
| 249 | self.pending = .{ .none = {} }; | ||
| 250 | break :end extend_end; | ||
| 251 | }, | ||
| 252 | } | ||
| 253 | }; | ||
| 254 | |||
| 255 | while (self.buf[0] != null) { | ||
| 256 | var state: State = .{}; | ||
| 257 | state.setXpic(); | ||
| 258 | state.unsetRegional(); | ||
| 259 | state.setIndic(); | ||
| 260 | |||
| 261 | if (graphemeBreak( | ||
| 262 | self.buf[0].?.code, | ||
| 263 | self.buf[1].?.code, | ||
| 264 | self.data, | ||
| 265 | &state, | ||
| 266 | )) break; | ||
| 267 | |||
| 268 | self.advance(); | ||
| 269 | |||
| 270 | if (!state.hasIndic()) { | ||
| 271 | |||
| 272 | // BUF: [?Any, Extend | Linker] Consonant | ||
| 273 | var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 274 | |||
| 275 | indic: while (true) { | ||
| 276 | if (self.buf[0] == null) { | ||
| 277 | self.pending = .{ .extend_end = indic_offset }; | ||
| 278 | return .{ | ||
| 279 | .len = @intCast(grapheme_end - indic_offset), | ||
| 280 | .offset = indic_offset, | ||
| 281 | }; | ||
| 282 | } | ||
| 283 | |||
| 284 | const codepoint = self.buf[0].?; | ||
| 285 | |||
| 286 | switch (self.data.indic(codepoint.code)) { | ||
| 287 | .Extend, .Linker => { | ||
| 288 | self.advance(); | ||
| 289 | continue :indic; | ||
| 290 | }, | ||
| 291 | .Consonant => { | ||
| 292 | // BUF: [Consonant, Extend | Linker] (Extend | Linker)* Consonant | ||
| 293 | indic_offset = codepoint.offset; | ||
| 294 | self.advance(); | ||
| 295 | |||
| 296 | if (self.buf[0]) |cp1| { | ||
| 297 | state.setIndic(); | ||
| 298 | |||
| 299 | if (graphemeBreak(cp1.code, self.buf[1].?.code, self.data, &state)) break; | ||
| 300 | |||
| 301 | if (!state.hasIndic()) { | ||
| 302 | continue :indic; | ||
| 303 | } else { | ||
| 304 | break :indic; | ||
| 305 | } | ||
| 306 | } else { | ||
| 307 | break :indic; | ||
| 308 | } | ||
| 309 | }, | ||
| 310 | .none => { | ||
| 311 | // BUF: [Any, Extend | Linker] (Extend | Linker)* Consonant | ||
| 312 | self.pending = .{ .extend_end = indic_offset }; | ||
| 313 | return .{ | ||
| 314 | .len = @intCast(grapheme_end - indic_offset), | ||
| 315 | .offset = indic_offset, | ||
| 316 | }; | ||
| 317 | }, | ||
| 318 | } | ||
| 319 | } | ||
| 320 | } | ||
| 321 | |||
| 322 | if (!state.hasXpic()) { | ||
| 323 | // BUF: [?Any, ZWJ] Emoji | ||
| 324 | var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; | ||
| 325 | |||
| 326 | // Look for previous Emoji | ||
| 327 | emoji: while (true) { | ||
| 328 | if (self.buf[0] == null) { | ||
| 329 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 330 | return .{ | ||
| 331 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 332 | .offset = emoji_offset, | ||
| 333 | }; | ||
| 334 | } | ||
| 335 | |||
| 336 | const codepoint = self.buf[0].?; | ||
| 337 | |||
| 338 | if (self.data.gbp(codepoint.code) == .Extend) { | ||
| 339 | self.advance(); | ||
| 340 | continue :emoji; | ||
| 341 | } | ||
| 342 | |||
| 343 | if (self.data.isEmoji(codepoint.code)) { | ||
| 344 | // BUF: [Emoji, Extend] (Extend* ZWJ Emoji)* | ||
| 345 | emoji_offset = codepoint.offset; | ||
| 346 | self.advance(); | ||
| 347 | |||
| 348 | if (self.buf[0] != null and | ||
| 349 | // ZWJ = 0x200d | ||
| 350 | self.buf[0].?.code == 0x200d) | ||
| 351 | { | ||
| 352 | // BUF: [ZWJ, Emoji] (Extend* ZWJ Emoji)* | ||
| 353 | // Back at the beginning of the loop, "recursively" look for emoji | ||
| 354 | self.advance(); | ||
| 355 | continue :emoji; | ||
| 356 | } else { | ||
| 357 | // BUF: [?Any, Emoji] (Extend* ZWJ Emoji)* | ||
| 358 | break :emoji; | ||
| 359 | } | ||
| 360 | } else { | ||
| 361 | // BUF: [Any, Extend] (Extend* ZWJ Emoji)* | ||
| 362 | self.pending = .{ .extend_end = emoji_offset }; | ||
| 363 | return .{ | ||
| 364 | .len = @intCast(grapheme_end - emoji_offset), | ||
| 365 | .offset = emoji_offset, | ||
| 366 | }; | ||
| 367 | } | ||
| 368 | } | ||
| 369 | } | ||
| 370 | |||
| 371 | if (state.hasRegional()) { | ||
| 372 | var ri_count: usize = 0; | ||
| 373 | while (self.buf[0] != null and | ||
| 374 | self.data.gbp(self.buf[0].?.code) == .Regional_Indicator) | ||
| 375 | { | ||
| 376 | ri_count += 1; | ||
| 377 | self.advance(); | ||
| 378 | } | ||
| 379 | |||
| 380 | // Use the fact that all RI have length 4 in utf8 encoding | ||
| 381 | // since they are in range 0x1f1e6...0x1f1ff | ||
| 382 | // https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
| 383 | if (ri_count == 0) { | ||
| 384 | // There are no pending RI codepoints | ||
| 385 | } else if (ri_count % 2 == 0) { | ||
| 386 | self.pending = .{ .ri_count = ri_count }; | ||
| 387 | return .{ .len = 8, .offset = grapheme_end - 8 }; | ||
| 388 | } else { | ||
| 389 | // Add one to count for the unused RI | ||
| 390 | self.pending = .{ .ri_count = ri_count + 1 }; | ||
| 391 | return .{ .len = 4, .offset = grapheme_end - 4 }; | ||
| 392 | } | ||
| 393 | } | ||
| 394 | } | ||
| 395 | |||
| 396 | const grapheme_start = if (self.buf[1]) |codepoint| codepoint.offset else 0; | ||
| 397 | self.advance(); | ||
| 398 | return .{ | ||
| 399 | .len = @intCast(grapheme_end - grapheme_start), | ||
| 400 | .offset = grapheme_start, | ||
| 401 | }; | ||
| 402 | } | ||
| 403 | }; | ||
| 404 | |||
| 185 | // Predicates | 405 | // Predicates |
| 186 | fn isBreaker(cp: u21, data: *const Graphemes) bool { | 406 | fn isBreaker(cp: u21, data: *const Graphemes) bool { |
| 187 | // Extract relevant properties. | 407 | // Extract relevant properties. |
| @@ -201,7 +421,7 @@ pub const State = struct { | |||
| 201 | self.bits |= 1; | 421 | self.bits |= 1; |
| 202 | } | 422 | } |
| 203 | fn unsetXpic(self: *State) void { | 423 | fn unsetXpic(self: *State) void { |
| 204 | self.bits ^= 1; | 424 | self.bits &= ~@as(u3, 1); |
| 205 | } | 425 | } |
| 206 | 426 | ||
| 207 | // Regional Indicatior (flags) | 427 | // Regional Indicatior (flags) |
| @@ -212,7 +432,7 @@ pub const State = struct { | |||
| 212 | self.bits |= 2; | 432 | self.bits |= 2; |
| 213 | } | 433 | } |
| 214 | fn unsetRegional(self: *State) void { | 434 | fn unsetRegional(self: *State) void { |
| 215 | self.bits ^= 2; | 435 | self.bits &= ~@as(u3, 2); |
| 216 | } | 436 | } |
| 217 | 437 | ||
| 218 | // Indic Conjunct | 438 | // Indic Conjunct |
| @@ -223,7 +443,7 @@ pub const State = struct { | |||
| 223 | self.bits |= 4; | 443 | self.bits |= 4; |
| 224 | } | 444 | } |
| 225 | fn unsetIndic(self: *State) void { | 445 | fn unsetIndic(self: *State) void { |
| 226 | self.bits ^= 4; | 446 | self.bits &= ~@as(u3, 4); |
| 227 | } | 447 | } |
| 228 | }; | 448 | }; |
| 229 | 449 | ||
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 8b02e98..0204b92 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig | |||
| @@ -175,6 +175,80 @@ test "Segmentation GraphemeIterator" { | |||
| 175 | } | 175 | } |
| 176 | } | 176 | } |
| 177 | 177 | ||
| 178 | test "Segmentation ReverseGraphemeIterator" { | ||
| 179 | const allocator = std.testing.allocator; | ||
| 180 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{}); | ||
| 181 | defer file.close(); | ||
| 182 | var buf_reader = std.io.bufferedReader(file.reader()); | ||
| 183 | var input_stream = buf_reader.reader(); | ||
| 184 | |||
| 185 | const data = try Graphemes.init(allocator); | ||
| 186 | defer data.deinit(allocator); | ||
| 187 | |||
| 188 | var buf: [4096]u8 = undefined; | ||
| 189 | var line_no: usize = 1; | ||
| 190 | |||
| 191 | while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) { | ||
| 192 | // Skip comments or empty lines. | ||
| 193 | if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue; | ||
| 194 | |||
| 195 | // Clean up. | ||
| 196 | var line = std.mem.trimLeft(u8, raw, "÷ "); | ||
| 197 | if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| { | ||
| 198 | line = line[0..octo]; | ||
| 199 | } | ||
| 200 | // Iterate over fields. | ||
| 201 | var want = std.ArrayList(Grapheme).init(allocator); | ||
| 202 | defer want.deinit(); | ||
| 203 | |||
| 204 | var all_bytes = std.ArrayList(u8).init(allocator); | ||
| 205 | defer all_bytes.deinit(); | ||
| 206 | |||
| 207 | var graphemes = std.mem.splitSequence(u8, line, " ÷ "); | ||
| 208 | var bytes_index: u32 = 0; | ||
| 209 | |||
| 210 | while (graphemes.next()) |field| { | ||
| 211 | var code_points = std.mem.splitScalar(u8, field, ' '); | ||
| 212 | var cp_buf: [4]u8 = undefined; | ||
| 213 | var cp_index: u32 = 0; | ||
| 214 | var gc_len: u8 = 0; | ||
| 215 | |||
| 216 | while (code_points.next()) |code_point| { | ||
| 217 | if (std.mem.eql(u8, code_point, "×")) continue; | ||
| 218 | const cp: u21 = try std.fmt.parseInt(u21, code_point, 16); | ||
| 219 | const len = try unicode.utf8Encode(cp, &cp_buf); | ||
| 220 | try all_bytes.appendSlice(cp_buf[0..len]); | ||
| 221 | cp_index += len; | ||
| 222 | gc_len += len; | ||
| 223 | } | ||
| 224 | |||
| 225 | try want.append(Grapheme{ .len = gc_len, .offset = bytes_index }); | ||
| 226 | bytes_index += cp_index; | ||
| 227 | } | ||
| 228 | |||
| 229 | // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); | ||
| 230 | var iter = data.reverseIterator(all_bytes.items); | ||
| 231 | |||
| 232 | // Check. | ||
| 233 | var i: usize = want.items.len; | ||
| 234 | while (i > 0) { | ||
| 235 | i -= 1; | ||
| 236 | const want_gc = want.items[i]; | ||
| 237 | const got_gc = iter.prev() orelse { | ||
| 238 | std.debug.print("line {d} grapheme {d}: expected {any} found null\n", .{ line_no, i, want_gc }); | ||
| 239 | return error.TestExpectedEqual; | ||
| 240 | }; | ||
| 241 | std.testing.expectEqualStrings( | ||
| 242 | want_gc.bytes(all_bytes.items), | ||
| 243 | got_gc.bytes(all_bytes.items), | ||
| 244 | ) catch |err| { | ||
| 245 | std.debug.print("line {d} grapheme {d}: expected {any} found {any}\n", .{ line_no, i, want_gc, got_gc }); | ||
| 246 | return err; | ||
| 247 | }; | ||
| 248 | } | ||
| 249 | } | ||
| 250 | } | ||
| 251 | |||
| 178 | test "Segmentation Word Iterator" { | 252 | test "Segmentation Word Iterator" { |
| 179 | const allocator = std.testing.allocator; | 253 | const allocator = std.testing.allocator; |
| 180 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); | 254 | var file = try std.fs.cwd().openFile("data/unicode/auxiliary/WordBreakTest.txt", .{}); |