diff options
| author | 2025-07-08 12:15:32 -0400 | |
|---|---|---|
| committer | 2025-07-08 12:15:32 -0400 | |
| commit | 9427a9e53aaa29ee071f4dcb35b809a699d75aa9 (patch) | |
| tree | 2607c185fd8053b84d60041fadc35c05a0225d34 /src/code_point.zig | |
| parent | Merge pull request 'Fix benchmarks' (#56) from jacobsandlund/zg:benchmarks in... (diff) | |
| parent | Add Words.zig example to README (diff) | |
| download | zg-master.tar.gz zg-master.tar.xz zg-master.zip | |
Diffstat (limited to 'src/code_point.zig')
| -rw-r--r-- | src/code_point.zig | 200 |
1 files changed, 188 insertions, 12 deletions
diff --git a/src/code_point.zig b/src/code_point.zig index fe7ad6e..7a638af 100644 --- a/src/code_point.zig +++ b/src/code_point.zig | |||
| @@ -4,18 +4,33 @@ | |||
| 4 | //! Represents invalid data according to the Replacement of Maximal | 4 | //! Represents invalid data according to the Replacement of Maximal |
| 5 | //! Subparts algorithm. | 5 | //! Subparts algorithm. |
| 6 | 6 | ||
| 7 | pub const uoffset = if (@import("config").fat_offset) u64 else u32; | ||
| 8 | |||
| 7 | /// `CodePoint` represents a Unicode code point by its code, | 9 | /// `CodePoint` represents a Unicode code point by its code, |
| 8 | /// length, and offset in the source bytes. | 10 | /// length, and offset in the source bytes. |
| 9 | pub const CodePoint = struct { | 11 | pub const CodePoint = struct { |
| 10 | code: u21, | 12 | code: u21, |
| 11 | len: u3, | 13 | len: u3, |
| 12 | offset: u32, | 14 | offset: uoffset, |
| 15 | |||
| 16 | /// Return the slice of this codepoint, given the original string. | ||
| 17 | pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { | ||
| 18 | return str[cp.offset..][0..cp.len]; | ||
| 19 | } | ||
| 20 | |||
| 21 | pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { | ||
| 22 | try writer.print("CodePoint '{u}' .{{ ", .{cp.code}); | ||
| 23 | try writer.print( | ||
| 24 | ".code = 0x{x}, .offset = {d}, .len = {d} }}", | ||
| 25 | .{ cp.code, cp.offset, cp.len }, | ||
| 26 | ); | ||
| 27 | } | ||
| 13 | }; | 28 | }; |
| 14 | 29 | ||
| 15 | /// This function is deprecated and will be removed in a later release. | 30 | /// This function is deprecated and will be removed in a later release. |
| 16 | /// Use `decodeAtIndex` or `decodeAtCursor`. | 31 | /// Use `decodeAtIndex` or `decodeAtCursor`. |
| 17 | pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { | 32 | pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { |
| 18 | var off: u32 = 0; | 33 | var off: uoffset = 0; |
| 19 | var maybe_code = decodeAtCursor(bytes, &off); | 34 | var maybe_code = decodeAtCursor(bytes, &off); |
| 20 | if (maybe_code) |*code| { | 35 | if (maybe_code) |*code| { |
| 21 | code.offset = offset; | 36 | code.offset = offset; |
| @@ -24,15 +39,23 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { | |||
| 24 | return null; | 39 | return null; |
| 25 | } | 40 | } |
| 26 | 41 | ||
| 42 | /// Return the codepoint at `index`, even if `index` is in the middle | ||
| 43 | /// of that codepoint. | ||
| 44 | pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { | ||
| 45 | var idx = index; | ||
| 46 | while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {} | ||
| 47 | return decodeAtIndex(bytes, idx); | ||
| 48 | } | ||
| 49 | |||
| 27 | /// Decode the CodePoint, if any, at `bytes[idx]`. | 50 | /// Decode the CodePoint, if any, at `bytes[idx]`. |
| 28 | pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { | 51 | pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint { |
| 29 | var off = idx; | 52 | var off = index; |
| 30 | return decodeAtCursor(bytes, &off); | 53 | return decodeAtCursor(bytes, &off); |
| 31 | } | 54 | } |
| 32 | 55 | ||
| 33 | /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the | 56 | /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the |
| 34 | /// cursor will point at the next potential codepoint index. | 57 | /// cursor will point at the next potential codepoint index. |
| 35 | pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | 58 | pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { |
| 36 | // EOS | 59 | // EOS |
| 37 | if (cursor.* >= bytes.len) return null; | 60 | if (cursor.* >= bytes.len) return null; |
| 38 | 61 | ||
| @@ -98,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 98 | } | 121 | } |
| 99 | if (st == RUNE_REJECT or cursor.* == bytes.len) { | 122 | if (st == RUNE_REJECT or cursor.* == bytes.len) { |
| 100 | @branchHint(.cold); | 123 | @branchHint(.cold); |
| 124 | // This, and the branch below, detect truncation, the | ||
| 125 | // only invalid state handled differently by the Maximal | ||
| 126 | // Subparts algorithm. | ||
| 101 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { | 127 | if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { |
| 102 | cursor.* -= 2; // +1 | 128 | cursor.* -= 2; // +1 |
| 103 | return .{ | 129 | return .{ |
| @@ -148,7 +174,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { | |||
| 148 | /// `Iterator` iterates a string one `CodePoint` at-a-time. | 174 | /// `Iterator` iterates a string one `CodePoint` at-a-time. |
| 149 | pub const Iterator = struct { | 175 | pub const Iterator = struct { |
| 150 | bytes: []const u8, | 176 | bytes: []const u8, |
| 151 | i: u32 = 0, | 177 | i: uoffset = 0, |
| 152 | 178 | ||
| 153 | pub fn init(bytes: []const u8) Iterator { | 179 | pub fn init(bytes: []const u8) Iterator { |
| 154 | return .{ .bytes = bytes, .i = 0 }; | 180 | return .{ .bytes = bytes, .i = 0 }; |
| @@ -158,10 +184,19 @@ pub const Iterator = struct { | |||
| 158 | return decodeAtCursor(self.bytes, &self.i); | 184 | return decodeAtCursor(self.bytes, &self.i); |
| 159 | } | 185 | } |
| 160 | 186 | ||
| 161 | pub fn peek(self: *Iterator) ?CodePoint { | 187 | pub fn peek(iter: *Iterator) ?CodePoint { |
| 162 | const saved_i = self.i; | 188 | const saved_i = iter.i; |
| 163 | defer self.i = saved_i; | 189 | defer iter.i = saved_i; |
| 164 | return self.next(); | 190 | return iter.next(); |
| 191 | } | ||
| 192 | |||
| 193 | /// Create a backward iterator at this point. It will repeat | ||
| 194 | /// the last CodePoint seen. | ||
| 195 | pub fn reverseIterator(iter: *const Iterator) ReverseIterator { | ||
| 196 | if (iter.i == iter.bytes.len) { | ||
| 197 | return .init(iter.bytes); | ||
| 198 | } | ||
| 199 | return .{ .i = iter.i, .bytes = iter.bytes }; | ||
| 165 | } | 200 | } |
| 166 | }; | 201 | }; |
| 167 | 202 | ||
| @@ -233,6 +268,55 @@ const class_mask: [12]u8 = .{ | |||
| 233 | 0, | 268 | 0, |
| 234 | }; | 269 | }; |
| 235 | 270 | ||
| 271 | pub const ReverseIterator = struct { | ||
| 272 | bytes: []const u8, | ||
| 273 | i: ?uoffset, | ||
| 274 | |||
| 275 | pub fn init(str: []const u8) ReverseIterator { | ||
| 276 | var r_iter: ReverseIterator = undefined; | ||
| 277 | r_iter.bytes = str; | ||
| 278 | r_iter.i = if (str.len == 0) 0 else @intCast(str.len - 1); | ||
| 279 | return r_iter; | ||
| 280 | } | ||
| 281 | |||
| 282 | pub fn prev(iter: *ReverseIterator) ?CodePoint { | ||
| 283 | if (iter.i == null) return null; | ||
| 284 | var i_prev = iter.i.?; | ||
| 285 | |||
| 286 | while (i_prev > 0) : (i_prev -= 1) { | ||
| 287 | if (!followbyte(iter.bytes[i_prev])) break; | ||
| 288 | } | ||
| 289 | |||
| 290 | if (i_prev > 0) | ||
| 291 | iter.i = i_prev - 1 | ||
| 292 | else | ||
| 293 | iter.i = null; | ||
| 294 | |||
| 295 | return decode(iter.bytes[i_prev..], i_prev); | ||
| 296 | } | ||
| 297 | |||
| 298 | pub fn peek(iter: *ReverseIterator) ?CodePoint { | ||
| 299 | const saved_i = iter.i; | ||
| 300 | defer iter.i = saved_i; | ||
| 301 | return iter.prev(); | ||
| 302 | } | ||
| 303 | |||
| 304 | /// Create a forward iterator at this point. It will repeat the | ||
| 305 | /// last CodePoint seen. | ||
| 306 | pub fn forwardIterator(iter: *const ReverseIterator) Iterator { | ||
| 307 | if (iter.i) |i| { | ||
| 308 | var fwd: Iterator = .{ .i = i, .bytes = iter.bytes }; | ||
| 309 | _ = fwd.next(); | ||
| 310 | return fwd; | ||
| 311 | } | ||
| 312 | return .{ .i = 0, .bytes = iter.bytes }; | ||
| 313 | } | ||
| 314 | }; | ||
| 315 | |||
| 316 | inline fn followbyte(b: u8) bool { | ||
| 317 | return 0x80 <= b and b <= 0xbf; | ||
| 318 | } | ||
| 319 | |||
| 236 | test "decode" { | 320 | test "decode" { |
| 237 | const bytes = "🌩️"; | 321 | const bytes = "🌩️"; |
| 238 | const res = decode(bytes, 0); | 322 | const res = decode(bytes, 0); |
| @@ -246,7 +330,7 @@ test "decode" { | |||
| 246 | } | 330 | } |
| 247 | } | 331 | } |
| 248 | 332 | ||
| 249 | test "peek" { | 333 | test Iterator { |
| 250 | var iter = Iterator{ .bytes = "Hi" }; | 334 | var iter = Iterator{ .bytes = "Hi" }; |
| 251 | 335 | ||
| 252 | try expectEqual(@as(u21, 'H'), iter.next().?.code); | 336 | try expectEqual(@as(u21, 'H'), iter.next().?.code); |
| @@ -256,6 +340,54 @@ test "peek" { | |||
| 256 | try expectEqual(@as(?CodePoint, null), iter.next()); | 340 | try expectEqual(@as(?CodePoint, null), iter.next()); |
| 257 | } | 341 | } |
| 258 | 342 | ||
| 343 | const code_point = @This(); | ||
| 344 | |||
| 345 | // Keep this in sync with the README | ||
| 346 | test "Code point iterator" { | ||
| 347 | const str = "Hi 😊"; | ||
| 348 | var iter: code_point.Iterator = .init(str); | ||
| 349 | var i: usize = 0; | ||
| 350 | |||
| 351 | while (iter.next()) |cp| : (i += 1) { | ||
| 352 | // The `code` field is the actual code point scalar as a `u21`. | ||
| 353 | if (i == 0) try expect(cp.code == 'H'); | ||
| 354 | if (i == 1) try expect(cp.code == 'i'); | ||
| 355 | if (i == 2) try expect(cp.code == ' '); | ||
| 356 | |||
| 357 | if (i == 3) { | ||
| 358 | try expect(cp.code == '😊'); | ||
| 359 | // The `offset` field is the byte offset in the | ||
| 360 | // source string. | ||
| 361 | try expect(cp.offset == 3); | ||
| 362 | try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?); | ||
| 363 | // The `len` field is the length in bytes of the | ||
| 364 | // code point in the source string. | ||
| 365 | try expect(cp.len == 4); | ||
| 366 | // There is also a 'cursor' decode, like so: | ||
| 367 | { | ||
| 368 | var cursor = cp.offset; | ||
| 369 | try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?); | ||
| 370 | // Which advances the cursor variable to the next possible | ||
| 371 | // offset, in this case, `str.len`. Don't forget to account | ||
| 372 | // for this possibility! | ||
| 373 | try expectEqual(cp.offset + cp.len, cursor); | ||
| 374 | } | ||
| 375 | // There's also this, for when you aren't sure if you have the | ||
| 376 | // correct start for a code point: | ||
| 377 | try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?); | ||
| 378 | } | ||
| 379 | // Reverse iteration is also an option: | ||
| 380 | var r_iter: code_point.ReverseIterator = .init(str); | ||
| 381 | // Both iterators can be peeked: | ||
| 382 | try expectEqual('😊', r_iter.peek().?.code); | ||
| 383 | try expectEqual('😊', r_iter.prev().?.code); | ||
| 384 | // Both kinds of iterators can be reversed: | ||
| 385 | var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator(); | ||
| 386 | // This will always return the last codepoint from | ||
| 387 | // the prior iterator, _if_ it yielded one: | ||
| 388 | try expectEqual('😊', fwd_iter.next().?.code); | ||
| 389 | } | ||
| 390 | } | ||
| 259 | test "overlongs" { | 391 | test "overlongs" { |
| 260 | // None of these should equal `/`, all should be byte-for-byte | 392 | // None of these should equal `/`, all should be byte-for-byte |
| 261 | // handled as replacement characters. | 393 | // handled as replacement characters. |
| @@ -346,6 +478,50 @@ test "truncation" { | |||
| 346 | } | 478 | } |
| 347 | } | 479 | } |
| 348 | 480 | ||
| 481 | test ReverseIterator { | ||
| 482 | { | ||
| 483 | var r_iter: ReverseIterator = .init("ABC"); | ||
| 484 | try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code); | ||
| 485 | try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code); | ||
| 486 | try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code); | ||
| 487 | try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code); | ||
| 488 | try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); | ||
| 489 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 490 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 491 | } | ||
| 492 | { | ||
| 493 | var r_iter: ReverseIterator = .init("∅δq🦾ă"); | ||
| 494 | try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code); | ||
| 495 | try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code); | ||
| 496 | try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code); | ||
| 497 | try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code); | ||
| 498 | try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code); | ||
| 499 | try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); | ||
| 500 | try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code); | ||
| 501 | try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code); | ||
| 502 | try testing.expectEqual(@as(?CodePoint, null), r_iter.peek()); | ||
| 503 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 504 | try testing.expectEqual(@as(?CodePoint, null), r_iter.prev()); | ||
| 505 | } | ||
| 506 | { | ||
| 507 | var r_iter: ReverseIterator = .init("123"); | ||
| 508 | try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); | ||
| 509 | try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); | ||
| 510 | try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code); | ||
| 511 | var iter = r_iter.forwardIterator(); | ||
| 512 | try testing.expectEqual(@as(u21, '1'), iter.next().?.code); | ||
| 513 | try testing.expectEqual(@as(u21, '2'), iter.next().?.code); | ||
| 514 | try testing.expectEqual(@as(u21, '3'), iter.next().?.code); | ||
| 515 | r_iter = iter.reverseIterator(); | ||
| 516 | try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code); | ||
| 517 | try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); | ||
| 518 | iter = r_iter.forwardIterator(); | ||
| 519 | r_iter = iter.reverseIterator(); | ||
| 520 | try testing.expectEqual(@as(u21, '2'), iter.next().?.code); | ||
| 521 | try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code); | ||
| 522 | } | ||
| 523 | } | ||
| 524 | |||
| 349 | const std = @import("std"); | 525 | const std = @import("std"); |
| 350 | const testing = std.testing; | 526 | const testing = std.testing; |
| 351 | const expect = testing.expect; | 527 | const expect = testing.expect; |