summaryrefslogtreecommitdiff
path: root/src/code_point.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/code_point.zig')
-rw-r--r--src/code_point.zig200
1 files changed, 188 insertions, 12 deletions
diff --git a/src/code_point.zig b/src/code_point.zig
index fe7ad6e..7a638af 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,18 +4,33 @@
4//! Represents invalid data according to the Replacement of Maximal 4//! Represents invalid data according to the Replacement of Maximal
5//! Subparts algorithm. 5//! Subparts algorithm.
6 6
7pub const uoffset = if (@import("config").fat_offset) u64 else u32;
8
/// A decoded Unicode code point: the scalar value in `code`, the
/// number of source bytes it occupies in `len`, and the byte `offset`
/// at which it starts in the source bytes.
pub const CodePoint = struct {
    code: u21,
    len: u3,
    offset: uoffset,

    /// Given the original string, return the sub-slice covering
    /// this codepoint: `len` bytes starting at `offset`.
    pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
        const tail = str[cp.offset..];
        return tail[0..cp.len];
    }

    /// Write a human-readable rendering of this codepoint to `writer`:
    /// the character itself, followed by its `code` (hex), `offset`
    /// and `len`. The format string and options are ignored.
    pub fn format(cp: CodePoint, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
        try writer.print("CodePoint '{u}' .{{ ", .{cp.code});
        const fields = .{ cp.code, cp.offset, cp.len };
        try writer.print(".code = 0x{x}, .offset = {d}, .len = {d} }}", fields);
    }
};
14 29
15/// This function is deprecated and will be removed in a later release. 30/// This function is deprecated and will be removed in a later release.
16/// Use `decodeAtIndex` or `decodeAtCursor`. 31/// Use `decodeAtIndex` or `decodeAtCursor`.
17pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { 32pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
18 var off: u32 = 0; 33 var off: uoffset = 0;
19 var maybe_code = decodeAtCursor(bytes, &off); 34 var maybe_code = decodeAtCursor(bytes, &off);
20 if (maybe_code) |*code| { 35 if (maybe_code) |*code| {
21 code.offset = offset; 36 code.offset = offset;
@@ -24,15 +39,23 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
24 return null; 39 return null;
25} 40}
26 41
/// Return the codepoint at `index`, even if `index` is in the middle
/// of that codepoint.
///
/// Starting at `index`, steps backward while the byte is in the range
/// 0x80...0xbf and the start of `bytes` has not been reached, then
/// decodes from that position with `decodeAtIndex`.
///
/// Returns `null` when `index` is at or past the end of `bytes`.
pub fn codepointAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
    // Without this guard the `bytes[idx]` read below is out of bounds
    // for an `index` at or beyond the end of the string.
    if (index >= bytes.len) return null;

    var idx = index;
    while (idx > 0 and 0x80 <= bytes[idx] and bytes[idx] <= 0xbf) : (idx -= 1) {}
    return decodeAtIndex(bytes, idx);
}
49
/// Decode the CodePoint, if any, at `bytes[index]`.
///
/// Decoding runs on a local copy of `index`, so nothing of the
/// caller's is advanced; use `decodeAtCursor` to keep the position.
pub fn decodeAtIndex(bytes: []const u8, index: uoffset) ?CodePoint {
    var cursor: uoffset = index;
    const maybe_cp = decodeAtCursor(bytes, &cursor);
    return maybe_cp;
}
32 55
33/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the 56/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the
34/// cursor will point at the next potential codepoint index. 57/// cursor will point at the next potential codepoint index.
35pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { 58pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
36 // EOS 59 // EOS
37 if (cursor.* >= bytes.len) return null; 60 if (cursor.* >= bytes.len) return null;
38 61
@@ -98,6 +121,9 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
98 } 121 }
99 if (st == RUNE_REJECT or cursor.* == bytes.len) { 122 if (st == RUNE_REJECT or cursor.* == bytes.len) {
100 @branchHint(.cold); 123 @branchHint(.cold);
124 // This, and the branch below, detect truncation, the
125 // only invalid state handled differently by the Maximal
126 // Subparts algorithm.
101 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) { 127 if (state_dfa[@intCast(u8dfa[byte])] == RUNE_REJECT) {
102 cursor.* -= 2; // +1 128 cursor.* -= 2; // +1
103 return .{ 129 return .{
@@ -148,7 +174,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
148/// `Iterator` iterates a string one `CodePoint` at-a-time. 174/// `Iterator` iterates a string one `CodePoint` at-a-time.
149pub const Iterator = struct { 175pub const Iterator = struct {
150 bytes: []const u8, 176 bytes: []const u8,
151 i: u32 = 0, 177 i: uoffset = 0,
152 178
153 pub fn init(bytes: []const u8) Iterator { 179 pub fn init(bytes: []const u8) Iterator {
154 return .{ .bytes = bytes, .i = 0 }; 180 return .{ .bytes = bytes, .i = 0 };
@@ -158,10 +184,19 @@ pub const Iterator = struct {
158 return decodeAtCursor(self.bytes, &self.i); 184 return decodeAtCursor(self.bytes, &self.i);
159 } 185 }
160 186
161 pub fn peek(self: *Iterator) ?CodePoint { 187 pub fn peek(iter: *Iterator) ?CodePoint {
162 const saved_i = self.i; 188 const saved_i = iter.i;
163 defer self.i = saved_i; 189 defer iter.i = saved_i;
164 return self.next(); 190 return iter.next();
191 }
192
193 /// Create a backward iterator at this point. It will repeat
194 /// the last CodePoint seen.
195 pub fn reverseIterator(iter: *const Iterator) ReverseIterator {
196 if (iter.i == iter.bytes.len) {
197 return .init(iter.bytes);
198 }
199 return .{ .i = iter.i, .bytes = iter.bytes };
165 } 200 }
166}; 201};
167 202
@@ -233,6 +268,55 @@ const class_mask: [12]u8 = .{
233 0, 268 0,
234}; 269};
235 270
/// `ReverseIterator` iterates a string one `CodePoint` at-a-time,
/// moving from the end of the string toward its start.
pub const ReverseIterator = struct {
    bytes: []const u8,
    /// Byte index from which the next backward scan begins, or `null`
    /// once the start of `bytes` has been consumed.
    i: ?uoffset,

    /// Create an iterator positioned on the last byte of `str`.
    pub fn init(str: []const u8) ReverseIterator {
        return .{
            .bytes = str,
            // For an empty string the index stays 0; `prev` then has
            // nothing to decode there and yields `null`.
            .i = if (str.len == 0) 0 else @intCast(str.len - 1),
        };
    }

    /// Return the CodePoint ending at the current position and move
    /// the position to just before it. Returns `null` once the start
    /// of the string has been passed.
    pub fn prev(iter: *ReverseIterator) ?CodePoint {
        var start = iter.i orelse return null;

        // Step back over follow bytes (0x80...0xbf) until a byte outside
        // that range, or index 0, is reached.
        while (start > 0 and followbyte(iter.bytes[start])) : (start -= 1) {}

        // The next call resumes on the byte just before this one;
        // `null` marks that index 0 has been consumed.
        iter.i = if (start > 0) start - 1 else null;

        // `decodeAtIndex` rather than the deprecated `decode`.
        return decodeAtIndex(iter.bytes, start);
    }

    /// Return what `prev` would return, without moving the iterator.
    pub fn peek(iter: *ReverseIterator) ?CodePoint {
        const saved_i = iter.i;
        defer iter.i = saved_i;
        return iter.prev();
    }

    /// Create a forward iterator at this point. It will repeat the
    /// last CodePoint seen.
    pub fn forwardIterator(iter: *const ReverseIterator) Iterator {
        if (iter.i) |i| {
            // Start on the current byte and discard one decode, which
            // advances the forward iterator past it.
            var fwd: Iterator = .{ .i = i, .bytes = iter.bytes };
            _ = fwd.next();
            return fwd;
        }
        // The start of the string was already consumed: begin at 0.
        return .{ .i = 0, .bytes = iter.bytes };
    }
};
315
/// Report whether `b` lies in 0x80...0xbf, i.e. whether its two
/// high bits are `10`.
inline fn followbyte(b: u8) bool {
    return (b & 0xc0) == 0x80;
}
319
236test "decode" { 320test "decode" {
237 const bytes = "๐ŸŒฉ๏ธ"; 321 const bytes = "๐ŸŒฉ๏ธ";
238 const res = decode(bytes, 0); 322 const res = decode(bytes, 0);
@@ -246,7 +330,7 @@ test "decode" {
246 } 330 }
247} 331}
248 332
249test "peek" { 333test Iterator {
250 var iter = Iterator{ .bytes = "Hi" }; 334 var iter = Iterator{ .bytes = "Hi" };
251 335
252 try expectEqual(@as(u21, 'H'), iter.next().?.code); 336 try expectEqual(@as(u21, 'H'), iter.next().?.code);
@@ -256,6 +340,54 @@ test "peek" {
256 try expectEqual(@as(?CodePoint, null), iter.next()); 340 try expectEqual(@as(?CodePoint, null), iter.next());
257} 341}
258 342
const code_point = @This();

// Keep this in sync with the README
test "Code point iterator" {
    // "Hi " is three bytes; the emoji after it is one four-byte scalar,
    // which is what the offset/len assertions below rely on.
    const str = "Hi 😊";
    var iter: code_point.Iterator = .init(str);
    var i: usize = 0;

    while (iter.next()) |cp| : (i += 1) {
        // The `code` field is the actual code point scalar as a `u21`.
        if (i == 0) try expect(cp.code == 'H');
        if (i == 1) try expect(cp.code == 'i');
        if (i == 2) try expect(cp.code == ' ');

        if (i == 3) {
            try expect(cp.code == '😊');
            // The `offset` field is the byte offset in the
            // source string.
            try expect(cp.offset == 3);
            try expectEqual(cp, code_point.decodeAtIndex(str, cp.offset).?);
            // The `len` field is the length in bytes of the
            // code point in the source string.
            try expect(cp.len == 4);
            // There is also a 'cursor' decode, like so:
            {
                var cursor = cp.offset;
                try expectEqual(cp, code_point.decodeAtCursor(str, &cursor).?);
                // Which advances the cursor variable to the next possible
                // offset, in this case, `str.len`. Don't forget to account
                // for this possibility!
                try expectEqual(cp.offset + cp.len, cursor);
            }
            // There's also this, for when you aren't sure if you have the
            // correct start for a code point:
            try expectEqual(cp, code_point.codepointAtIndex(str, cp.offset + 1).?);
        }
        // Reverse iteration is also an option:
        var r_iter: code_point.ReverseIterator = .init(str);
        // Both iterators can be peeked:
        try expectEqual('😊', r_iter.peek().?.code);
        try expectEqual('😊', r_iter.prev().?.code);
        // Both kinds of iterators can be reversed:
        var fwd_iter = r_iter.forwardIterator(); // or iter.reverseIterator();
        // This will always return the last codepoint from
        // the prior iterator, _if_ it yielded one:
        try expectEqual('😊', fwd_iter.next().?.code);
    }
}
259test "overlongs" { 391test "overlongs" {
260 // None of these should equal `/`, all should be byte-for-byte 392 // None of these should equal `/`, all should be byte-for-byte
261 // handled as replacement characters. 393 // handled as replacement characters.
@@ -346,6 +478,50 @@ test "truncation" {
346 } 478 }
347} 479}
348 480
test ReverseIterator {
    // Single-byte codepoints only: walk back to the start, then
    // confirm the iterator stays exhausted.
    {
        var r_iter: ReverseIterator = .init("ABC");
        try testing.expectEqual(@as(u21, 'C'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, 'B'), r_iter.peek().?.code);
        try testing.expectEqual(@as(u21, 'B'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, 'A'), r_iter.prev().?.code);
        try testing.expectEqual(@as(?CodePoint, null), r_iter.peek());
        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
    }
    // A mix of one-, two-, three- and four-byte codepoints; `peek`
    // must not move the iterator, however often it is called.
    {
        var r_iter: ReverseIterator = .init("∅δq🦾ă");
        try testing.expectEqual(@as(u21, 'ă'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, '🦾'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, 'q'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, 'δ'), r_iter.peek().?.code);
        try testing.expectEqual(@as(u21, 'δ'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code);
        try testing.expectEqual(@as(u21, '∅'), r_iter.peek().?.code);
        try testing.expectEqual(@as(u21, '∅'), r_iter.prev().?.code);
        try testing.expectEqual(@as(?CodePoint, null), r_iter.peek());
        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
        try testing.expectEqual(@as(?CodePoint, null), r_iter.prev());
    }
    // Switching direction back and forth between the two iterator kinds.
    {
        var r_iter: ReverseIterator = .init("123");
        try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, '1'), r_iter.prev().?.code);
        var iter = r_iter.forwardIterator();
        try testing.expectEqual(@as(u21, '1'), iter.next().?.code);
        try testing.expectEqual(@as(u21, '2'), iter.next().?.code);
        try testing.expectEqual(@as(u21, '3'), iter.next().?.code);
        r_iter = iter.reverseIterator();
        try testing.expectEqual(@as(u21, '3'), r_iter.prev().?.code);
        try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
        iter = r_iter.forwardIterator();
        r_iter = iter.reverseIterator();
        try testing.expectEqual(@as(u21, '2'), iter.next().?.code);
        try testing.expectEqual(@as(u21, '2'), r_iter.prev().?.code);
    }
}
524
349const std = @import("std"); 525const std = @import("std");
350const testing = std.testing; 526const testing = std.testing;
351const expect = testing.expect; 527const expect = testing.expect;