//! `CodePoint` represents a Unicode code point by its code, length, and offset in the source bytes.

// Provenance: src/CodePoint.zig as introduced by commit 3d06cc6566d66f5ff3188da167424699404ef3ad
// ("Removed unreachables from CodePointIterator", Jose Colon Rodriguez, 2024-02-13).

const std = @import("std");

/// The decoded code point value.
code: u21,
/// Number of bytes (1 to 4) the code point occupies in the source.
len: u3,
/// Index of the code point's first byte within the source bytes.
offset: usize,

const CodePoint = @This();

/// `CodePointIterator` iterates a string one `CodePoint` at-a-time.
///
/// The input is expected to be UTF-8. An invalid start byte or a sequence cut
/// short by the end of the input causes a panic. Continuation bytes are not
/// validated; their low six bits are used as-is.
pub const CodePointIterator = struct {
    bytes: []const u8,
    i: usize = 0,

    /// Returns the next code point and advances past it, or null once all
    /// bytes have been consumed.
    pub fn next(self: *CodePointIterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const start = self.i;
        const lead = self.bytes[start];

        if (lead < 128) {
            // ASCII fast path
            self.i += 1;

            return CodePoint{
                .code = lead,
                .len = 1,
                .offset = start,
            };
        }

        const len: u3 = switch (lead) {
            0b1100_0000...0b1101_1111 => 2,
            0b1110_0000...0b1110_1111 => 3,
            0b1111_0000...0b1111_0111 => 4,
            else => @panic("CodePointIterator.next: Invalid code point start byte."),
        };

        // Without this check the slice below would run past the end of the
        // input, which is undefined behavior in unchecked build modes.
        if (self.bytes.len - start < len) {
            @panic("CodePointIterator.next: Truncated code point at end of input.");
        }

        const cp_bytes = self.bytes[start..][0..len];
        self.i += len;

        const code: u21 = switch (len) {
            2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),

            3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
                (cp_bytes[1] & 0b00111111)) << 6) |
                (cp_bytes[2] & 0b00111111),

            4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
                (cp_bytes[1] & 0b00111111)) << 6) |
                (cp_bytes[2] & 0b00111111)) << 6) |
                (cp_bytes[3] & 0b00111111),

            else => @panic("CodePointIterator.next invalid code point length."),
        };

        return CodePoint{
            .code = code,
            .len = len,
            .offset = start,
        };
    }

    /// Returns the code point `next` would return, without advancing.
    pub fn peek(self: *CodePointIterator) ?CodePoint {
        const saved_i = self.i;
        defer self.i = saved_i;
        return self.next();
    }
};

test "CodePointIterator peek" {
    var iter = CodePointIterator{ .bytes = "Hi" };

    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
}

test "CodePointIterator multi-byte lengths and offsets" {
    var iter = CodePointIterator{ .bytes = "aé😹" };

    const a = iter.next().?;
    try std.testing.expectEqual(@as(u21, 'a'), a.code);
    try std.testing.expectEqual(@as(u3, 1), a.len);
    try std.testing.expectEqual(@as(usize, 0), a.offset);

    const e_acute = iter.next().?;
    try std.testing.expectEqual(@as(u21, 'é'), e_acute.code);
    try std.testing.expectEqual(@as(u3, 2), e_acute.len);
    try std.testing.expectEqual(@as(usize, 1), e_acute.offset);

    const cat = iter.next().?;
    try std.testing.expectEqual(@as(u21, '😹'), cat.code);
    try std.testing.expectEqual(@as(u3, 4), cat.len);
    try std.testing.expectEqual(@as(usize, 3), cat.offset);

    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
}

/// `readCodePoint` returns the next code point code as a `u21` in the given reader, or null at end-of-input.
///
/// `reader` must provide `readByte` and `readAll`. Returns `error.InvalidUtf8`
/// when the first byte is not a valid start byte, when the input ends in the
/// middle of a sequence, or when a trailing byte is not a continuation byte
/// (`0b10xx_xxxx`). Errors from the reader other than end-of-stream on the
/// first byte are passed through.
pub fn readCodePoint(reader: anytype) !?u21 {
    var buf: [4]u8 = undefined;

    buf[0] = reader.readByte() catch |err| switch (err) {
        error.EndOfStream => return null,
        else => return err,
    };

    if (buf[0] < 128) return @as(u21, buf[0]);

    const len: u3 = switch (buf[0]) {
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        else => return error.InvalidUtf8,
    };

    // `readAll` keeps reading until the slice is full or the stream ends. A
    // single `read` may return fewer bytes than requested even when more are
    // coming, which would misreport a valid sequence as invalid.
    const tail = buf[1..len];
    const filled = try reader.readAll(tail);

    if (filled < tail.len) return error.InvalidUtf8;

    for (tail) |byte| {
        if ((byte & 0b1100_0000) != 0b1000_0000) return error.InvalidUtf8;
    }

    return switch (len) {
        2 => (@as(u21, (buf[0] & 0b00011111)) << 6) | (buf[1] & 0b00111111),

        3 => (((@as(u21, (buf[0] & 0b00001111)) << 6) |
            (buf[1] & 0b00111111)) << 6) |
            (buf[2] & 0b00111111),

        4 => (((((@as(u21, (buf[0] & 0b00000111)) << 6) |
            (buf[1] & 0b00111111)) << 6) |
            (buf[2] & 0b00111111)) << 6) |
            (buf[3] & 0b00111111),

        else => @panic("readCodePoint invalid code point length."),
    };
}

test "readCodePoint" {
    var buf = "abé😹".*;
    var fis = std.io.fixedBufferStream(&buf);
    const reader = fis.reader();

    try std.testing.expectEqual(@as(u21, 'a'), (try readCodePoint(reader)).?);
    try std.testing.expectEqual(@as(u21, 'b'), (try readCodePoint(reader)).?);
    try std.testing.expectEqual(@as(u21, 'é'), (try readCodePoint(reader)).?);
    try std.testing.expectEqual(@as(u21, '😹'), (try readCodePoint(reader)).?);
    try std.testing.expectEqual(@as(?u21, null), try readCodePoint(reader));
}

test "readCodePoint rejects malformed input" {
    // Start byte of a two-byte sequence followed by a non-continuation byte.
    var bad_tail = [_]u8{ 0xC3, 'A' };
    var bad_tail_stream = std.io.fixedBufferStream(&bad_tail);
    try std.testing.expectError(error.InvalidUtf8, readCodePoint(bad_tail_stream.reader()));

    // Three-byte sequence cut short by the end of the stream.
    var truncated = [_]u8{ 0xE2, 0x82 };
    var truncated_stream = std.io.fixedBufferStream(&truncated);
    try std.testing.expectError(error.InvalidUtf8, readCodePoint(truncated_stream.reader()));

    // A lone continuation byte is not a valid start byte.
    var stray = [_]u8{0x80};
    var stray_stream = std.io.fixedBufferStream(&stray);
    try std.testing.expectError(error.InvalidUtf8, readCodePoint(stray_stream.reader()));
}

test "readCodePoint tolerates short reads" {
    // Minimal reader whose `read` hands out at most one byte per call.
    const TrickleReader = struct {
        const Self = @This();

        bytes: []const u8,
        pos: usize = 0,

        pub fn read(self: *Self, dest: []u8) error{}!usize {
            if (dest.len == 0 or self.pos >= self.bytes.len) return 0;
            dest[0] = self.bytes[self.pos];
            self.pos += 1;
            return 1;
        }

        pub fn readAll(self: *Self, dest: []u8) error{}!usize {
            var filled: usize = 0;
            while (filled < dest.len) {
                const n = try self.read(dest[filled..]);
                if (n == 0) break;
                filled += n;
            }
            return filled;
        }

        pub fn readByte(self: *Self) error{EndOfStream}!u8 {
            var one: [1]u8 = undefined;
            const n = try self.read(&one);
            if (n == 0) return error.EndOfStream;
            return one[0];
        }
    };

    var trickle = TrickleReader{ .bytes = "😹é" };

    try std.testing.expectEqual(@as(u21, '😹'), (try readCodePoint(&trickle)).?);
    try std.testing.expectEqual(@as(u21, 'é'), (try readCodePoint(&trickle)).?);
    try std.testing.expectEqual(@as(?u21, null), try readCodePoint(&trickle));
}