From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- src/Graphemes.zig | 20 +++++++++++--------- src/Words.zig | 14 ++++++++------ src/code_point.zig | 16 +++++++++------- src/unicode_tests.zig | 10 ++++++---- 4 files changed, 34 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 0338c04..49fdbf3 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -5,9 +5,11 @@ const Allocator = mem.Allocator; const compress = std.compress; const unicode = std.unicode; -const CodePoint = @import("code_point").CodePoint; -const CodePointIterator = @import("code_point").Iterator; -const CodePointReverseIterator = @import("code_point").ReverseIterator; +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; s1: []u16 = undefined, s2: []u16 = undefined, @@ -104,8 +106,8 @@ pub const Gbp = enum { /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { - len: u32, - offset: u32, + len: uoffset, + offset: uoffset, /// `bytes` returns the slice of bytes that correspond to /// this grapheme cluster in `src`. @@ -199,7 +201,7 @@ pub const ReverseIterator = struct { /// Count of pending RI codepoints, it is an even number ri_count: usize, /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji - extend_end: u32, + extend_end: uoffset, }; const Self = @This(); @@ -219,7 +221,7 @@ pub const ReverseIterator = struct { pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; - const grapheme_end: u32 = end: { + const grapheme_end: uoffset = end: { const codepoint = self.buf[1].?; switch (self.pending) { @@ -270,7 +272,7 @@ pub const ReverseIterator = struct { if (!state.hasIndic()) { // BUF: [?Any, Extend | Linker] Consonant - var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; indic: while (true) { if (self.buf[0] == null) { @@ -321,7 +323,7 @@ pub const ReverseIterator = struct { if (!state.hasXpic()) { // BUF: [?Any, ZWJ] Emoji - var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; // Look for previous Emoji emoji: while (true) { diff --git a/src/Words.zig b/src/Words.zig index 1d10b2a..1707881 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void { /// Represents a Unicode word span, as an offset into the source string /// and the length of the word. pub const Word = struct { - offset: u32, - len: u32, + offset: uoffset, + len: uoffset, /// Returns a slice of the word given the source string. pub fn bytes(word: Word, src: []const u8) []const u8 { @@ -183,7 +183,7 @@ pub const Iterator = struct { if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; const word_start = iter.this.?.offset; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -364,7 +364,7 @@ pub const ReverseIterator = struct { if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; const word_end = iter.after.?.offset + iter.after.?.len; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -518,7 +518,7 @@ pub const ReverseIterator = struct { /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} if (idx == string.len) return words.reverseIterator(string); @@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever } fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); if (idx == string.len) { return .{ .cp_iter = .{ .bytes = string, .i = idx }, @@ -746,6 +746,8 @@ const Allocator = mem.Allocator; const assert = std.debug.assert; const testing = std.testing; +const uoffset = code_point.uoffset; + const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; const ReverseCodepointIterator = code_point.ReverseIterator; diff --git a/src/code_point.zig b/src/code_point.zig index 9a84080..8bd3d5b 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -4,12 +4,14 @@ //! Represents invalid data according to the Replacement of Maximal //! Subparts algorithm. +pub const uoffset = if (@import("config").fat_offset) u64 else u32; + /// `CodePoint` represents a Unicode code point by its code, /// length, and offset in the source bytes. pub const CodePoint = struct { code: u21, len: u3, - offset: u32, + offset: uoffset, /// Return the slice of this codepoint, given the original string. pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { @@ -27,8 +29,8 @@ pub const CodePoint = struct { /// This function is deprecated and will be removed in a later release. /// Use `decodeAtIndex` or `decodeAtCursor`. -pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { - var off: u32 = 0; +pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { + var off: uoffset = 0; var maybe_code = decodeAtCursor(bytes, &off); if (maybe_code) |*code| { code.offset = offset; @@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { } /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { +pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { var off = idx; return decodeAtCursor(bytes, &off); } /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the /// cursor will point at the next potential codepoint index. -pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { +pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { // EOS if (cursor.* >= bytes.len) return null; @@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { /// `Iterator` iterates a string one `CodePoint` at-a-time. pub const Iterator = struct { bytes: []const u8, - i: u32 = 0, + i: uoffset = 0, pub fn init(bytes: []const u8) Iterator { return .{ .bytes = bytes, .i = 0 }; @@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{ pub const ReverseIterator = struct { bytes: []const u8, - i: ?u32, + i: ?uoffset, pub fn init(str: []const u8) ReverseIterator { var r_iter: ReverseIterator = undefined; diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 195fdcb..c463dcc 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" { defer all_bytes.deinit(); var graphemes = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (graphemes.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -231,12 +231,12 @@ test "Segmentation Word Iterator" { defer all_bytes.deinit(); var words = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (words.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -425,6 +425,8 @@ const debug = std.debug; const testing = std.testing; const unicode = std.unicode; +const uoffset = @FieldType(Word, "offset"); + const Grapheme = @import("Graphemes").Grapheme; const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; -- cgit v1.2.3