From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001
From: Sam Atman
Date: Fri, 23 May 2025 18:46:30 -0400
Subject: Make offset size configurable

Hopefully I can talk users out of taking advantage of this configuration,
but I'll have better luck with that if it's available.
---
 build.zig             | 54 ++++++++++++++++++++++++++++++---------------------
 src/Graphemes.zig     | 20 ++++++++++---------
 src/Words.zig         | 14 +++++++------
 src/code_point.zig    | 16 ++++++++-------
 src/unicode_tests.zig | 10 ++++++----
 5 files changed, 66 insertions(+), 48 deletions(-)
diff --git a/build.zig b/build.zig
index 8cfa039..648571b 100644
--- a/build.zig
+++ b/build.zig
@@ -11,7 +11,34 @@ pub fn build(b: *std.Build) void {
         .optimize = optimize,
     });
 
-    // Code generation
+    //| Options
+
+    // Display width
+    const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false;
+    const dwp_options = b.addOptions();
+    dwp_options.addOption(bool, "cjk", cjk);
+
+    // Visible Controls
+    const c0_width = b.option(
+        i4,
+        "c0_width",
+        "C0 controls have this width (default: 0, <BS> <Del> default -1)",
+    );
+    dwp_options.addOption(?i4, "c0_width", c0_width);
+    const c1_width = b.option(
+        i4,
+        "c1_width",
+        "C1 controls have this width (default: 0)",
+    );
+    dwp_options.addOption(?i4, "c1_width", c1_width);
+
+    //| Offset size
+    const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false;
+    const size_config = b.addOptions();
+    size_config.addOption(bool, "fat_offset", fat_offset);
+
+    //| Code generation
+
     // Grapheme break
     const gbp_gen_exe = b.addExecutable(.{
         .name = "gbp",
@@ -31,32 +58,13 @@ pub fn build(b: *std.Build) void {
     const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe);
     const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z");
 
-    // Display width
-    const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false;
-    const options = b.addOptions();
-    options.addOption(bool, "cjk", cjk);
-
-    // Visible Controls
-    const c0_width = b.option(
-        i4,
-        "c0_width",
-        "C0 controls have this width (default: 0, <BS> <Del> default -1)",
-    );
-    options.addOption(?i4, "c0_width", c0_width);
-    const c1_width = b.option(
-        i4,
-        "c1_width",
-        "C1 controls have this width (default: 0)",
-    );
-    options.addOption(?i4, "c1_width", c1_width);
-
     const dwp_gen_exe = b.addExecutable(.{
         .name = "dwp",
         .root_source_file = b.path("codegen/dwp.zig"),
         .target = b.graph.host,
         .optimize = .Debug,
     });
-    dwp_gen_exe.root_module.addOptions("options", options);
+    dwp_gen_exe.root_module.addOptions("options", dwp_options);
     const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe);
     const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z");
 
@@ -199,6 +207,7 @@ pub fn build(b: *std.Build) void {
         .target = target,
         .optimize = optimize,
     });
+    code_point.addOptions("config", size_config);
 
     const code_point_t = b.addTest(.{
         .name = "code_point",
@@ -216,6 +225,7 @@ pub fn build(b: *std.Build) void {
     });
     graphemes.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out });
     graphemes.addImport("code_point", code_point);
+    graphemes.addOptions("config", size_config);
 
     const grapheme_t = b.addTest(.{
         .name = "Graphemes",
@@ -267,7 +277,7 @@ pub fn build(b: *std.Build) void {
     display_width.addImport("ascii", ascii);
     display_width.addImport("code_point", code_point);
     display_width.addImport("Graphemes", graphemes);
-    display_width.addOptions("options", options); // For testing
+    display_width.addOptions("options", dwp_options); // For testing
 
     const display_width_t = b.addTest(.{
         .name = "display_width",
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 0338c04..49fdbf3 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -5,9 +5,11 @@ const Allocator = mem.Allocator;
 const compress = std.compress;
 const unicode = std.unicode;
 
-const CodePoint = @import("code_point").CodePoint;
-const CodePointIterator = @import("code_point").Iterator;
-const CodePointReverseIterator = @import("code_point").ReverseIterator;
+const code_point = @import("code_point");
+const CodePoint = code_point.CodePoint;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
 
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -104,8 +106,8 @@ pub const Gbp = enum {
 
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
-    len: u32,
-    offset: u32,
+    len: uoffset,
+    offset: uoffset,
 
     /// `bytes` returns the slice of bytes that correspond to
     /// this grapheme cluster in `src`.
@@ -199,7 +201,7 @@ pub const ReverseIterator = struct {
         /// Count of pending RI codepoints, it is an even number
         ri_count: usize,
         /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
-        extend_end: u32,
+        extend_end: uoffset,
     };
 
     const Self = @This();
@@ -219,7 +221,7 @@ pub const ReverseIterator = struct {
     pub fn prev(self: *Self) ?Grapheme {
         if (self.buf[1] == null) return null;
 
-        const grapheme_end: u32 = end: {
+        const grapheme_end: uoffset = end: {
             const codepoint = self.buf[1].?;
 
             switch (self.pending) {
@@ -270,7 +272,7 @@ pub const ReverseIterator = struct {
             if (!state.hasIndic()) {
 
                 // BUF: [?Any, Extend | Linker] Consonant
-                var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
 
                 indic: while (true) {
                     if (self.buf[0] == null) {
@@ -321,7 +323,7 @@ pub const ReverseIterator = struct {
 
             if (!state.hasXpic()) {
                 // BUF: [?Any, ZWJ] Emoji
-                var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
 
                 // Look for previous Emoji
                 emoji: while (true) {
diff --git a/src/Words.zig b/src/Words.zig
index 1d10b2a..1707881 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
 /// Represents a Unicode word span, as an offset into the source string
 /// and the length of the word.
 pub const Word = struct {
-    offset: u32,
-    len: u32,
+    offset: uoffset,
+    len: uoffset,
 
     /// Returns a slice of the word given the source string.
     pub fn bytes(word: Word, src: []const u8) []const u8 {
@@ -183,7 +183,7 @@ pub const Iterator = struct {
         if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
 
         const word_start = iter.this.?.offset;
-        var word_len: u32 = 0;
+        var word_len: uoffset = 0;
 
         // State variables.
         var last_p: WordBreakProperty = .none;
@@ -364,7 +364,7 @@ pub const ReverseIterator = struct {
         if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
 
         const word_end = iter.after.?.offset + iter.after.?.len;
-        var word_len: u32 = 0;
+        var word_len: uoffset = 0;
 
         // State variables.
         var last_p: WordBreakProperty = .none;
@@ -518,7 +518,7 @@ pub const ReverseIterator = struct {
 
 /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
 fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
-    var idx: u32 = @intCast(index);
+    var idx: uoffset = @intCast(index);
     // Find the next lead byte:
     while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
     if (idx == string.len) return words.reverseIterator(string);
@@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever
 }
 
 fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
-    var idx: u32 = @intCast(index);
+    var idx: uoffset = @intCast(index);
     if (idx == string.len) {
         return .{
             .cp_iter = .{ .bytes = string, .i = idx },
@@ -746,6 +746,8 @@ const Allocator = mem.Allocator;
 const assert = std.debug.assert;
 const testing = std.testing;
 
+const uoffset = code_point.uoffset;
+
 const code_point = @import("code_point");
 const CodepointIterator = code_point.Iterator;
 const ReverseCodepointIterator = code_point.ReverseIterator;
diff --git a/src/code_point.zig b/src/code_point.zig
index 9a84080..8bd3d5b 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,12 +4,14 @@
 //! Represents invalid data according to the Replacement of Maximal
 //! Subparts algorithm.
 
+pub const uoffset = if (@import("config").fat_offset) u64 else u32;
+
 /// `CodePoint` represents a Unicode code point by its code,
 /// length, and offset in the source bytes.
 pub const CodePoint = struct {
     code: u21,
     len: u3,
-    offset: u32,
+    offset: uoffset,
 
     /// Return the slice of this codepoint, given the original string.
     pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
@@ -27,8 +29,8 @@ pub const CodePoint = struct {
 
 /// This function is deprecated and will be removed in a later release.
 /// Use `decodeAtIndex` or `decodeAtCursor`.
-pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
-    var off: u32 = 0;
+pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
+    var off: uoffset = 0;
     var maybe_code = decodeAtCursor(bytes, &off);
     if (maybe_code) |*code| {
         code.offset = offset;
@@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
 }
 
 /// Decode the CodePoint, if any, at `bytes[idx]`.
-pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint {
+pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint {
     var off = idx;
     return decodeAtCursor(bytes, &off);
 }
 
 /// Decode the CodePoint, if any, at `bytes[cursor.*]`.  After, the
 /// cursor will point at the next potential codepoint index.
-pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
+pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
     // EOS
     if (cursor.* >= bytes.len) return null;
 
@@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
 /// `Iterator` iterates a string one `CodePoint` at-a-time.
 pub const Iterator = struct {
     bytes: []const u8,
-    i: u32 = 0,
+    i: uoffset = 0,
 
     pub fn init(bytes: []const u8) Iterator {
         return .{ .bytes = bytes, .i = 0 };
@@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{
 
 pub const ReverseIterator = struct {
     bytes: []const u8,
-    i: ?u32,
+    i: ?uoffset,
 
     pub fn init(str: []const u8) ReverseIterator {
         var r_iter: ReverseIterator = undefined;
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 195fdcb..c463dcc 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" {
         defer all_bytes.deinit();
 
         var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
-        var bytes_index: u32 = 0;
+        var bytes_index: uoffset = 0;
 
         while (graphemes.next()) |field| {
             var code_points = std.mem.splitScalar(u8, field, ' ');
             var cp_buf: [4]u8 = undefined;
-            var cp_index: u32 = 0;
+            var cp_index: uoffset = 0;
             var gc_len: u8 = 0;
 
             while (code_points.next()) |code_point| {
@@ -231,12 +231,12 @@ test "Segmentation Word Iterator" {
         defer all_bytes.deinit();
 
         var words = std.mem.splitSequence(u8, line, " ÷ ");
-        var bytes_index: u32 = 0;
+        var bytes_index: uoffset = 0;
 
         while (words.next()) |field| {
             var code_points = std.mem.splitScalar(u8, field, ' ');
             var cp_buf: [4]u8 = undefined;
-            var cp_index: u32 = 0;
+            var cp_index: uoffset = 0;
             var gc_len: u8 = 0;
 
             while (code_points.next()) |code_point| {
@@ -425,6 +425,8 @@ const debug = std.debug;
 const testing = std.testing;
 const unicode = std.unicode;
 
+const uoffset = @FieldType(Word, "offset");
+
 const Grapheme = @import("Graphemes").Grapheme;
 const Graphemes = @import("Graphemes");
 const GraphemeIterator = @import("Graphemes").Iterator;
-- 
cgit v1.2.3