Make offset size configurable

Hopefully I can talk users out of taking advantage of this configuration, but I'll have better luck persuading them if the option is at least available.
author: Sam Atman 2025-05-23 18:46:30 -0400
committer: Sam Atman 2025-05-23 18:46:30 -0400
commit: c9a1b3392973ee30e6a9a532f1da8605619b5b06 (patch)
tree: 1198b2fcb544bcef9f634cf507d848d82548f00a
parent: Add iterateBefore and iterateAfter (diff)
download: zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.gz
zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.xz
zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.zip
5 files changed, 66 insertions, 48 deletions
diff --git a/build.zig b/build.zig
index 8cfa039..648571b 100644
--- a/build.zig
+++ b/build.zig
@@ -11,7 +11,34 @@ pub fn build(b: *std.Build) void {
        .optimize = optimize,
    });
-    // Code generation
+    //| Options
+    // Display width
+    const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false;
+    const dwp_options = b.addOptions();
+    dwp_options.addOption(bool, "cjk", cjk);
+    // Visible Controls
+    const c0_width = b.option(
+        i4,
+        "c0_width",
+        "C0 controls have this width (default: 0, <BS> <Del> default -1)",
+    );
+    dwp_options.addOption(?i4, "c0_width", c0_width);
+    const c1_width = b.option(
+        i4,
+        "c1_width",
+        "C1 controls have this width (default: 0)",
+    );
+    dwp_options.addOption(?i4, "c1_width", c1_width);
+    //| Offset size
+    const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false;
+    const size_config = b.addOptions();
+    size_config.addOption(bool, "fat_offset", fat_offset);
+    //| Code generation
    // Grapheme break
    const gbp_gen_exe = b.addExecutable(.{
        .name = "gbp",
@@ -31,32 +58,13 @@ pub fn build(b: *std.Build) void {
    const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe);
    const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z");
-    // Display width
-    const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false;
-    const options = b.addOptions();
-    options.addOption(bool, "cjk", cjk);
-    // Visible Controls
-    const c0_width = b.option(
-        i4,
-        "c0_width",
-        "C0 controls have this width (default: 0, <BS> <Del> default -1)",
-    );
-    options.addOption(?i4, "c0_width", c0_width);
-    const c1_width = b.option(
-        i4,
-        "c1_width",
-        "C1 controls have this width (default: 0)",
-    );
-    options.addOption(?i4, "c1_width", c1_width);
    const dwp_gen_exe = b.addExecutable(.{
        .name = "dwp",
        .root_source_file = b.path("codegen/dwp.zig"),
        .target = b.graph.host,
        .optimize = .Debug,
    });
-    dwp_gen_exe.root_module.addOptions("options", options);
+    dwp_gen_exe.root_module.addOptions("options", dwp_options);
    const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe);
    const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z");
@@ -199,6 +207,7 @@ pub fn build(b: *std.Build) void {
        .target = target,
        .optimize = optimize,
    });
+    code_point.addOptions("config", size_config);
    const code_point_t = b.addTest(.{
        .name = "code_point",
@@ -216,6 +225,7 @@ pub fn build(b: *std.Build) void {
    });
    graphemes.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out });
    graphemes.addImport("code_point", code_point);
+    graphemes.addOptions("config", size_config);
    const grapheme_t = b.addTest(.{
        .name = "Graphemes",
@@ -267,7 +277,7 @@ pub fn build(b: *std.Build) void {
    display_width.addImport("ascii", ascii);
    display_width.addImport("code_point", code_point);
    display_width.addImport("Graphemes", graphemes);
-    display_width.addOptions("options", options); // For testing
+    display_width.addOptions("options", dwp_options); // For testing
    const display_width_t = b.addTest(.{
        .name = "display_width",
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 0338c04..49fdbf3 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -5,9 +5,11 @@ const Allocator = mem.Allocator;
 const compress = std.compress;
 const unicode = std.unicode;
-const CodePoint = @import("code_point").CodePoint;
+const code_point = @import("code_point");
-const CodePointIterator = @import("code_point").Iterator;
+const CodePoint = code_point.CodePoint;
-const CodePointReverseIterator = @import("code_point").ReverseIterator;
+const CodePointIterator = code_point.Iterator;
+const CodePointReverseIterator = code_point.ReverseIterator;
+const uoffset = code_point.uoffset;
 s1: []u16 = undefined,
 s2: []u16 = undefined,
@@ -104,8 +106,8 @@ pub const Gbp = enum {
 /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
 pub const Grapheme = struct {
-    len: u32,
+    len: uoffset,
-    offset: u32,
+    offset: uoffset,
    /// `bytes` returns the slice of bytes that correspond to
    /// this grapheme cluster in `src`.
@@ -199,7 +201,7 @@ pub const ReverseIterator = struct {
        /// Count of pending RI codepoints, it is an even number
        ri_count: usize,
        /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
-        extend_end: u32,
+        extend_end: uoffset,
    };
    const Self = @This();
@@ -219,7 +221,7 @@ pub const ReverseIterator = struct {
    pub fn prev(self: *Self) ?Grapheme {
        if (self.buf[1] == null) return null;
-        const grapheme_end: u32 = end: {
+        const grapheme_end: uoffset = end: {
            const codepoint = self.buf[1].?;
            switch (self.pending) {
@@ -270,7 +272,7 @@ pub const ReverseIterator = struct {
            if (!state.hasIndic()) {
                // BUF: [?Any, Extend | Linker] Consonant
-                var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
                indic: while (true) {
                    if (self.buf[0] == null) {
@@ -321,7 +323,7 @@ pub const ReverseIterator = struct {
            if (!state.hasXpic()) {
                // BUF: [?Any, ZWJ] Emoji
-                var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len;
+                var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
                // Look for previous Emoji
                emoji: while (true) {
diff --git a/src/Words.zig b/src/Words.zig
index 1d10b2a..1707881 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
 /// Represents a Unicode word span, as an offset into the source string
 /// and the length of the word.
 pub const Word = struct {
-    offset: u32,
+    offset: uoffset,
-    len: u32,
+    len: uoffset,
    /// Returns a slice of the word given the source string.
    pub fn bytes(word: Word, src: []const u8) []const u8 {
@@ -183,7 +183,7 @@ pub const Iterator = struct {
        if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
        const word_start = iter.this.?.offset;
-        var word_len: u32 = 0;
+        var word_len: uoffset = 0;
        // State variables.
        var last_p: WordBreakProperty = .none;
@@ -364,7 +364,7 @@ pub const ReverseIterator = struct {
        if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
        const word_end = iter.after.?.offset + iter.after.?.len;
-        var word_len: u32 = 0;
+        var word_len: uoffset = 0;
        // State variables.
        var last_p: WordBreakProperty = .none;
@@ -518,7 +518,7 @@ pub const ReverseIterator = struct {
 /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
 fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
-    var idx: u32 = @intCast(index);
+    var idx: uoffset = @intCast(index);
    // Find the next lead byte:
    while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
    if (idx == string.len) return words.reverseIterator(string);
@@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever
 }
 fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
-    var idx: u32 = @intCast(index);
+    var idx: uoffset = @intCast(index);
    if (idx == string.len) {
        return .{
            .cp_iter = .{ .bytes = string, .i = idx },
@@ -746,6 +746,8 @@ const Allocator = mem.Allocator;
 const assert = std.debug.assert;
 const testing = std.testing;
+const uoffset = code_point.uoffset;
 const code_point = @import("code_point");
 const CodepointIterator = code_point.Iterator;
 const ReverseCodepointIterator = code_point.ReverseIterator;
diff --git a/src/code_point.zig b/src/code_point.zig
index 9a84080..8bd3d5b 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,12 +4,14 @@
 //! Represents invalid data according to the Replacement of Maximal
 //! Subparts algorithm.
+pub const uoffset = if (@import("config").fat_offset) u64 else u32;
 /// `CodePoint` represents a Unicode code point by its code,
 /// length, and offset in the source bytes.
 pub const CodePoint = struct {
    code: u21,
    len: u3,
-    offset: u32,
+    offset: uoffset,
    /// Return the slice of this codepoint, given the original string.
    pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
@@ -27,8 +29,8 @@ pub const CodePoint = struct {
 /// This function is deprecated and will be removed in a later release.
 /// Use `decodeAtIndex` or `decodeAtCursor`.
-pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
+pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
-    var off: u32 = 0;
+    var off: uoffset = 0;
    var maybe_code = decodeAtCursor(bytes, &off);
    if (maybe_code) |*code| {
        code.offset = offset;
@@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
 }
 /// Decode the CodePoint, if any, at `bytes[idx]`.
-pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint {
+pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint {
    var off = idx;
    return decodeAtCursor(bytes, &off);
 }
 /// Decode the CodePoint, if any, at `bytes[cursor.*]`.  After, the
 /// cursor will point at the next potential codepoint index.
-pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
+pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
    // EOS
    if (cursor.* >= bytes.len) return null;
@@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
 /// `Iterator` iterates a string one `CodePoint` at-a-time.
 pub const Iterator = struct {
    bytes: []const u8,
-    i: u32 = 0,
+    i: uoffset = 0,
    pub fn init(bytes: []const u8) Iterator {
        return .{ .bytes = bytes, .i = 0 };
@@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{
 pub const ReverseIterator = struct {
    bytes: []const u8,
-    i: ?u32,
+    i: ?uoffset,
    pub fn init(str: []const u8) ReverseIterator {
        var r_iter: ReverseIterator = undefined;
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 195fdcb..c463dcc 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" {
        defer all_bytes.deinit();
        var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
-        var bytes_index: u32 = 0;
+        var bytes_index: uoffset = 0;
        while (graphemes.next()) |field| {
            var code_points = std.mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
-            var cp_index: u32 = 0;
+            var cp_index: uoffset = 0;
            var gc_len: u8 = 0;
            while (code_points.next()) |code_point| {
@@ -231,12 +231,12 @@ test "Segmentation Word Iterator" {
        defer all_bytes.deinit();
        var words = std.mem.splitSequence(u8, line, " ÷ ");
-        var bytes_index: u32 = 0;
+        var bytes_index: uoffset = 0;
        while (words.next()) |field| {
            var code_points = std.mem.splitScalar(u8, field, ' ');
            var cp_buf: [4]u8 = undefined;
-            var cp_index: u32 = 0;
+            var cp_index: uoffset = 0;
            var gc_len: u8 = 0;
            while (code_points.next()) |code_point| {
@@ -425,6 +425,8 @@ const debug = std.debug;
 const testing = std.testing;
 const unicode = std.unicode;
+const uoffset = @FieldType(Word, "offset");
 const Grapheme = @import("Graphemes").Grapheme;
 const Graphemes = @import("Graphemes");
 const GraphemeIterator = @import("Graphemes").Iterator;
author	Sam Atman	2025-05-23 18:46:30 -0400
committer	Sam Atman	2025-05-23 18:46:30 -0400
commit	c9a1b3392973ee30e6a9a532f1da8605619b5b06 (patch)
tree	1198b2fcb544bcef9f634cf507d848d82548f00a
parent	Add iterateBefore and iterateAfter (diff)
download	zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.gz zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.xz zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.zip