From f4a174e27052e38aec09840e9195981cc2f24c88 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 19:01:57 -0400 Subject: Document "fat_offset" in README --- README.md | 21 +++++++++++++++++++++ build.zig | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d3899c..1da50f3 100644 --- a/README.md +++ b/README.md @@ -519,3 +519,24 @@ test "Scripts" { try expect(scripts.script('צ') == .Hebrew); } ``` + +## Limits + +Iterators, and fragment types such as `CodePoint`, `Grapheme` and `Word`, use a +`u32` to store the offset into a string, and the length of the fragment +(`CodePoint` uses a `u3` for length, actually). + +4GiB is a lot of string. There are a few reasons to work with that much +string, log files primarily, but fewer to bring it all into memory at once, and +practically no reason at all to do anything to such a string without breaking +it into smaller pieces to work with. + +Also, Zig compiles on 32-bit systems, where `usize` is 32 bits. Code running on +such systems has no choice but to handle slices in smaller pieces. In general, +if you want code to perform correctly when encountering multi-gigabyte +strings, you'll need to code for that, at a level one or two steps above that +in which you'll want to, for example, iterate some graphemes of that string. + +That all said, `zg` modules can be passed the Boolean config option +`fat_offset`, which will make all of those data structures use a `u64` instead. +You don't actually want to do this. But you can. 
diff --git a/build.zig b/build.zig index 648571b..ca0eeef 100644 --- a/build.zig +++ b/build.zig @@ -14,7 +14,7 @@ pub fn build(b: *std.Build) void { //| Options // Display width - const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; + const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2)") orelse false; const dwp_options = b.addOptions(); dwp_options.addOption(bool, "cjk", cjk); @@ -33,7 +33,7 @@ pub fn build(b: *std.Build) void { dwp_options.addOption(?i4, "c1_width", c1_width); //| Offset size - const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false; + const fat_offset = b.option(bool, "fat_offset", "Offsets in iterators and data structures will be u64") orelse false; const size_config = b.addOptions(); size_config.addOption(bool, "fat_offset", fat_offset); -- cgit v1.2.3