From c9a1b3392973ee30e6a9a532f1da8605619b5b06 Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Fri, 23 May 2025 18:46:30 -0400 Subject: Make offset size configurable Hopefully I can talk users out of taking advantage of this configuration but I'll have better luck with that if it's available. --- build.zig | 54 ++++++++++++++++++++++++++++++--------------------- src/Graphemes.zig | 20 ++++++++++--------- src/Words.zig | 14 +++++++------ src/code_point.zig | 16 ++++++++------- src/unicode_tests.zig | 10 ++++++---- 5 files changed, 66 insertions(+), 48 deletions(-) diff --git a/build.zig b/build.zig index 8cfa039..648571b 100644 --- a/build.zig +++ b/build.zig @@ -11,7 +11,34 @@ pub fn build(b: *std.Build) void { .optimize = optimize, }); - // Code generation + //| Options + + // Display width + const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; + const dwp_options = b.addOptions(); + dwp_options.addOption(bool, "cjk", cjk); + + // Visible Controls + const c0_width = b.option( + i4, + "c0_width", + "C0 controls have this width (default: 0, default -1)", + ); + dwp_options.addOption(?i4, "c0_width", c0_width); + const c1_width = b.option( + i4, + "c1_width", + "C1 controls have this width (default: 0)", + ); + dwp_options.addOption(?i4, "c1_width", c1_width); + + //| Offset size + const fat_offset = b.option(bool, "fat_offset", "Offsets in Iterators and data structures will be u64") orelse false; + const size_config = b.addOptions(); + size_config.addOption(bool, "fat_offset", fat_offset); + + //| Code generation + // Grapheme break const gbp_gen_exe = b.addExecutable(.{ .name = "gbp", @@ -31,32 +58,13 @@ pub fn build(b: *std.Build) void { const run_wbp_gen_exe = b.addRunArtifact(wbp_gen_exe); const wbp_gen_out = run_wbp_gen_exe.addOutputFileArg("wbp.bin.z"); - // Display width - const cjk = b.option(bool, "cjk", "Ambiguous code points are wide (display width: 2).") orelse false; - const options = b.addOptions(); - options.addOption(bool, "cjk", cjk); - - // Visible Controls - const c0_width = b.option( - i4, - "c0_width", - "C0 controls have this width (default: 0, default -1)", - ); - options.addOption(?i4, "c0_width", c0_width); - const c1_width = b.option( - i4, - "c1_width", - "C1 controls have this width (default: 0)", - ); - options.addOption(?i4, "c1_width", c1_width); - const dwp_gen_exe = b.addExecutable(.{ .name = "dwp", .root_source_file = b.path("codegen/dwp.zig"), .target = b.graph.host, .optimize = .Debug, }); - dwp_gen_exe.root_module.addOptions("options", options); + dwp_gen_exe.root_module.addOptions("options", dwp_options); const run_dwp_gen_exe = b.addRunArtifact(dwp_gen_exe); const dwp_gen_out = run_dwp_gen_exe.addOutputFileArg("dwp.bin.z"); @@ -199,6 +207,7 @@ pub fn build(b: *std.Build) void { .target = target, .optimize = optimize, }); + code_point.addOptions("config", size_config); const code_point_t = b.addTest(.{ .name = "code_point", @@ -216,6 +225,7 @@ pub fn build(b: *std.Build) void { }); graphemes.addAnonymousImport("gbp", .{ .root_source_file = gbp_gen_out }); graphemes.addImport("code_point", code_point); + graphemes.addOptions("config", size_config); const grapheme_t = b.addTest(.{ .name = "Graphemes", @@ -267,7 +277,7 @@ pub fn build(b: *std.Build) void { display_width.addImport("ascii", ascii); display_width.addImport("code_point", code_point); display_width.addImport("Graphemes", graphemes); - display_width.addOptions("options", options); // For testing + display_width.addOptions("options", dwp_options); // For testing const display_width_t = b.addTest(.{ .name = "display_width", diff --git a/src/Graphemes.zig b/src/Graphemes.zig index 0338c04..49fdbf3 100644 --- a/src/Graphemes.zig +++ b/src/Graphemes.zig @@ -5,9 +5,11 @@ const Allocator = mem.Allocator; const compress = std.compress; const unicode = std.unicode; -const CodePoint = @import("code_point").CodePoint; -const CodePointIterator = @import("code_point").Iterator; -const CodePointReverseIterator = @import("code_point").ReverseIterator; +const code_point = @import("code_point"); +const CodePoint = code_point.CodePoint; +const CodePointIterator = code_point.Iterator; +const CodePointReverseIterator = code_point.ReverseIterator; +const uoffset = code_point.uoffset; s1: []u16 = undefined, s2: []u16 = undefined, @@ -104,8 +106,8 @@ pub const Gbp = enum { /// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. pub const Grapheme = struct { - len: u32, - offset: u32, + len: uoffset, + offset: uoffset, /// `bytes` returns the slice of bytes that correspond to /// this grapheme cluster in `src`. @@ -199,7 +201,7 @@ pub const ReverseIterator = struct { /// Count of pending RI codepoints, it is an even number ri_count: usize, /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji - extend_end: u32, + extend_end: uoffset, }; const Self = @This(); @@ -219,7 +221,7 @@ pub const ReverseIterator = struct { pub fn prev(self: *Self) ?Grapheme { if (self.buf[1] == null) return null; - const grapheme_end: u32 = end: { + const grapheme_end: uoffset = end: { const codepoint = self.buf[1].?; switch (self.pending) { @@ -270,7 +272,7 @@ pub const ReverseIterator = struct { if (!state.hasIndic()) { // BUF: [?Any, Extend | Linker] Consonant - var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; indic: while (true) { if (self.buf[0] == null) { @@ -321,7 +323,7 @@ pub const ReverseIterator = struct { if (!state.hasXpic()) { // BUF: [?Any, ZWJ] Emoji - var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; + var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len; // Look for previous Emoji emoji: while (true) { diff --git a/src/Words.zig b/src/Words.zig index 1d10b2a..1707881 100644 --- a/src/Words.zig +++ b/src/Words.zig @@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void { /// Represents a Unicode word span, as an offset into the source string /// and the length of the word. pub const Word = struct { - offset: u32, - len: u32, + offset: uoffset, + len: uoffset, /// Returns a slice of the word given the source string. pub fn bytes(word: Word, src: []const u8) []const u8 { @@ -183,7 +183,7 @@ pub const Iterator = struct { if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; const word_start = iter.this.?.offset; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -364,7 +364,7 @@ pub const ReverseIterator = struct { if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; const word_end = iter.after.?.offset + iter.after.?.len; - var word_len: u32 = 0; + var word_len: uoffset = 0; // State variables. var last_p: WordBreakProperty = .none; @@ -518,7 +518,7 @@ pub const ReverseIterator = struct { /// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); // Find the next lead byte: while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} if (idx == string.len) return words.reverseIterator(string); @@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever } fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { - var idx: u32 = @intCast(index); + var idx: uoffset = @intCast(index); if (idx == string.len) { return .{ .cp_iter = .{ .bytes = string, .i = idx }, @@ -746,6 +746,8 @@ const Allocator = mem.Allocator; const assert = std.debug.assert; const testing = std.testing; +const uoffset = code_point.uoffset; + const code_point = @import("code_point"); const CodepointIterator = code_point.Iterator; const ReverseCodepointIterator = code_point.ReverseIterator; diff --git a/src/code_point.zig b/src/code_point.zig index 9a84080..8bd3d5b 100644 --- a/src/code_point.zig +++ b/src/code_point.zig @@ -4,12 +4,14 @@ //! Represents invalid data according to the Replacement of Maximal //! Subparts algorithm. +pub const uoffset = if (@import("config").fat_offset) u64 else u32; + /// `CodePoint` represents a Unicode code point by its code, /// length, and offset in the source bytes. pub const CodePoint = struct { code: u21, len: u3, - offset: u32, + offset: uoffset, /// Return the slice of this codepoint, given the original string. pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { @@ -27,8 +29,8 @@ pub const CodePoint = struct { /// This function is deprecated and will be removed in a later release. /// Use `decodeAtIndex` or `decodeAtCursor`. -pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { - var off: u32 = 0; +pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint { + var off: uoffset = 0; var maybe_code = decodeAtCursor(bytes, &off); if (maybe_code) |*code| { code.offset = offset; @@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { } /// Decode the CodePoint, if any, at `bytes[idx]`. -pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { +pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint { var off = idx; return decodeAtCursor(bytes, &off); } /// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the /// cursor will point at the next potential codepoint index. -pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { +pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint { // EOS if (cursor.* >= bytes.len) return null; @@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { /// `Iterator` iterates a string one `CodePoint` at-a-time. pub const Iterator = struct { bytes: []const u8, - i: u32 = 0, + i: uoffset = 0, pub fn init(bytes: []const u8) Iterator { return .{ .bytes = bytes, .i = 0 }; @@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{ pub const ReverseIterator = struct { bytes: []const u8, - i: ?u32, + i: ?uoffset, pub fn init(str: []const u8) ReverseIterator { var r_iter: ReverseIterator = undefined; diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 195fdcb..c463dcc 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" { defer all_bytes.deinit(); var graphemes = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (graphemes.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -231,12 +231,12 @@ test "Segmentation Word Iterator" { defer all_bytes.deinit(); var words = std.mem.splitSequence(u8, line, " ÷ "); - var bytes_index: u32 = 0; + var bytes_index: uoffset = 0; while (words.next()) |field| { var code_points = std.mem.splitScalar(u8, field, ' '); var cp_buf: [4]u8 = undefined; - var cp_index: u32 = 0; + var cp_index: uoffset = 0; var gc_len: u8 = 0; while (code_points.next()) |code_point| { @@ -425,6 +425,8 @@ const debug = std.debug; const testing = std.testing; const unicode = std.unicode; +const uoffset = @FieldType(Word, "offset"); + const Grapheme = @import("Graphemes").Grapheme; const Graphemes = @import("Graphemes"); const GraphemeIterator = @import("Graphemes").Iterator; -- cgit v1.2.3