summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorGravatar Sam Atman2025-05-23 18:46:30 -0400
committerGravatar Sam Atman2025-05-23 18:46:30 -0400
commitc9a1b3392973ee30e6a9a532f1da8605619b5b06 (patch)
tree1198b2fcb544bcef9f634cf507d848d82548f00a /src
parentAdd iterateBefore and iterateAfter (diff)
downloadzg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.gz
zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.tar.xz
zg-c9a1b3392973ee30e6a9a532f1da8605619b5b06.zip
Make offset size configurable
Hopefully I can talk users out of taking advantage of this configuration, but I'll have better luck with that if it's available.
Diffstat (limited to 'src')
-rw-r--r-- src/Graphemes.zig 20
-rw-r--r-- src/Words.zig 14
-rw-r--r-- src/code_point.zig 16
-rw-r--r-- src/unicode_tests.zig 10
4 files changed, 34 insertions, 26 deletions
diff --git a/src/Graphemes.zig b/src/Graphemes.zig
index 0338c04..49fdbf3 100644
--- a/src/Graphemes.zig
+++ b/src/Graphemes.zig
@@ -5,9 +5,11 @@ const Allocator = mem.Allocator;
5const compress = std.compress; 5const compress = std.compress;
6const unicode = std.unicode; 6const unicode = std.unicode;
7 7
8const CodePoint = @import("code_point").CodePoint; 8const code_point = @import("code_point");
9const CodePointIterator = @import("code_point").Iterator; 9const CodePoint = code_point.CodePoint;
10const CodePointReverseIterator = @import("code_point").ReverseIterator; 10const CodePointIterator = code_point.Iterator;
11const CodePointReverseIterator = code_point.ReverseIterator;
12const uoffset = code_point.uoffset;
11 13
12s1: []u16 = undefined, 14s1: []u16 = undefined,
13s2: []u16 = undefined, 15s2: []u16 = undefined,
@@ -104,8 +106,8 @@ pub const Gbp = enum {
104 106
105/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes. 107/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
106pub const Grapheme = struct { 108pub const Grapheme = struct {
107 len: u32, 109 len: uoffset,
108 offset: u32, 110 offset: uoffset,
109 111
110 /// `bytes` returns the slice of bytes that correspond to 112 /// `bytes` returns the slice of bytes that correspond to
111 /// this grapheme cluster in `src`. 113 /// this grapheme cluster in `src`.
@@ -199,7 +201,7 @@ pub const ReverseIterator = struct {
199 /// Count of pending RI codepoints, it is an even number 201 /// Count of pending RI codepoints, it is an even number
200 ri_count: usize, 202 ri_count: usize,
201 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji 203 /// End of (Extend* ZWJ) sequence pending from failed GB11: !Emoji Extend* ZWJ x Emoji
202 extend_end: u32, 204 extend_end: uoffset,
203 }; 205 };
204 206
205 const Self = @This(); 207 const Self = @This();
@@ -219,7 +221,7 @@ pub const ReverseIterator = struct {
219 pub fn prev(self: *Self) ?Grapheme { 221 pub fn prev(self: *Self) ?Grapheme {
220 if (self.buf[1] == null) return null; 222 if (self.buf[1] == null) return null;
221 223
222 const grapheme_end: u32 = end: { 224 const grapheme_end: uoffset = end: {
223 const codepoint = self.buf[1].?; 225 const codepoint = self.buf[1].?;
224 226
225 switch (self.pending) { 227 switch (self.pending) {
@@ -270,7 +272,7 @@ pub const ReverseIterator = struct {
270 if (!state.hasIndic()) { 272 if (!state.hasIndic()) {
271 273
272 // BUF: [?Any, Extend | Linker] Consonant 274 // BUF: [?Any, Extend | Linker] Consonant
273 var indic_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; 275 var indic_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
274 276
275 indic: while (true) { 277 indic: while (true) {
276 if (self.buf[0] == null) { 278 if (self.buf[0] == null) {
@@ -321,7 +323,7 @@ pub const ReverseIterator = struct {
321 323
322 if (!state.hasXpic()) { 324 if (!state.hasXpic()) {
323 // BUF: [?Any, ZWJ] Emoji 325 // BUF: [?Any, ZWJ] Emoji
324 var emoji_offset: u32 = self.buf[1].?.offset + self.buf[1].?.len; 326 var emoji_offset: uoffset = self.buf[1].?.offset + self.buf[1].?.len;
325 327
326 // Look for previous Emoji 328 // Look for previous Emoji
327 emoji: while (true) { 329 emoji: while (true) {
diff --git a/src/Words.zig b/src/Words.zig
index 1d10b2a..1707881 100644
--- a/src/Words.zig
+++ b/src/Words.zig
@@ -53,8 +53,8 @@ pub fn deinit(words: *const Words, allocator: mem.Allocator) void {
53/// Represents a Unicode word span, as an offset into the source string 53/// Represents a Unicode word span, as an offset into the source string
54/// and the length of the word. 54/// and the length of the word.
55pub const Word = struct { 55pub const Word = struct {
56 offset: u32, 56 offset: uoffset,
57 len: u32, 57 len: uoffset,
58 58
59 /// Returns a slice of the word given the source string. 59 /// Returns a slice of the word given the source string.
60 pub fn bytes(word: Word, src: []const u8) []const u8 { 60 pub fn bytes(word: Word, src: []const u8) []const u8 {
@@ -183,7 +183,7 @@ pub const Iterator = struct {
183 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset }; 183 if (iter.that == null) return Word{ .len = iter.this.?.len, .offset = iter.this.?.offset };
184 184
185 const word_start = iter.this.?.offset; 185 const word_start = iter.this.?.offset;
186 var word_len: u32 = 0; 186 var word_len: uoffset = 0;
187 187
188 // State variables. 188 // State variables.
189 var last_p: WordBreakProperty = .none; 189 var last_p: WordBreakProperty = .none;
@@ -364,7 +364,7 @@ pub const ReverseIterator = struct {
364 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 }; 364 if (iter.before == null) return Word{ .len = iter.after.?.len, .offset = 0 };
365 365
366 const word_end = iter.after.?.offset + iter.after.?.len; 366 const word_end = iter.after.?.offset + iter.after.?.len;
367 var word_len: u32 = 0; 367 var word_len: uoffset = 0;
368 368
369 // State variables. 369 // State variables.
370 var last_p: WordBreakProperty = .none; 370 var last_p: WordBreakProperty = .none;
@@ -518,7 +518,7 @@ pub const ReverseIterator = struct {
518 518
519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`. 519/// Initialize a ReverseIterator at the provided index. Used in `wordAtIndex`.
520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator { 520fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) ReverseIterator {
521 var idx: u32 = @intCast(index); 521 var idx: uoffset = @intCast(index);
522 // Find the next lead byte: 522 // Find the next lead byte:
523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {} 523 while (idx < string.len and 0x80 <= string[idx] and string[idx] <= 0xBf) : (idx += 1) {}
524 if (idx == string.len) return words.reverseIterator(string); 524 if (idx == string.len) return words.reverseIterator(string);
@@ -537,7 +537,7 @@ fn reverseFromIndex(words: *const Words, string: []const u8, index: usize) Rever
537} 537}
538 538
539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator { 539fn forwardFromIndex(words: *const Words, string: []const u8, index: usize) Iterator {
540 var idx: u32 = @intCast(index); 540 var idx: uoffset = @intCast(index);
541 if (idx == string.len) { 541 if (idx == string.len) {
542 return .{ 542 return .{
543 .cp_iter = .{ .bytes = string, .i = idx }, 543 .cp_iter = .{ .bytes = string, .i = idx },
@@ -746,6 +746,8 @@ const Allocator = mem.Allocator;
746const assert = std.debug.assert; 746const assert = std.debug.assert;
747const testing = std.testing; 747const testing = std.testing;
748 748
749const uoffset = code_point.uoffset;
750
749const code_point = @import("code_point"); 751const code_point = @import("code_point");
750const CodepointIterator = code_point.Iterator; 752const CodepointIterator = code_point.Iterator;
751const ReverseCodepointIterator = code_point.ReverseIterator; 753const ReverseCodepointIterator = code_point.ReverseIterator;
diff --git a/src/code_point.zig b/src/code_point.zig
index 9a84080..8bd3d5b 100644
--- a/src/code_point.zig
+++ b/src/code_point.zig
@@ -4,12 +4,14 @@
4//! Represents invalid data according to the Replacement of Maximal 4//! Represents invalid data according to the Replacement of Maximal
5//! Subparts algorithm. 5//! Subparts algorithm.
6 6
7pub const uoffset = if (@import("config").fat_offset) u64 else u32;
8
7/// `CodePoint` represents a Unicode code point by its code, 9/// `CodePoint` represents a Unicode code point by its code,
8/// length, and offset in the source bytes. 10/// length, and offset in the source bytes.
9pub const CodePoint = struct { 11pub const CodePoint = struct {
10 code: u21, 12 code: u21,
11 len: u3, 13 len: u3,
12 offset: u32, 14 offset: uoffset,
13 15
14 /// Return the slice of this codepoint, given the original string. 16 /// Return the slice of this codepoint, given the original string.
15 pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 { 17 pub inline fn bytes(cp: CodePoint, str: []const u8) []const u8 {
@@ -27,8 +29,8 @@ pub const CodePoint = struct {
27 29
28/// This function is deprecated and will be removed in a later release. 30/// This function is deprecated and will be removed in a later release.
29/// Use `decodeAtIndex` or `decodeAtCursor`. 31/// Use `decodeAtIndex` or `decodeAtCursor`.
30pub fn decode(bytes: []const u8, offset: u32) ?CodePoint { 32pub fn decode(bytes: []const u8, offset: uoffset) ?CodePoint {
31 var off: u32 = 0; 33 var off: uoffset = 0;
32 var maybe_code = decodeAtCursor(bytes, &off); 34 var maybe_code = decodeAtCursor(bytes, &off);
33 if (maybe_code) |*code| { 35 if (maybe_code) |*code| {
34 code.offset = offset; 36 code.offset = offset;
@@ -38,14 +40,14 @@ pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
38} 40}
39 41
40/// Decode the CodePoint, if any, at `bytes[idx]`. 42/// Decode the CodePoint, if any, at `bytes[idx]`.
41pub fn decodeAtIndex(bytes: []const u8, idx: u32) ?CodePoint { 43pub fn decodeAtIndex(bytes: []const u8, idx: uoffset) ?CodePoint {
42 var off = idx; 44 var off = idx;
43 return decodeAtCursor(bytes, &off); 45 return decodeAtCursor(bytes, &off);
44} 46}
45 47
46/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the 48/// Decode the CodePoint, if any, at `bytes[cursor.*]`. After, the
47/// cursor will point at the next potential codepoint index. 49/// cursor will point at the next potential codepoint index.
48pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint { 50pub fn decodeAtCursor(bytes: []const u8, cursor: *uoffset) ?CodePoint {
49 // EOS 51 // EOS
50 if (cursor.* >= bytes.len) return null; 52 if (cursor.* >= bytes.len) return null;
51 53
@@ -161,7 +163,7 @@ pub fn decodeAtCursor(bytes: []const u8, cursor: *u32) ?CodePoint {
161/// `Iterator` iterates a string one `CodePoint` at-a-time. 163/// `Iterator` iterates a string one `CodePoint` at-a-time.
162pub const Iterator = struct { 164pub const Iterator = struct {
163 bytes: []const u8, 165 bytes: []const u8,
164 i: u32 = 0, 166 i: uoffset = 0,
165 167
166 pub fn init(bytes: []const u8) Iterator { 168 pub fn init(bytes: []const u8) Iterator {
167 return .{ .bytes = bytes, .i = 0 }; 169 return .{ .bytes = bytes, .i = 0 };
@@ -257,7 +259,7 @@ const class_mask: [12]u8 = .{
257 259
258pub const ReverseIterator = struct { 260pub const ReverseIterator = struct {
259 bytes: []const u8, 261 bytes: []const u8,
260 i: ?u32, 262 i: ?uoffset,
261 263
262 pub fn init(str: []const u8) ReverseIterator { 264 pub fn init(str: []const u8) ReverseIterator {
263 var r_iter: ReverseIterator = undefined; 265 var r_iter: ReverseIterator = undefined;
diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig
index 195fdcb..c463dcc 100644
--- a/src/unicode_tests.zig
+++ b/src/unicode_tests.zig
@@ -141,12 +141,12 @@ test "Segmentation GraphemeIterator" {
141 defer all_bytes.deinit(); 141 defer all_bytes.deinit();
142 142
143 var graphemes = std.mem.splitSequence(u8, line, " ÷ "); 143 var graphemes = std.mem.splitSequence(u8, line, " ÷ ");
144 var bytes_index: u32 = 0; 144 var bytes_index: uoffset = 0;
145 145
146 while (graphemes.next()) |field| { 146 while (graphemes.next()) |field| {
147 var code_points = std.mem.splitScalar(u8, field, ' '); 147 var code_points = std.mem.splitScalar(u8, field, ' ');
148 var cp_buf: [4]u8 = undefined; 148 var cp_buf: [4]u8 = undefined;
149 var cp_index: u32 = 0; 149 var cp_index: uoffset = 0;
150 var gc_len: u8 = 0; 150 var gc_len: u8 = 0;
151 151
152 while (code_points.next()) |code_point| { 152 while (code_points.next()) |code_point| {
@@ -231,12 +231,12 @@ test "Segmentation Word Iterator" {
231 defer all_bytes.deinit(); 231 defer all_bytes.deinit();
232 232
233 var words = std.mem.splitSequence(u8, line, " ÷ "); 233 var words = std.mem.splitSequence(u8, line, " ÷ ");
234 var bytes_index: u32 = 0; 234 var bytes_index: uoffset = 0;
235 235
236 while (words.next()) |field| { 236 while (words.next()) |field| {
237 var code_points = std.mem.splitScalar(u8, field, ' '); 237 var code_points = std.mem.splitScalar(u8, field, ' ');
238 var cp_buf: [4]u8 = undefined; 238 var cp_buf: [4]u8 = undefined;
239 var cp_index: u32 = 0; 239 var cp_index: uoffset = 0;
240 var gc_len: u8 = 0; 240 var gc_len: u8 = 0;
241 241
242 while (code_points.next()) |code_point| { 242 while (code_points.next()) |code_point| {
@@ -425,6 +425,8 @@ const debug = std.debug;
425const testing = std.testing; 425const testing = std.testing;
426const unicode = std.unicode; 426const unicode = std.unicode;
427 427
428const uoffset = @FieldType(Word, "offset");
429
428const Grapheme = @import("Graphemes").Grapheme; 430const Grapheme = @import("Graphemes").Grapheme;
429const Graphemes = @import("Graphemes"); 431const Graphemes = @import("Graphemes");
430const GraphemeIterator = @import("Graphemes").Iterator; 432const GraphemeIterator = @import("Graphemes").Iterator;