From 10048b0d31d0db923ae39c6bbd67139ed6252f6f Mon Sep 17 00:00:00 2001 From: Sam Atman Date: Wed, 30 Apr 2025 20:30:39 -0400 Subject: Allocation Failure Tests These turned up an excessive amount of allocations in CanonData and CompatData, which have been reduced to two through the somewhat squirrely use of 'magic numbers'. There are now allocation tests for every allocated structure in the library, and they run to completion in a reasonable amount of time. So, that's nice. --- src/CanonData.zig | 24 ++++++++++---- src/CaseFolding.zig | 15 +++++---- src/CompatData.zig | 20 +++++++++--- src/DisplayWidth.zig | 72 ++++++++++++++++++++++++++---------------- src/GeneralCategories.zig | 79 +++++++++++++++++++++++++++++------------------ src/LetterCasing.zig | 9 ++++++ src/Normalize.zig | 15 --------- src/Properties.zig | 9 ++++++ src/Scripts.zig | 9 ++++++ src/magic_numbers.zig | 15 +++++++++ src/unicode_tests.zig | 2 +- 11 files changed, 178 insertions(+), 91 deletions(-) create mode 100644 src/magic_numbers.zig (limited to 'src') diff --git a/src/CanonData.zig b/src/CanonData.zig index d95a5be..5d2332a 100644 --- a/src/CanonData.zig +++ b/src/CanonData.zig @@ -2,6 +2,7 @@ nfc: std.AutoHashMapUnmanaged([2]u21, u21), nfd: [][]u21 = undefined, +cps: []u21 = undefined, const CanonData = @This(); @@ -17,23 +18,29 @@ pub fn init(allocator: mem.Allocator) !CanonData { .nfc = .empty, .nfd = try allocator.alloc([]u21, 0x110000), }; - var _cp: u24 = undefined; + { + errdefer allocator.free(cdata.nfd); + cdata.cps = try allocator.alloc(u21, magic.canon_size); + } + + var total_cp: u24 = undefined; errdefer { cdata.nfc.deinit(allocator); - for (cdata.nfd[0.._cp]) |slice| allocator.free(slice); + allocator.free(cdata.cps); allocator.free(cdata.nfd); } @memset(cdata.nfd, &.{}); + var total_len: usize = 0; + while (true) { const len: u8 = try reader.readInt(u8, endian); if (len == 0) break; const cp = try reader.readInt(u24, endian); - _cp = cp; - const nfd_cp = try allocator.alloc(u21, len - 1); - errdefer allocator.free(nfd_cp); + total_cp = cp; + const nfd_cp = cdata.cps[total_len..][0 .. len - 1]; for (0..len - 1) |i| { nfd_cp[i] = @intCast(try reader.readInt(u24, endian)); } @@ -41,14 +48,17 @@ pub fn init(allocator: mem.Allocator) !CanonData { try cdata.nfc.put(allocator, nfd_cp[0..2].*, @intCast(cp)); } cdata.nfd[cp] = nfd_cp; + total_len += len - 1; } + if (comptime magic.print) std.debug.print("CanonData magic number: {d}\n", .{total_len}); + return cdata; } pub fn deinit(cdata: *CanonData, allocator: mem.Allocator) void { cdata.nfc.deinit(allocator); - for (cdata.nfd) |slice| allocator.free(slice); + allocator.free(cdata.cps); allocator.free(cdata.nfd); } @@ -66,3 +76,5 @@ const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; const mem = std.mem; +const magic = @import("magic"); +const options = @import("options"); diff --git a/src/CaseFolding.zig b/src/CaseFolding.zig index 2e53bfa..f63b860 100644 --- a/src/CaseFolding.zig +++ b/src/CaseFolding.zig @@ -310,14 +310,13 @@ fn testAllocations(allocator: Allocator) !void { } } -// test "Allocation Failures" { -// if (true) return error.SkipZigTest; // XXX: remove -// try testing.checkAllAllocationFailures( -// testing.allocator, -// testAllocations, -// .{}, -// ); -// } +test "Allocation Failures" { + try testing.checkAllAllocationFailures( + testing.allocator, + testAllocations, + .{}, + ); +} const std = @import("std"); const builtin = @import("builtin"); diff --git a/src/CompatData.zig b/src/CompatData.zig index d787103..794abca 100644 --- a/src/CompatData.zig +++ b/src/CompatData.zig @@ -1,6 +1,7 @@ //! Compatibility Data nfkd: [][]u21 = undefined, +cps: []u21 = undefined, const CompatData = @This(); @@ -15,27 +16,35 @@ pub fn init(allocator: mem.Allocator) !CompatData { var cpdata = CompatData{ .nfkd = try allocator.alloc([]u21, 0x110000), }; + { + errdefer allocator.free(cpdata.nfkd); + cpdata.cps = try allocator.alloc(u21, magic.compat_size); + } errdefer cpdata.deinit(allocator); @memset(cpdata.nfkd, &.{}); + var total_len: usize = 0; + while (true) { const len: u8 = try reader.readInt(u8, endian); if (len == 0) break; const cp = try reader.readInt(u24, endian); - cpdata.nfkd[cp] = try allocator.alloc(u21, len - 1); + const nk_s = cpdata.cps[total_len..][0 .. len - 1]; for (0..len - 1) |i| { - cpdata.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian)); + nk_s[i] = @intCast(try reader.readInt(u24, endian)); } + cpdata.nfkd[cp] = nk_s; + total_len += len - 1; } + if (comptime magic.print) std.debug.print("CompatData magic number: {d}", .{total_len}); + return cpdata; } pub fn deinit(cpdata: *const CompatData, allocator: mem.Allocator) void { - for (cpdata.nfkd) |slice| { - if (slice.len != 0) allocator.free(slice); - } + allocator.free(cpdata.cps); allocator.free(cpdata.nfkd); } @@ -48,3 +57,4 @@ const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; const mem = std.mem; +const magic = @import("magic"); diff --git a/src/DisplayWidth.zig b/src/DisplayWidth.zig index c0d6d96..4c63be4 100644 --- a/src/DisplayWidth.zig +++ b/src/DisplayWidth.zig @@ -1,27 +1,18 @@ -const std = @import("std"); -const builtin = @import("builtin"); -const options = @import("options"); -const ArrayList = std.ArrayList; -const compress = std.compress; -const mem = std.mem; -const simd = std.simd; -const testing = std.testing; - -const ascii = @import("ascii"); -const CodePointIterator = @import("code_point").Iterator; -pub const DisplayWidthData = @import("DisplayWidthData"); +//! Display Width module +//! +//! Answers questions about the printable width in monospaced fonts of the +//! string of interest. -const Graphemes = @import("Graphemes"); - -graphemes: Graphemes, +graphemes: Graphemes = undefined, s1: []u16 = undefined, s2: []i4 = undefined, -owns_graphemes: bool, +owns_graphemes: bool = true, const DisplayWidth = @This(); -pub fn init(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { - var dw: DisplayWidth = try DisplayWidth.setup(allocator); +pub fn init(allocator: Allocator) Allocator.Error!DisplayWidth { + var dw = DisplayWidth{}; + try dw.setup(allocator); errdefer { allocator.free(dw.s1); allocator.free(dw.s2); @@ -32,15 +23,16 @@ pub fn init(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { return dw; } -pub fn initWithGraphemes(allocator: mem.Allocator, graphemes: Graphemes) mem.Allocator.Error!DisplayWidth { - var dw = try DisplayWidth.setup(allocator); +pub fn initWithGraphemes(allocator: Allocator, graphemes: Graphemes) Allocator.Error!DisplayWidth { + var dw = DisplayWidth{}; + try dw.setup(allocator); dw.graphemes = graphemes; dw.owns_graphemes = false; return dw; } // Sets up the DisplayWidthData, leaving the GraphemeData undefined. -fn setup(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { +fn setup(dw: *DisplayWidth, allocator: Allocator) Allocator.Error!void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("dwp"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -49,8 +41,6 @@ fn setup(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { const endian = builtin.cpu.arch.endian(); - var dw: DisplayWidth = undefined; - const stage_1_len: u16 = reader.readInt(u16, endian) catch unreachable; dw.s1 = try allocator.alloc(u16, stage_1_len); errdefer allocator.free(dw.s1); @@ -60,11 +50,9 @@ fn setup(allocator: mem.Allocator) mem.Allocator.Error!DisplayWidth { dw.s2 = try allocator.alloc(i4, stage_2_len); errdefer allocator.free(dw.s2); for (0..stage_2_len) |i| dw.s2[i] = @intCast(reader.readInt(i8, endian) catch unreachable); - - return dw; } -pub fn deinit(dw: *const DisplayWidth, allocator: mem.Allocator) void { +pub fn deinit(dw: *const DisplayWidth, allocator: Allocator) void { allocator.free(dw.s1); allocator.free(dw.s2); if (dw.owns_graphemes) dw.graphemes.deinit(allocator); @@ -445,3 +433,35 @@ test "wrap" { const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!"; try testing.expectEqualStrings(want, got); } + +fn testAllocation(allocator: Allocator) !void { + { + var dw = try DisplayWidth.init(allocator); + dw.deinit(allocator); + } + { + var graph = try Graphemes.init(allocator); + defer graph.deinit(allocator); + var dw = try DisplayWidth.initWithGraphemes(allocator, graph); + dw.deinit(allocator); + } +} + +test "allocation test" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocation, .{}); +} + +const std = @import("std"); +const builtin = @import("builtin"); +const options = @import("options"); +const ArrayList = std.ArrayList; +const compress = std.compress; +const mem = std.mem; +const Allocator = mem.Allocator; +const simd = std.simd; +const testing = std.testing; + +const ascii = @import("ascii"); +const CodePointIterator = @import("code_point").Iterator; + +const Graphemes = @import("Graphemes"); diff --git a/src/GeneralCategories.zig b/src/GeneralCategories.zig index b7c82c0..3e76d82 100644 --- a/src/GeneralCategories.zig +++ b/src/GeneralCategories.zig @@ -46,7 +46,16 @@ pub fn init(allocator: Allocator) Allocator.Error!GeneralCategories { return gencat; } -pub fn setup(self: *GeneralCategories, allocator: Allocator) Allocator.Error!void { +pub fn setup(gencat: *GeneralCategories, allocator: Allocator) Allocator.Error!void { + gencat.setupInner(allocator) catch |err| { + switch (err) { + error.OutOfMemory => |e| return e, + else => unreachable, + } + }; +} + +inline fn setupInner(gencat: *GeneralCategories, allocator: Allocator) !void { const decompressor = compress.flate.inflate.decompressor; const in_bytes = @embedFile("gencat"); var in_fbs = std.io.fixedBufferStream(in_bytes); @@ -56,35 +65,35 @@ pub fn setup(self: *GeneralCategories, allocator: Allocator) Allocator.Error!voi const endian = builtin.cpu.arch.endian(); const s1_len: u16 = reader.readInt(u16, endian) catch unreachable; - self.s1 = try allocator.alloc(u16, s1_len); - errdefer allocator.free(self.s1); - for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian); + gencat.s1 = try allocator.alloc(u16, s1_len); + errdefer allocator.free(gencat.s1); + for (0..s1_len) |i| gencat.s1[i] = try reader.readInt(u16, endian); const s2_len: u16 = reader.readInt(u16, endian) catch unreachable; - self.s2 = try allocator.alloc(u5, s2_len); - errdefer allocator.free(self.s2); - for (0..s2_len) |i| self.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable); + gencat.s2 = try allocator.alloc(u5, s2_len); + errdefer allocator.free(gencat.s2); + for (0..s2_len) |i| gencat.s2[i] = @intCast(reader.readInt(u8, endian) catch unreachable); const s3_len: u16 = reader.readInt(u8, endian) catch unreachable; - self.s3 = try allocator.alloc(u5, s3_len); - errdefer allocator.free(self.s3); - for (0..s3_len) |i| self.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable); + gencat.s3 = try allocator.alloc(u5, s3_len); + errdefer allocator.free(gencat.s3); + for (0..s3_len) |i| gencat.s3[i] = @intCast(reader.readInt(u8, endian) catch unreachable); } -pub fn deinit(self: *const GeneralCategories, allocator: mem.Allocator) void { - allocator.free(self.s1); - allocator.free(self.s2); - allocator.free(self.s3); +pub fn deinit(gencat: *const GeneralCategories, allocator: mem.Allocator) void { + allocator.free(gencat.s1); + allocator.free(gencat.s2); + allocator.free(gencat.s3); } /// Lookup the General Category for `cp`. -pub fn gc(self: GeneralCategories, cp: u21) Gc { - return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]); +pub fn gc(gencat: GeneralCategories, cp: u21) Gc { + return @enumFromInt(gencat.s3[gencat.s2[gencat.s1[cp >> 8] + (cp & 0xff)]]); } /// True if `cp` has an C general category. -pub fn isControl(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isControl(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Cc, .Cf, .Cn, @@ -96,8 +105,8 @@ pub fn isControl(self: GeneralCategories, cp: u21) bool { } /// True if `cp` has an L general category. -pub fn isLetter(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isLetter(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Ll, .Lm, .Lo, @@ -109,8 +118,8 @@ pub fn isLetter(self: GeneralCategories, cp: u21) bool { } /// True if `cp` has an M general category. -pub fn isMark(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isMark(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Mc, .Me, .Mn, @@ -120,8 +129,8 @@ pub fn isMark(self: GeneralCategories, cp: u21) bool { } /// True if `cp` has an N general category. -pub fn isNumber(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isNumber(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Nd, .Nl, .No, @@ -131,8 +140,8 @@ pub fn isNumber(self: GeneralCategories, cp: u21) bool { } /// True if `cp` has an P general category. -pub fn isPunctuation(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isPunctuation(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Pc, .Pd, .Pe, @@ -146,8 +155,8 @@ pub fn isPunctuation(self: GeneralCategories, cp: u21) bool { } /// True if `cp` has an S general category. -pub fn isSymbol(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isSymbol(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Sc, .Sk, .Sm, @@ -158,8 +167,8 @@ pub fn isSymbol(self: GeneralCategories, cp: u21) bool { } /// True if `cp` has an Z general category. -pub fn isSeparator(self: GeneralCategories, cp: u21) bool { - return switch (self.gc(cp)) { +pub fn isSeparator(gencat: GeneralCategories, cp: u21) bool { + return switch (gencat.gc(cp)) { .Zl, .Zp, .Zs, @@ -168,8 +177,18 @@ pub fn isSeparator(self: GeneralCategories, cp: u21) bool { }; } +fn testAllocator(allocator: Allocator) !void { + var gen_cat = try GeneralCategories.init(allocator); + gen_cat.deinit(allocator); +} + +test "Allocation failure" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); +} + const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; const mem = std.mem; +const testing = std.testing; const Allocator = mem.Allocator; diff --git a/src/LetterCasing.zig b/src/LetterCasing.zig index a7260b8..11a3e96 100644 --- a/src/LetterCasing.zig +++ b/src/LetterCasing.zig @@ -203,6 +203,15 @@ test "toLowerStr" { try testing.expectEqualStrings("hello, world 2112!", lowered); } +fn testAllocator(allocator: Allocator) !void { + var prop = try LetterCasing.init(allocator); + prop.deinit(allocator); +} + +test "Allocation failure" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); +} + const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; diff --git a/src/Normalize.zig b/src/Normalize.zig index 1500b4c..989ec29 100644 --- a/src/Normalize.zig +++ b/src/Normalize.zig @@ -657,21 +657,6 @@ test "isLatin1Only" { try testing.expect(!isLatin1Only(not_latin1_only)); } -// NOTE: These tests take way waaaaay too long to run, because -// the amount of allocations in a couple of the inflators is -// completely excessive and is also costing memory for metadata. -// I'm leaving this here for when I fix that. -// -// fn testAllocations(allocator: Allocator) !void { -// const norm = try Normalize.init(allocator); -// norm.deinit(allocator); -// } -// -// test "allocation failures" { -// if (true) return error.SkipZigTest; -// try testing.checkAllAllocationFailures(testing.allocator, testAllocations, .{}); -// } - const std = @import("std"); const debug = std.debug; const assert = debug.assert; diff --git a/src/Properties.zig b/src/Properties.zig index f7e57ec..73602a0 100644 --- a/src/Properties.zig +++ b/src/Properties.zig @@ -169,6 +169,15 @@ test "Props" { try testing.expect(!self.isDecimal('g')); } +fn testAllocator(allocator: Allocator) !void { + var prop = try Properties.init(allocator); + prop.deinit(allocator); +} + +test "Allocation failure" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); +} + const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; diff --git a/src/Scripts.zig b/src/Scripts.zig index f71a2b5..fd5fde9 100644 --- a/src/Scripts.zig +++ b/src/Scripts.zig @@ -233,6 +233,15 @@ test "script" { try testing.expectEqual(Script.Latin, self.script('A').?); } +fn testAllocator(allocator: Allocator) !void { + var prop = try Scripts.init(allocator); + prop.deinit(allocator); +} + +test "Allocation failure" { + try testing.checkAllAllocationFailures(testing.allocator, testAllocator, .{}); +} + const std = @import("std"); const builtin = @import("builtin"); const compress = std.compress; diff --git a/src/magic_numbers.zig b/src/magic_numbers.zig new file mode 100644 index 0000000..203bdfd --- /dev/null +++ b/src/magic_numbers.zig @@ -0,0 +1,15 @@ +//! 'Magic' numbers for codegen sizing +//! +//! These need to be updated for each Unicode version. + +// Whether to print the magic numbers +pub const print = false; + +// Don't want to crash while printing magic... +const fudge = if (print) 1000 else 0; + +// Number of codepoints in CanonData.zig +pub const canon_size: usize = 3127 + fudge; + +// Number of codepoitns in CompatData.zig +pub const compat_size: usize = 5612 + fudge; diff --git a/src/unicode_tests.zig b/src/unicode_tests.zig index 8b9069a..1c4b888 100644 --- a/src/unicode_tests.zig +++ b/src/unicode_tests.zig @@ -208,7 +208,7 @@ test "Segmentation GraphemeIterator" { // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items }); var iter = data.iterator(all_bytes.items); - // Chaeck. + // Check. for (want.items) |want_gc| { const got_gc = (iter.next()).?; try std.testing.expectEqualStrings( -- cgit v1.2.3