malloc() is general-purpose and slow: system calls, fragmentation, thread synchronization, bookkeeping overhead. Custom allocators exploit known allocation patterns for 10-100x speedups in allocation-heavy code.
General-purpose allocators pay for flexibility you don't need:
// malloc() costs
Thread synchronization: ~50-200 cycles (contended)
Free list traversal: ~10-50 cycles per search
Metadata overhead: 16-32 bytes per allocation
System calls: ~1000+ cycles (sbrk/mmap)
Fragmentation: Memory waste + cache pollution
// Pathological case: many small allocations
for (int i = 0; i < 1000000; i++) {
char* ptr = (char*)malloc(16); // Tiny allocation
// Use ptr...
free(ptr); // Immediate deallocation
}
// Result: malloc overhead dominates useful work
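A quick way to see this is to time the loop and compare against one of the allocators below. A rough timing sketch, not a rigorous benchmark (results vary by libc allocator and platform):
// Rough timing sketch using <chrono>; the memory is touched so the
// malloc/free pair is less likely to be optimized away entirely
#include <chrono>
#include <cstdio>
#include <cstdlib>
int main() {
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < 1000000; i++) {
char* ptr = (char*)malloc(16);
ptr[0] = 1; // Touch the allocation
free(ptr);
}
auto end = std::chrono::steady_clock::now();
auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
printf("malloc/free pair: %.1f ns on average\n", ns / 1000000.0);
}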
LIFO allocation pattern—allocation is just a pointer increment:
class StackAllocator {
char* memory;
size_t size;
size_t offset = 0;
public:
StackAllocator(size_t size) : size(size) {
memory = (char*)aligned_alloc(64, size); // 64-byte aligned
}
~StackAllocator() { free(memory); }
template<typename T>
T* allocate(size_t count = 1) {
size_t bytes = count * sizeof(T);
// Align the current offset to the type's alignment, not just the size
size_t aligned_offset = (offset + alignof(T) - 1) & ~(alignof(T) - 1);
if (aligned_offset + bytes > size) return nullptr; // OOM
T* result = reinterpret_cast<T*>(memory + aligned_offset);
offset = aligned_offset + bytes;
return result;
}
void reset() { offset = 0; } // Reset entire allocator - O(1)
size_t bytes_used() const { return offset; }
};
// Usage pattern
void render_frame() {
StackAllocator frame_allocator(1 << 20); // 1MB stack
// Allocate temporary objects
Vertex* vertices = frame_allocator.allocate<Vertex>(1000);
Matrix* transforms = frame_allocator.allocate<Matrix>(100);
// Process frame...
// All memory freed automatically at function exit
// No individual free() calls needed
}
Fixed-size allocation—O(1) alloc/free with perfect cache locality:
template<typename T, size_t PoolSize>
class PoolAllocator {
union FreeNode {
T object; // When allocated
FreeNode* next; // When free
};
// Note: the union requires T to be trivially constructible/destructible
alignas(64) FreeNode pool[PoolSize]; // Cache-line aligned
FreeNode* free_head = nullptr;
size_t allocated_count = 0;
public:
PoolAllocator() {
// Initialize free list
for (size_t i = 0; i < PoolSize - 1; i++) {
pool[i].next = &pool[i + 1];
}
pool[PoolSize - 1].next = nullptr;
free_head = &pool[0];
}
T* allocate() {
if (!free_head) return nullptr; // Pool exhausted
FreeNode* node = free_head;
free_head = free_head->next;
allocated_count++;
return reinterpret_cast<T*>(node); // O(1) allocation
}
void deallocate(T* ptr) {
FreeNode* node = reinterpret_cast<FreeNode*>(ptr);
node->next = free_head;
free_head = node;
allocated_count--; // O(1) deallocation
}
bool empty() const { return allocated_count == 0; }
bool full() const { return allocated_count == PoolSize; }
};
// Perfect for fixed-size objects (pool sizes here are illustrative)
PoolAllocator<GameObject, 1024> game_objects;
PoolAllocator<Particle, 4096> particles;
GameObject* obj = game_objects.allocate();
// ... use object ...
game_objects.deallocate(obj);
Bump pointer allocation—extremely fast, bulk deallocation:
class ArenaAllocator {
char* memory;
size_t size;
size_t offset = 0;
public:
ArenaAllocator(size_t size) : size(size) {
memory = (char*)mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}
~ArenaAllocator() { munmap(memory, size); }
void* allocate(size_t bytes, size_t alignment = sizeof(void*)) {
// Align offset to required boundary
size_t aligned_offset = (offset + alignment - 1) & ~(alignment - 1);
if (aligned_offset + bytes > size) return nullptr;
void* result = memory + aligned_offset;
offset = aligned_offset + bytes;
return result;
}
template<typename T>
T* allocate(size_t count = 1) {
return static_cast<T*>(allocate(count * sizeof(T), alignof(T)));
}
void clear() { offset = 0; } // Free all allocations - O(1)
// Create checkpoint for partial resets
size_t checkpoint() const { return offset; }
void revert(size_t checkpoint) { offset = checkpoint; }
};
// Usage: per-frame or per-level allocation
ArenaAllocator level_arena(64 << 20); // 64MB arena
void load_level() {
// All level data allocated from arena
Mesh* meshes = level_arena.allocate<Mesh>(1000);
Texture* textures = level_arena.allocate<Texture>(500);
// ... load level data ...
}
void unload_level() {
level_arena.clear(); // Free entire level instantly
}
Variable-size allocation with explicit free list management:
class FreeListAllocator {
struct FreeBlock {
size_t size;
FreeBlock* next;
};
char* memory;
size_t total_size;
FreeBlock* free_list = nullptr;
public:
FreeListAllocator(size_t size) : total_size(size) {
memory = (char*)aligned_alloc(64, size);
// Initialize with single large free block
free_list = reinterpret_cast<FreeBlock*>(memory);
free_list->size = size - sizeof(size_t); // Save space for header
free_list->next = nullptr;
}
void* allocate(size_t bytes) {
bytes = (bytes + 7) & ~7; // 8-byte alignment
bytes += sizeof(size_t); // Space for size header
FreeBlock** current = &free_list;
while (*current) {
if ((*current)->size >= bytes) {
FreeBlock* block = *current;
// Remove from free list
*current = block->next;
// Split block if much larger than needed
if (block->size >= bytes + sizeof(FreeBlock) + 64) {
FreeBlock* remainder = reinterpret_cast<FreeBlock*>(
(char*)block + bytes);
remainder->size = block->size - bytes;
remainder->next = free_list;
free_list = remainder;
block->size = bytes;
}
// Store size in header, return user pointer
*(size_t*)block = bytes;
return (char*)block + sizeof(size_t);
}
current = &(*current)->next;
}
return nullptr; // Out of memory
}
void deallocate(void* ptr) {
if (!ptr) return;
char* block_start = (char*)ptr - sizeof(size_t);
size_t block_size = *(size_t*)block_start;
// Add to free list
FreeBlock* free_block = reinterpret_cast<FreeBlock*>(block_start);
free_block->size = block_size;
free_block->next = free_list;
free_list = free_block;
// TODO: Coalesce adjacent free blocks
}
};
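The TODO in deallocate() matters in practice: without coalescing, the free list degrades into blocks that are individually too small to reuse. A minimal sketch of one approach, keeping the free list sorted by address and merging neighbors on insertion, could look like this (member names match the class above; the sketch is illustrative, not a drop-in patch):
// Sketch: address-ordered insert with coalescing of adjacent blocks
void insert_and_coalesce(FreeBlock* block) {
// Keep the free list sorted by address so neighbors sit next to each other
FreeBlock* prev = nullptr;
FreeBlock* curr = free_list;
while (curr && curr < block) {
prev = curr;
curr = curr->next;
}
block->next = curr;
if (prev) prev->next = block;
else free_list = block;
// Merge with the following block if it starts exactly where this one ends
if (curr && (char*)block + block->size == (char*)curr) {
block->size += curr->size;
block->next = curr->next;
}
// Merge with the preceding block if this one starts exactly where it ends
if (prev && (char*)prev + prev->size == (char*)block) {
prev->size += block->size;
prev->next = block->next;
}
}
deallocate() would then call insert_and_coalesce(free_block) instead of pushing the block onto the head of the list.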
Circular allocation for streaming data:
class RingBufferAllocator {
char* memory;
size_t size;
size_t head = 0; // Allocation pointer
size_t tail = 0; // Deallocation pointer
public:
RingBufferAllocator(size_t size) : size(size) {
memory = (char*)aligned_alloc(64, size);
}
void* allocate(size_t bytes) {
bytes = (bytes + 7) & ~7; // 8-byte alignment
// Check if allocation fits
if (head >= tail) {
if (head + bytes <= size) {
void* result = memory + head;
head += bytes;
return result;
} else if (bytes <= tail) {
// Wrap around
head = bytes;
return memory;
}
} else {
if (head + bytes <= tail) {
void* result = memory + head;
head += bytes;
return result;
}
}
return nullptr; // Not enough space
}
void advance_tail(size_t bytes) {
bytes = (bytes + 7) & ~7;
tail = (tail + bytes) % size;
}
size_t used_bytes() const {
return head >= tail ? head - tail : size - (tail - head);
}
};
// Perfect for audio/network buffers
RingBufferAllocator audio_buffer(1 << 16); // 64KB ring
void process_audio_frame() {
float* samples = (float*)audio_buffer.allocate(1024 * sizeof(float));
// Process audio...
// Release consumed samples so the ring can reuse the space when it wraps
audio_buffer.advance_tail(1024 * sizeof(float));
}
Kernel-style allocator for frequently allocated objects:
template<typename T>
class SlabAllocator {
struct Slab {
static const size_t OBJECTS_PER_SLAB = 64;
alignas(64) T objects[OBJECTS_PER_SLAB];
uint64_t free_mask = ~0ULL; // Bitset: 1 = free, 0 = allocated
size_t free_count = OBJECTS_PER_SLAB;
Slab* next = nullptr;
};
Slab* partial_slabs = nullptr; // Slabs with some free objects
Slab* full_slabs = nullptr; // Slabs with no free objects
public:
T* allocate() {
if (!partial_slabs) {
// Allocate new slab
partial_slabs = new Slab();
}
Slab* slab = partial_slabs;
// Find first free bit
int free_index = __builtin_ctzll(slab->free_mask);
slab->free_mask &= ~(1ULL << free_index);
slab->free_count--;
// Move to full list if slab is now full
if (slab->free_count == 0) {
partial_slabs = slab->next;
slab->next = full_slabs;
full_slabs = slab;
}
return &slab->objects[free_index];
}
void deallocate(T* ptr) {
// Find which slab this object belongs to
// (this mask trick assumes slabs sit at power-of-two-aligned addresses;
// production slab allocators store a back-pointer or per-slab header instead)
Slab* slab = reinterpret_cast<Slab*>(
(uintptr_t)ptr & ~(sizeof(Slab) - 1));
// Calculate object index within slab
int index = ptr - slab->objects;
// Mark as free
slab->free_mask |= (1ULL << index);
slab->free_count++;
// Move from full to partial if needed
if (slab->free_count == 1) {
// Remove from full list, add to partial list
// ... list manipulation code ...
}
}
};
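For symmetry with the other allocators, a usage sketch (Particle is a placeholder type):
// Illustrative usage; Particle stands in for any small, frequently allocated type
SlabAllocator<Particle> particle_slab;
Particle* p = particle_slab.allocate(); // Grabs a free slot in a partial slab
// ... use particle ...
particle_slab.deallocate(p); // Returns the slot to its slab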
Direct system memory mapping for large allocations:
class MmapAllocator {
public:
static void* allocate(size_t bytes) {
// Round up to page size
size_t page_size = getpagesize();
bytes = (bytes + page_size - 1) & ~(page_size - 1);
void* ptr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED) return nullptr;
// Optional: advise kernel about usage patterns
madvise(ptr, bytes, MADV_SEQUENTIAL); // Sequential access
return ptr;
}
static void deallocate(void* ptr, size_t bytes) {
if (ptr) munmap(ptr, bytes);
}
// Huge page allocation for better TLB performance
// (MAP_HUGETLB needs huge pages reserved in advance, e.g. via vm.nr_hugepages)
static void* allocate_huge(size_t bytes) {
bytes = (bytes + (2 << 20) - 1) & ~((2 << 20) - 1); // Round size up to the 2MB huge-page size
void* ptr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
return ptr != MAP_FAILED ? ptr : nullptr;
}
};
// Large buffer allocation
void* big_buffer = MmapAllocator::allocate_huge(1 << 30); // 1GB with huge pages
Combine allocators for different allocation patterns:
class CompositeAllocator {
StackAllocator temp_stack{1 << 20}; // 1MB temp allocations
PoolAllocator<Entity, 1024> entities; // Game entities (pool size illustrative)
ArenaAllocator level_arena{64 << 20}; // 64MB level data
public:
// Route allocations based on usage pattern
template<typename T>
T* allocate_temp(size_t count = 1) {
return temp_stack.allocate<T>(count);
}
Entity* allocate_entity() {
return entities.allocate();
}
template<typename T>
T* allocate_level_data(size_t count = 1) {
return level_arena.allocate<T>(count);
}
void frame_reset() {
temp_stack.reset(); // Clear temporary allocations
}
void level_reset() {
level_arena.clear(); // Clear level data
// Entities persist across levels
}
};
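A sketch of how the routing plays out in practice (the Tile type and the callback names are illustrative):
// Illustrative usage of CompositeAllocator
CompositeAllocator allocator;
void on_level_load() {
Entity* boss = allocator.allocate_entity(); // Pooled, survives frame resets
Tile* tiles = allocator.allocate_level_data<Tile>(10000); // Freed on level_reset()
}
void on_frame() {
float* scratch = allocator.allocate_temp<float>(4096); // Valid until frame_reset()
// ... simulate and render using scratch ...
allocator.frame_reset();
}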
Choose allocator based on usage pattern:
// Temporal patterns
Stack: LIFO, short-lived (function locals, temporaries)
Arena: Bulk allocation/deallocation (per-frame, per-level)
Ring: Streaming data (audio, network packets)
// Size patterns
Pool: Fixed-size objects (entities, particles)
Slab: Few different sizes (kernel objects)
Free List: Variable sizes (general purpose)
// Access patterns
Sequential: Arena, Stack (cache-friendly)
Random: Pool, Slab (cache-line aligned)
// Lifetime patterns
Short: Stack, Arena (batch deallocation)
Long: Pool, Free List (individual management)
Mixed: Composite allocator
# Memory allocation profiling
valgrind --tool=massif ./program # Heap usage over time
perf record -e 'kmem:*' ./program # Kernel memory events
gperftools (tcmalloc) # Allocation profiler
# Custom instrumentation
#define PROFILE_ALLOC 1
#ifdef PROFILE_ALLOC
thread_local size_t alloc_count = 0;
thread_local uint64_t alloc_cycles = 0;
void* tracked_malloc(size_t size) {
uint64_t start = __rdtsc(); // x86 timestamp counter (needs <x86intrin.h>)
void* ptr = malloc(size);
uint64_t end = __rdtsc();
alloc_count++;
alloc_cycles += (end - start);
return ptr;
}
#endif
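To turn those counters into something actionable, a small reporting helper (illustrative, printf from <cstdio>) can dump the average cost per allocation at convenient points, e.g. once per frame:
#ifdef PROFILE_ALLOC
// Illustrative reporting helper for the counters above
void report_alloc_stats(const char* tag) {
if (alloc_count == 0) return;
printf("[%s] %zu allocations, %.1f cycles/alloc on average\n",
tag, alloc_count, (double)alloc_cycles / alloc_count);
alloc_count = 0;
alloc_cycles = 0;
}
#endif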
malloc() is slow and general-purpose. Custom allocators exploit allocation patterns for massive speedups. Stack allocator for temporaries. Pool allocator for fixed sizes. Arena allocator for bulk operations. Ring buffer for streaming. Profile allocation patterns first—then pick the right allocator. Composition beats one-size-fits-all.