Custom Allocators for Performance

2025-07-22 • ~7 min read

malloc() is general-purpose and slow. System calls, fragmentation, thread synchronization, bookkeeping overhead. Custom allocators exploit allocation patterns for 10-100x speedups in allocation-heavy code.

malloc() Overhead

General-purpose allocators pay for flexibility you don't need:

// malloc() costs
Thread synchronization: ~50-200 cycles (contended)
Free list traversal: ~10-50 cycles per search
Metadata overhead: 16-32 bytes per allocation
System calls: ~1000+ cycles (sbrk/mmap)
Fragmentation: Memory waste + cache pollution

// Pathological case: many small allocations
for (int i = 0; i < 1000000; i++) {
    char* ptr = (char*)malloc(16);    // Tiny allocation
    // Use ptr...
    free(ptr);                       // Immediate deallocation
}
// Result: malloc overhead dominates useful work

Stack Allocator

LIFO allocation pattern—allocation is just pointer increment:

class StackAllocator {
    char* memory;
    size_t size;
    size_t offset = 0;

public:
    StackAllocator(size_t size) : size(size) {
        memory = (char*)aligned_alloc(64, size);  // 64-byte aligned (size must be a multiple of 64)
    }
    
    ~StackAllocator() { free(memory); }
    
    template <typename T>
    T* allocate(size_t count = 1) {
        size_t bytes = count * sizeof(T);
        // Align the current offset up to T's alignment requirement
        size_t aligned_offset = (offset + alignof(T) - 1) & ~(alignof(T) - 1);
        
        if (aligned_offset + bytes > size) return nullptr;  // OOM
        
        T* result = reinterpret_cast<T*>(memory + aligned_offset);
        offset = aligned_offset + bytes;
        return result;
    }
    
    void reset() { offset = 0; }  // Reset entire allocator - O(1)
    
    size_t bytes_used() const { return offset; }
};

// Usage pattern
void render_frame() {
    StackAllocator frame_allocator(1 << 20);  // 1MB stack
    
    // Allocate temporary objects
    Vertex* vertices = frame_allocator.allocate<Vertex>(1000);
    Matrix* transforms = frame_allocator.allocate<Matrix>(100);
    
    // Process frame...
    
    // All memory freed automatically at function exit
    // No individual free() calls needed
}

Pool Allocator

Fixed-size allocation—O(1) alloc/free with perfect cache locality:

template <typename T, size_t PoolSize>
class PoolAllocator {
    union FreeNode {
        FreeNode() {}               // Slots start life as free-list links, uninitialized
        ~FreeNode() {}              // Object destruction is the caller's responsibility
        T object;                    // When allocated
        FreeNode* next;             // When free
    };
    
    alignas(64) FreeNode pool[PoolSize];  // Cache-line aligned
    FreeNode* free_head = nullptr;
    size_t allocated_count = 0;

public:
    PoolAllocator() {
        // Initialize free list
        for (size_t i = 0; i < PoolSize - 1; i++) {
            pool[i].next = &pool[i + 1];
        }
        pool[PoolSize - 1].next = nullptr;
        free_head = &pool[0];
    }
    
    T* allocate() {
        if (!free_head) return nullptr;  // Pool exhausted
        
        FreeNode* node = free_head;
        free_head = free_head->next;
        allocated_count++;
        
        return reinterpret_cast<T*>(node);  // O(1) allocation (returns uninitialized storage)
    }
    
    void deallocate(T* ptr) {
        FreeNode* node = reinterpret_cast<FreeNode*>(ptr);
        node->next = free_head;
        free_head = node;
        allocated_count--;                  // O(1) deallocation
    }
    
    bool empty() const { return allocated_count == 0; }
    bool full() const { return allocated_count == PoolSize; }
};

// Perfect for fixed-size objects (pool sizes here are illustrative)
PoolAllocator<GameObject, 4096> game_objects;
PoolAllocator<Particle, 65536> particles;

GameObject* obj = game_objects.allocate();
// ... use object ...
game_objects.deallocate(obj);
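
allocate() hands back raw, uninitialized storage. For types with constructors, a pair of helpers along these lines can construct and destroy objects in place (a sketch layered on the class above, not part of it; helper names are illustrative, and it needs <new> and <utility>):

// Sketch: placement-new helpers on top of the PoolAllocator above
template <typename T, size_t N, typename... Args>
T* pool_create(PoolAllocator<T, N>& pool, Args&&... args) {
    T* slot = pool.allocate();                           // Raw slot from the pool
    return slot ? new (slot) T(std::forward<Args>(args)...) : nullptr;
}

template <typename T, size_t N>
void pool_destroy(PoolAllocator<T, N>& pool, T* obj) {
    if (!obj) return;
    obj->~T();                                           // Run the destructor manually
    pool.deallocate(obj);                                // Hand the slot back
}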

Arena Allocator

Bump pointer allocation—extremely fast, bulk deallocation:

class ArenaAllocator {
    char* memory;
    size_t size;
    size_t offset = 0;

public:
    ArenaAllocator(size_t size) : size(size) {
        memory = (char*)mmap(nullptr, size, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }
    
    ~ArenaAllocator() { munmap(memory, size); }
    
    void* allocate(size_t bytes, size_t alignment = sizeof(void*)) {
        // Align offset to required boundary
        size_t aligned_offset = (offset + alignment - 1) & ~(alignment - 1);
        
        if (aligned_offset + bytes > size) return nullptr;
        
        void* result = memory + aligned_offset;
        offset = aligned_offset + bytes;
        return result;
    }
    
    template <typename T>
    T* allocate(size_t count = 1) {
        return static_cast<T*>(allocate(count * sizeof(T), alignof(T)));
    }
    }
    
    void clear() { offset = 0; }  // Free all allocations - O(1)
    
    // Create checkpoint for partial resets
    size_t checkpoint() const { return offset; }
    void revert(size_t checkpoint) { offset = checkpoint; }
};

// Usage: per-frame or per-level allocation
ArenaAllocator level_arena(64 << 20);  // 64MB arena

void load_level() {
    // All level data allocated from arena
    Mesh* meshes = level_arena.allocate<Mesh>(1000);
    Texture* textures = level_arena.allocate<Texture>(500);
    
    // ... load level data ...
}

void unload_level() {
    level_arena.clear();  // Free entire level instantly
}
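
checkpoint() and revert() allow partial rollbacks within the same arena, e.g. for scratch data that shouldn't outlive one operation. A sketch (SpawnInfo and spawn_wave are hypothetical, used only for illustration):

// Sketch: scoped scratch allocations inside the level arena
struct SpawnInfo { float x, y, z; int archetype; };   // Hypothetical type

void spawn_wave(size_t entity_count) {
    size_t mark = level_arena.checkpoint();       // Remember the current offset
    
    SpawnInfo* scratch = level_arena.allocate<SpawnInfo>(entity_count);
    // ... compute spawn positions, create entities ...
    
    level_arena.revert(mark);                     // Roll back only this wave's scratch data
}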

Free List Allocator

Variable-size allocation with explicit free list management:

class FreeListAllocator {
    struct FreeBlock {
        size_t size;
        FreeBlock* next;
    };
    
    char* memory;
    size_t total_size;
    FreeBlock* free_list = nullptr;

public:
    FreeListAllocator(size_t size) : total_size(size) {
        memory = (char*)aligned_alloc(64, size);
        
        // Initialize with single large free block
        free_list = reinterpret_cast<FreeBlock*>(memory);
        free_list->size = size - sizeof(size_t);  // Save space for header
        free_list->next = nullptr;
    }
    
    void* allocate(size_t bytes) {
        bytes = (bytes + 7) & ~7;  // 8-byte alignment
        bytes += sizeof(size_t);   // Space for size header
        
        FreeBlock** current = &free_list;
        while (*current) {
            if ((*current)->size >= bytes) {
                FreeBlock* block = *current;
                
                // Remove from free list
                *current = block->next;
                
                // Split block if much larger than needed
                if (block->size >= bytes + sizeof(FreeBlock) + 64) {
                    FreeBlock* remainder = reinterpret_cast<FreeBlock*>(
                        (char*)block + bytes);
                    remainder->size = block->size - bytes;
                    remainder->next = free_list;
                    free_list = remainder;
                    
                    block->size = bytes;
                }
                
                // Store the full block size in the header (including any unsplit
                // slack, so deallocate() returns the whole block), then return
                // the user pointer
                *(size_t*)block = block->size;
                return (char*)block + sizeof(size_t);
            }
            current = &(*current)->next;
        }
        
        return nullptr;  // Out of memory
    }
    
    void deallocate(void* ptr) {
        if (!ptr) return;
        
        char* block_start = (char*)ptr - sizeof(size_t);
        size_t block_size = *(size_t*)block_start;
        
        // Add to free list
        FreeBlock* free_block = reinterpret_cast<FreeBlock*>(block_start);
        free_block->size = block_size;
        free_block->next = free_list;
        free_list = free_block;
        
        // TODO: Coalesce adjacent free blocks
    }
};
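
One way to handle the coalescing TODO is a pass like the following, written as if it were a member of FreeListAllocator (a sketch; it relies on the layout above, where a block's size includes its header, and needs <vector>, <algorithm>, <cstdint>):

// Sketch: merge byte-adjacent free blocks (would replace the TODO above)
void coalesce() {
    // Gather the free blocks and sort them by address
    std::vector<FreeBlock*> blocks;
    for (FreeBlock* b = free_list; b; b = b->next) blocks.push_back(b);
    std::sort(blocks.begin(), blocks.end(),
              [](FreeBlock* a, FreeBlock* b) {
                  return (uintptr_t)a < (uintptr_t)b;
              });
    
    // Merge neighbors that touch, then rebuild the list in address order
    free_list = nullptr;
    FreeBlock** tail = &free_list;
    for (size_t i = 0; i < blocks.size(); ) {
        FreeBlock* block = blocks[i++];
        while (i < blocks.size() &&
               (char*)block + block->size == (char*)blocks[i]) {
            block->size += blocks[i]->size;   // Absorb the adjacent block
            i++;
        }
        *tail = block;
        tail = &block->next;
    }
    *tail = nullptr;
}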

Ring Buffer Allocator

Circular allocation for streaming data:

class RingBufferAllocator {
    char* memory;
    size_t size;
    size_t head = 0;    // Allocation pointer
    size_t tail = 0;    // Deallocation pointer

public:
    RingBufferAllocator(size_t size) : size(size) {
        memory = (char*)aligned_alloc(64, size);
    }
    
    void* allocate(size_t bytes) {
        bytes = (bytes + 7) & ~7;  // 8-byte alignment
        
        // Check if the allocation fits; strict '<' against tail keeps head from
        // landing exactly on tail, so a full buffer is never mistaken for empty
        if (head >= tail) {
            if (head + bytes <= size) {
                void* result = memory + head;
                head += bytes;
                return result;
            } else if (bytes < tail) {
                // Wrap around to the start of the buffer
                head = bytes;
                return memory;
            }
        } else {
            if (head + bytes < tail) {
                void* result = memory + head;
                head += bytes;
                return result;
            }
        }
        
        return nullptr;  // Not enough space
    }
    
    // The consumer releases regions in the same order and (aligned) sizes they
    // were allocated; a production version would also account for bytes skipped
    // at the end of the buffer when the producer wraps
    void advance_tail(size_t bytes) {
        bytes = (bytes + 7) & ~7;
        tail = (tail + bytes) % size;
    }
    
    size_t used_bytes() const {
        return head >= tail ? head - tail : size - (tail - head);
    }
};

// Perfect for audio/network buffers
RingBufferAllocator audio_buffer(1 << 16);  // 64KB ring

void process_audio_frame() {
    float* samples = (float*)audio_buffer.allocate(1024 * sizeof(float));
    if (!samples) return;  // Ring full: the consumer hasn't caught up yet
    // Process audio...
    // Space is reclaimed when the consumer calls advance_tail()
}
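
The consumer side is symmetric: once a frame has been mixed or sent, advance_tail() releases the same number of aligned bytes. A minimal sketch, assuming the frame size matches the producer above:

// Consumer sketch: release one frame of samples back to the ring
void consume_audio_frame(const float* samples) {
    // ... mix or output the 1024 samples ...
    audio_buffer.advance_tail(1024 * sizeof(float));   // Free the frame's bytes
}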

SLAB Allocator

Kernel-style allocator for frequently allocated objects:

template <typename T>
class SlabAllocator {
    struct Slab {
        static const size_t OBJECTS_PER_SLAB = 64;
        alignas(64) T objects[OBJECTS_PER_SLAB];
        uint64_t free_mask = ~0ULL;  // Bitset: 1 = free, 0 = allocated
        size_t free_count = OBJECTS_PER_SLAB;
        Slab* next = nullptr;
    };
    
    Slab* partial_slabs = nullptr;  // Slabs with some free objects
    Slab* full_slabs = nullptr;     // Slabs with no free objects

public:
    T* allocate() {
        if (!partial_slabs) {
            // Allocate new slab
            partial_slabs = new Slab();
        }
        
        Slab* slab = partial_slabs;
        
        // Find first free bit
        int free_index = __builtin_ctzll(slab->free_mask);
        slab->free_mask &= ~(1ULL << free_index);
        slab->free_count--;
        
        // Move to full list if slab is now full
        if (slab->free_count == 0) {
            partial_slabs = slab->next;
            slab->next = full_slabs;
            full_slabs = slab;
        }
        
        return &slab->objects[free_index];
    }
    
    void deallocate(T* ptr) {
        // Recover the owning slab by masking the pointer. This assumes slabs
        // are allocated at addresses aligned to sizeof(Slab) and that
        // sizeof(Slab) is a power of two; plain 'new Slab()' above does not
        // guarantee that, so an aligned allocation would be needed in practice
        Slab* slab = reinterpret_cast<Slab*>(
            (uintptr_t)ptr & ~(sizeof(Slab) - 1));
        
        // Calculate object index within slab
        int index = ptr - slab->objects;
        
        // Mark as free
        slab->free_mask |= (1ULL << index);
        slab->free_count++;
        
        // Move from full to partial if needed
        if (slab->free_count == 1) {
            // Remove from full list, add to partial list
            // ... list manipulation code ...
        }
    }
};
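
Usage follows the pool allocator's pattern; a minimal sketch (Bullet is a hypothetical trivially-constructible type, since allocate() returns a raw slot):

struct Bullet { float x, y, vx, vy; };   // Hypothetical POD-style type

SlabAllocator<Bullet> bullets;

Bullet* b = bullets.allocate();    // Pops the first free bit in a partial slab
*b = {0.0f, 0.0f, 1.0f, 0.0f};
bullets.deallocate(b);             // Relies on the slab-alignment assumption noted above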

Memory Mapping Allocator

Direct system memory mapping for large allocations:

class MmapAllocator {
public:
    static void* allocate(size_t bytes) {
        // Round up to page size
        size_t page_size = getpagesize();
        bytes = (bytes + page_size - 1) & ~(page_size - 1);
        
        void* ptr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        
        if (ptr == MAP_FAILED) return nullptr;
        
        // Optional: advise kernel about usage patterns
        madvise(ptr, bytes, MADV_SEQUENTIAL);  // Sequential access
        
        return ptr;
    }
    
    static void deallocate(void* ptr, size_t bytes) {
        if (ptr) munmap(ptr, bytes);
    }
    
    // Huge page allocation for better TLB performance
    static void* allocate_huge(size_t bytes) {
        bytes = (bytes + (2 << 20) - 1) & ~((2 << 20) - 1);  // Round size up to the 2MB huge-page size
        
        void* ptr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        
        return ptr != MAP_FAILED ? ptr : nullptr;
    }
};

// Large buffer allocation
void* big_buffer = MmapAllocator::allocate_huge(1 << 30);  // 1GB with huge pages
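
MAP_HUGETLB only succeeds when the system has huge pages reserved (e.g. via vm.nr_hugepages), so callers typically fall back to regular pages. A sketch using the class above:

void* buffer = MmapAllocator::allocate_huge(1 << 30);
if (!buffer) {
    buffer = MmapAllocator::allocate(1 << 30);   // Regular page-sized mapping
}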

Allocator Composition

Combine allocators for different allocation patterns:

class CompositeAllocator {
    StackAllocator temp_stack{1 << 20};      // 1MB temp allocations
    PoolAllocator<Entity, 4096> entities;    // Game entities (pool size illustrative)
    ArenaAllocator level_arena{64 << 20};    // 64MB level data
    
public:
    // Route allocations based on usage pattern
    template <typename T>
    T* allocate_temp(size_t count = 1) {
        return temp_stack.allocate<T>(count);
    }
    }
    
    Entity* allocate_entity() {
        return entities.allocate();
    }
    
    template <typename T>
    T* allocate_level_data(size_t count = 1) {
        return level_arena.allocate<T>(count);
    }
    }
    
    void frame_reset() {
        temp_stack.reset();  // Clear temporary allocations
    }
    
    void level_reset() {
        level_arena.clear();  // Clear level data
        // Entities persist across levels
    }
};

// Allocation benchmark: 1M allocations of 64 bytes
malloc/free:       180ms
Stack allocator:     8ms   (22.5x faster)
Pool allocator:     12ms   (15x faster)
Arena allocator:     6ms   (30x faster)

Custom allocators dominate for specific patterns.
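
These figures are machine- and libc-dependent, so treat them as order-of-magnitude. A rough sketch of the kind of micro-benchmark that produces them, timing malloc/free against the ArenaAllocator defined above (assumed to be in the same translation unit, with its mmap headers):

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
    constexpr int N = 1000000;
    volatile uintptr_t sink = 0;                  // Keeps pointers observable so allocations aren't elided
    
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < N; i++) {
        char* p = (char*)malloc(64);
        sink = sink + (uintptr_t)p;
        free(p);
    }
    auto t1 = std::chrono::steady_clock::now();
    
    ArenaAllocator arena(size_t(N) * 64);         // ArenaAllocator from the section above
    auto t2 = std::chrono::steady_clock::now();
    for (int i = 0; i < N; i++) {
        sink = sink + (uintptr_t)arena.allocate(64);
    }
    auto t3 = std::chrono::steady_clock::now();
    
    auto ms = [](auto a, auto b) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
    };
    printf("malloc/free: %lld ms, arena: %lld ms\n",
           (long long)ms(t0, t1), (long long)ms(t2, t3));
    return 0;
}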

Allocation Patterns

Choose allocator based on usage pattern:

// Temporal patterns
Stack:     LIFO, short-lived (function locals, temporaries)
Arena:     Bulk allocation/deallocation (per-frame, per-level)
Ring:      Streaming data (audio, network packets)

// Size patterns  
Pool:      Fixed-size objects (entities, particles)
Slab:      Few different sizes (kernel objects)
Free List: Variable sizes (general purpose)

// Access patterns
Sequential: Arena, Stack (cache-friendly)
Random:     Pool, Slab (cache-line aligned)

// Lifetime patterns
Short:      Stack, Arena (batch deallocation)  
Long:       Pool, Free List (individual management)
Mixed:      Composite allocator

Profiling Allocation Performance

# Memory allocation profiling
valgrind --tool=massif ./program       # Heap usage over time
perf record -e kmem:* ./program        # Kernel memory events
gperftools (tcmalloc)                  # Allocation profiler

// Custom instrumentation (__rdtsc comes from <x86intrin.h> on x86)
#define PROFILE_ALLOC 1
#ifdef PROFILE_ALLOC
thread_local size_t alloc_count = 0;
thread_local uint64_t alloc_cycles = 0;

void* tracked_malloc(size_t size) {
    uint64_t start = __rdtsc();
    void* ptr = malloc(size);
    uint64_t end = __rdtsc();
    
    alloc_count++;
    alloc_cycles += (end - start);
    return ptr;
}
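
// A small reporting helper (hypothetical) to dump the averages collected above,
// e.g. at thread exit or program shutdown
void report_alloc_stats() {
    if (alloc_count > 0) {
        printf("allocations: %zu, avg cycles per malloc: %llu\n",
               alloc_count,
               (unsigned long long)(alloc_cycles / alloc_count));
    }
}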
#endif

Bottom Line

malloc() is slow and general-purpose. Custom allocators exploit allocation patterns for massive speedups. Stack allocator for temporaries. Pool allocator for fixed sizes. Arena allocator for bulk operations. Ring buffer for streaming. Profile allocation patterns first—then pick the right allocator. Composition beats one-size-fits-all.
