A Modern GPGPU API & wip linux RDNA2+ Driver
rdna driver linux gpu
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

amdgpu: implement TLSF allocator for GPU memory

This change switches GPU memory allocation to a TLSF allocator.
It lets us back allocations with properly sized large blocks while still
serving small allocations cheaply. I think this is a good design; it is also
bypassed when allocating large BOs, so user-space code that manages memory
manually with smarter schemes will probably not be slower?

The TLSF (ext) implementation is a bit of a hack. We extracted the
relevant source code from Vulkan VMA, fed it to Claude, and tried
to remove all Vulkan-related stuff. It seems to work and has
been tested in another repo, but maybe we want to add tests here.

We will see. :^)

+1055 -78
+17 -78
drivers/amdgpu/mem.cpp
··· 1 1 #include "impl.h" 2 2 3 + #include "memory/mem_pool.h" 4 + 3 5 struct AllocationImpl { 4 6 amdgpu_bo_handle bo; 5 7 amdgpu_va_handle va_handle; 6 8 }; 7 9 8 - #define VEK_HUGE_PAGE_SIZE (2ULL * 1024 * 1024) 9 - 10 10 KesAllocation amdgpu_malloc(KesDevice pd, size_t size, size_t align, KesMemory memory) { 11 11 auto *dev = reinterpret_cast<DeviceImpl *>(pd); 12 12 13 - auto aligned_size = (size + align - 1) & ~(align - 1); 14 - auto alignment = align; 15 - 16 - KesAllocation alloc = {}; 17 - auto *impl = new AllocationImpl; 18 - alloc._internal = impl; 19 - 20 - alloc.size = aligned_size; 21 - impl->bo = nullptr; 22 - impl->va_handle = nullptr; 23 - 24 - amdgpu_bo_alloc_request req = {}; 25 - req.alloc_size = aligned_size; 26 - req.phys_alignment = alignment; 27 - req.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM; 13 + int domain = AMDGPU_GEM_DOMAIN_VRAM; 14 + int flags = 0; 28 15 29 16 switch(memory) { 30 17 case KesMemoryDefault: 31 - req.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | 18 + flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | 32 19 AMDGPU_GEM_CREATE_VRAM_CLEARED; 33 20 break; 34 21 case KesMemoryGpu: 35 - req.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 22 + flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 36 23 break; 37 24 case KesMemoryReadback: 38 - req.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | 25 + flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | 39 26 AMDGPU_GEM_CREATE_COHERENT; 40 27 break; 41 28 } 42 29 43 - // some systems (DCE) require contiguous addresses as they don't use the MMU Infinity Cache. 30 + // @todo: some systems (DCE) require contiguous addresses as they don't use the MMU Infinity Cache. 
44 31 // req.flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; 45 32 46 - if (amdgpu_bo_alloc(dev->amd_handle, &req, &impl->bo) != 0) { 47 - log("amdgpu_bo_alloc failed"); 48 - return alloc; 49 - } 33 + auto res = pool_alloc(dev, size, align, domain, flags); 50 34 51 - uint64_t va_base; 52 - int r = amdgpu_va_range_alloc(dev->amd_handle, 53 - amdgpu_gpu_va_range_general, 54 - aligned_size, alignment, 0, 55 - &va_base, &impl->va_handle, 0); 56 - if (r != 0) { 57 - log("amdgpu_va_range_alloc failed"); 58 - amdgpu_bo_free(impl->bo); 59 - impl->bo = nullptr; 60 - return alloc; 61 - } 62 - alloc.gpu = va_base; 63 - 64 - auto va_flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE; 65 - r = amdgpu_bo_va_op(impl->bo, 0, aligned_size, va_base, va_flags, AMDGPU_VA_OP_MAP); 66 - if (r != 0) { 67 - log("amdgpu_bo_va_op failed"); 68 - amdgpu_va_range_free(impl->va_handle); 69 - amdgpu_bo_free(impl->bo); 70 - impl->bo = nullptr; 71 - impl->va_handle = nullptr; 72 - return alloc; 35 + KesAllocation alloc = {}; 36 + if (res.gpu) { 37 + alloc.size = res.size; 38 + alloc.gpu = res.gpu; 39 + alloc.cpu = res.cpu; 40 + alloc._internal = res._internal; 73 41 } 74 42 75 - if (memory != KesMemoryGpu) { 76 - r = amdgpu_bo_cpu_map(impl->bo, &alloc.cpu); 77 - if (r != 0) { 78 - log("amdgpu_bo_cpu_map failed"); 79 - amdgpu_bo_va_op(impl->bo, 0, aligned_size, va_base, 0, AMDGPU_VA_OP_UNMAP); 80 - amdgpu_va_range_free(impl->va_handle); 81 - amdgpu_bo_free(impl->bo); 82 - impl->bo = nullptr; 83 - impl->va_handle = nullptr; 84 - alloc.gpu = 0; 85 - return alloc; 86 - } 87 - } 88 - 89 - device_register_allocation(dev, impl->bo); 90 - 91 43 return alloc; 92 44 } 93 45 94 46 void amdgpu_free(KesDevice pd, struct KesAllocation *alloc) { 95 - AllocationImpl *impl = reinterpret_cast<AllocationImpl*>(alloc->_internal); 96 - 97 - if (alloc->cpu) { 98 - amdgpu_bo_cpu_unmap(impl->bo); 99 - alloc->cpu = nullptr; 100 - } 101 - if (impl->bo) { 102 - amdgpu_bo_va_op(impl->bo, 0, 
alloc->size, alloc->gpu, 0, AMDGPU_VA_OP_UNMAP); 103 - amdgpu_bo_free(impl->bo); 104 - impl->bo = nullptr; 105 - } 106 - if (impl->va_handle) { 107 - amdgpu_va_range_free(impl->va_handle); 108 - impl->va_handle = nullptr; 109 - } 47 + pool_free(alloc->_internal); 110 48 111 49 alloc->gpu = 0; 50 + alloc->cpu = 0; 112 51 }
+195
drivers/amdgpu/memory/alloc.h
#pragma once
#include <stdlib.h> // posix_memalign
#include <cstdlib>
#include <vector>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <new>
#include <type_traits>
#include <utility>

// Random access allocator for fixed-size objects of type T.
//
// Memory is obtained in BlockSize-byte blocks that are also BlockSize-aligned,
// so the owning block of any object can be found by masking its address.
// Every block keeps an intrusive free list of item indices; blocks that still
// have free items are chained through `next_avail` so that freed slots in
// older blocks are reused before a new block is created.
//
// tip: elements must be at least 4-bytes sized, else they will be padded to 4-byte boundary.
//
// Objects are never destroyed individually (T must be trivially destructible)
// and blocks are only returned to the system when the allocator is destroyed.
// Throws std::bad_alloc when a new block cannot be obtained.
template<typename T, uint32_t BlockSize = 65536>
class SlabAllocator {
    static_assert(std::is_trivially_destructible<T>::value,
                  "SlabAllocator only supports trivially destructible types to prevent memory leaks on destruction.");
    static_assert((BlockSize & (BlockSize - 1)) == 0, "BlockSize must be a power of 2");
public:
    explicit SlabAllocator() : m_head(nullptr), m_avail(nullptr) {
        create_next_block();
    }

    ~SlabAllocator() {
        Block *curr = m_head;
        while (curr) {
            Block *next = curr->next;
            ::free(curr);
            curr = next;
        }
    }

    // The allocator owns raw blocks; a copy would free them twice.
    SlabAllocator(const SlabAllocator&) = delete;
    SlabAllocator& operator=(const SlabAllocator&) = delete;

    // Constructs a T from `args` in a free slot and returns it.
    template<typename... Args>
    T* alloc(Args&&... args) {
        if (!m_avail) {
            create_next_block();
        }

        Block *b = m_avail;
        Item *item = &get_items(b)[b->first_free];
        b->first_free = item->next;

        // The block just handed out its last slot: unlink it from the
        // availability list until one of its items is freed again.
        if (b->first_free == UINT32_MAX) {
            m_avail = b->next_avail;
            b->next_avail = nullptr;
        }

        T* obj = reinterpret_cast<T*>(item->storage);
        new (obj) T{std::forward<Args>(args)...};
        return obj;
    }

    // Returns a slot previously obtained from alloc(). nullptr is ignored.
    void free(T* ptr) {
        if (!ptr) return;

        Block *b = get_block(ptr);
        Item *item = reinterpret_cast<Item*>(ptr);
        Item *items_base = get_items(b);

        const uint32_t index = static_cast<uint32_t>(item - items_base);
        assert(index < capacity() && "pointer does not belong to this allocator");

        // A full block becomes available again as soon as one slot is freed.
        if (b->first_free == UINT32_MAX) {
            b->next_avail = m_avail;
            m_avail = b;
        }

        item->next = b->first_free;
        b->first_free = index;
    }

private:
    // A slot either stores a live T or, while free, the index of the next free slot.
    union Item {
        uint32_t next;
        alignas(T) unsigned char storage[sizeof(T)];
    };

    struct Block {
        Block *next;         // every block ever created (used for destruction)
        Block *next_avail;   // next block that still has free slots
        uint32_t first_free; // index of first free slot, UINT32_MAX when full
    };

    // Byte offset of the first Item inside a block (header rounded up to Item alignment).
    static constexpr size_t items_offset() {
        return (sizeof(Block) + alignof(Item) - 1) & ~static_cast<size_t>(alignof(Item) - 1);
    }

    // Number of Items that fit in one block after the header.
    static constexpr uint32_t capacity() {
        return BlockSize > items_offset()
            ? static_cast<uint32_t>((BlockSize - items_offset()) / sizeof(Item))
            : 0u;
    }

    // Blocks are BlockSize-aligned, so masking the low bits yields the header.
    static Block *get_block(T *p) {
        const uintptr_t mask = ~static_cast<uintptr_t>(BlockSize - 1);
        return reinterpret_cast<Block*>(reinterpret_cast<uintptr_t>(p) & mask);
    }

    static Item *get_items(Block *b) {
        return reinterpret_cast<Item*>(reinterpret_cast<unsigned char*>(b) + items_offset());
    }

    Block *m_head;
    Block *m_avail;

    void create_next_block() {
        static_assert(BlockSize >= sizeof(void*), "posix_memalign needs an alignment of at least sizeof(void*)");
        static_assert(alignof(Item) <= BlockSize, "BlockSize is smaller than the item alignment");
        static_assert(capacity() >= 1, "BlockSize is too small to hold a single item");

        void *block = nullptr;
        // posix_memalign reports failure through its return value and leaves
        // `block` unspecified, so it must be checked before use.
        if (posix_memalign(&block, BlockSize, BlockSize) != 0 || !block)
            throw std::bad_alloc();

        Block *header = new (block) Block();
        header->next = m_head;
        header->next_avail = m_avail;
        header->first_free = 0;
        m_head = header;
        m_avail = header;

        // Thread all slots into the block's free list: 0 -> 1 -> ... -> end.
        Item *items = get_items(header);
        const uint32_t cap = capacity();

        for (uint32_t i = 0; i + 1 < cap; ++i)
            items[i].next = i + 1;
        items[cap - 1].next = UINT32_MAX;
    }
};

// Ordered (bump-pointer), arbitrary size allocator.
//
// Objects are carved sequentially out of BlockSize-byte blocks. Individual
// objects cannot be freed; reset() rewinds the allocator so that all blocks
// obtained so far are reused by subsequent allocations.
// Throws std::bad_alloc when a new block cannot be obtained.
template<size_t BlockSize = 65536>
class LinearAllocator {
public:
    LinearAllocator() : m_head(nullptr), m_tail(nullptr), m_current(nullptr) {
        allocate_block();
    }

    ~LinearAllocator() {
        Block* b = m_head;
        while (b) {
            Block* next = b->next;
            ::free(b);
            b = next;
        }
    }

    LinearAllocator(const LinearAllocator&) = delete;
    LinearAllocator& operator=(const LinearAllocator&) = delete;

    // Constructs a T from `args` at the next suitably aligned position.
    template<typename T, typename... Args>
    T* alloc(Args&&... args) {
        static_assert(BlockSize > sizeof(Block), "BlockSize is smaller than the block header");
        // Account for the padding needed to align T right after the header,
        // otherwise a T that "just fits" could run past the end of a fresh block.
        static_assert(sizeof(T) + worst_case_padding(alignof(T)) <= BlockSize - sizeof(Block),
                      "Type too large for Arena block");
        static_assert(std::is_trivially_destructible<T>::value,
                      "LinearAllocator only supports trivially destructible types to prevent memory leaks on destruction.");

        uintptr_t aligned_ptr = aligned_cursor(m_current, alignof(T));

        if (aligned_ptr + sizeof(T) > reinterpret_cast<uintptr_t>(m_current->end)) {
            // Blocks after m_current are always empty (fresh or rewound by
            // reset()), so reuse the next one before asking for more memory.
            if (m_current->next)
                m_current = m_current->next;
            else
                allocate_block();
            aligned_ptr = aligned_cursor(m_current, alignof(T));
        }

        m_current->curr = reinterpret_cast<uint8_t*>(aligned_ptr + sizeof(T));

        return new (reinterpret_cast<void*>(aligned_ptr)) T(std::forward<Args>(args)...);
    }

    // Rewinds every block and restarts allocation from the first one.
    // All pointers previously returned by alloc() become invalid.
    void reset() {
        for (Block* b = m_head; b; b = b->next)
            b->curr = b->data();
        m_current = m_head;
    }

private:
    struct Block {
        Block* next;   // next block in creation order
        uint8_t* curr; // bump cursor
        uint8_t* end;  // one past the last usable byte

        uint8_t* data() { return reinterpret_cast<uint8_t*>(this + 1); }
    };

    // Largest padding that can be needed to align an object of alignment `a`
    // at the start of a fresh block. malloc aligns the block for
    // std::max_align_t, so for fundamental alignments the padding is exact.
    static constexpr size_t worst_case_padding(size_t a) {
        return a <= alignof(std::max_align_t) ? (a - sizeof(Block) % a) % a : a - 1;
    }

    static uintptr_t aligned_cursor(const Block* b, size_t alignment) {
        const uintptr_t cursor = reinterpret_cast<uintptr_t>(b->curr);
        return (cursor + (alignment - 1)) & ~static_cast<uintptr_t>(alignment - 1);
    }

    Block* m_head;    // first block (oldest)
    Block* m_tail;    // last block (newest)
    Block* m_current; // block currently being bumped

    // Appends a fresh block and makes it the current one.
    void allocate_block() {
        void* raw = std::malloc(BlockSize);
        if (!raw)
            throw std::bad_alloc();

        Block* new_block = new (raw) Block();
        new_block->next = nullptr;
        new_block->curr = new_block->data();
        new_block->end = reinterpret_cast<uint8_t*>(raw) + BlockSize;

        if (m_tail)
            m_tail->next = new_block;
        else
            m_head = new_block;
        m_tail = new_block;
        m_current = new_block;
    }
};
+230
drivers/amdgpu/memory/mem_pool.cpp
··· 1 + #include "mem_pool.h" 2 + #include "tlsf.h" 3 + 4 + #include <cassert> 5 + #include <cstring> 6 + #include <mutex> 7 + #include <vector> 8 + #include <unordered_map> 9 + 10 + // Allocations larger than SLAB_SIZE get a dedicated BO with no sub-allocator. 11 + static constexpr uint64_t SLAB_SIZE = 64ull * 1024 * 1024; 12 + static constexpr uint64_t MIN_VA_ALIGNMENT = 4096; 13 + 14 + struct Pool; 15 + struct Slab; 16 + 17 + struct Slab { 18 + Pool* pool = nullptr; 19 + DeviceImpl *dev = nullptr; 20 + amdgpu_bo_handle bo = nullptr; 21 + amdgpu_va_handle va_handle = nullptr; 22 + uint64_t gpu_base = 0; 23 + void* cpu_base = nullptr; 24 + uint64_t size = 0; 25 + TlsfAllocator* tlsf = nullptr; // null for dedicated slabs 26 + bool dedicated = false; 27 + }; 28 + 29 + struct AllocationImpl { 30 + Slab* slab; 31 + TlsfAllocator::Allocation tlsf; // unused for dedicated slabs 32 + }; 33 + 34 + struct PoolKey { 35 + uint32_t domain; 36 + uint32_t flags; 37 + bool operator==(const PoolKey& o) const { 38 + return domain == o.domain && flags == o.flags; 39 + } 40 + }; 41 + 42 + struct PoolKeyHash { 43 + size_t operator()(const PoolKey& k) const { 44 + return std::hash<uint64_t>()((uint64_t)k.domain << 32 | k.flags); 45 + } 46 + }; 47 + 48 + struct Pool { 49 + PoolKey key; 50 + std::mutex mutex; 51 + std::vector<Slab*> slabs; // pooled slabs only; dedicated slabs are not listed 52 + }; 53 + 54 + static std::mutex g_registry_mutex; 55 + static std::unordered_map<PoolKey, Pool*, PoolKeyHash> g_pools; 56 + 57 + static Pool* get_or_create_pool(const PoolKey& key) { 58 + std::lock_guard<std::mutex> lk(g_registry_mutex); 59 + auto it = g_pools.find(key); 60 + if (it != g_pools.end()) return it->second; 61 + 62 + Pool* pool = new Pool; 63 + pool->key = key; 64 + g_pools[key] = pool; 65 + return pool; 66 + } 67 + 68 + #include <iostream> 69 + 70 + static Slab* slab_create(Pool* pool, 71 + DeviceImpl *dev, 72 + uint64_t size, 73 + uint32_t domain, 74 + uint32_t flags, 75 + bool 
dedicated) { 76 + 77 + std::cout << "create slab " << std::endl; 78 + 79 + amdgpu_bo_alloc_request req = {}; 80 + req.alloc_size = size; 81 + req.phys_alignment = MIN_VA_ALIGNMENT; 82 + req.preferred_heap = domain; 83 + req.flags = flags; 84 + 85 + amdgpu_bo_handle bo = nullptr; 86 + if (amdgpu_bo_alloc(dev->amd_handle, &req, &bo) != 0) 87 + return nullptr; 88 + 89 + uint64_t gpu_base = 0; 90 + amdgpu_va_handle va_handle = nullptr; 91 + if (amdgpu_va_range_alloc(dev->amd_handle, 92 + amdgpu_gpu_va_range_general, 93 + size, MIN_VA_ALIGNMENT, 0, 94 + &gpu_base, &va_handle, 0) != 0) { 95 + amdgpu_bo_free(bo); 96 + return nullptr; 97 + } 98 + 99 + const uint32_t va_flags = AMDGPU_VM_PAGE_READABLE | 100 + AMDGPU_VM_PAGE_WRITEABLE | 101 + AMDGPU_VM_PAGE_EXECUTABLE; 102 + if (amdgpu_bo_va_op(bo, 0, size, gpu_base, va_flags, AMDGPU_VA_OP_MAP) != 0) { 103 + amdgpu_va_range_free(va_handle); 104 + amdgpu_bo_free(bo); 105 + return nullptr; 106 + } 107 + 108 + void* cpu_base = nullptr; 109 + if (!(flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)) { 110 + if (amdgpu_bo_cpu_map(bo, &cpu_base) != 0) { 111 + amdgpu_bo_va_op(bo, 0, size, gpu_base, 0, AMDGPU_VA_OP_UNMAP); 112 + amdgpu_va_range_free(va_handle); 113 + amdgpu_bo_free(bo); 114 + return nullptr; 115 + } 116 + } 117 + 118 + device_register_allocation(dev, bo); 119 + 120 + Slab* slab = new Slab; 121 + slab->pool = pool; 122 + slab->dev = dev; 123 + slab->bo = bo; 124 + slab->va_handle = va_handle; 125 + slab->gpu_base = gpu_base; 126 + slab->cpu_base = cpu_base; 127 + slab->size = size; 128 + slab->dedicated = dedicated; 129 + slab->tlsf = dedicated ? 
nullptr : new TlsfAllocator(size); 130 + return slab; 131 + } 132 + 133 + static void slab_destroy(Slab* slab) { 134 + if (slab->cpu_base) 135 + amdgpu_bo_cpu_unmap(slab->bo); 136 + 137 + amdgpu_bo_va_op(slab->bo, 0, slab->size, 138 + slab->gpu_base, 0, AMDGPU_VA_OP_UNMAP); 139 + amdgpu_va_range_free(slab->va_handle); 140 + amdgpu_bo_free(slab->bo); 141 + 142 + delete slab->tlsf; 143 + delete slab; 144 + } 145 + 146 + PoolAllocation pool_alloc(DeviceImpl *dev, 147 + size_t size, 148 + size_t align, 149 + uint32_t domain, 150 + uint32_t flags) { 151 + PoolAllocation out = {}; 152 + if (!dev || size == 0) return out; 153 + 154 + assert((align & (align - 1)) == 0 && "alignment must be a power of two"); 155 + 156 + Pool* pool = get_or_create_pool({ domain, flags }); 157 + 158 + // Large allocations bypass the pool entirely: one BO, one allocation, no 159 + // TLSF overhead. We still need the pool pointer for the mutex on free. 160 + if ((uint64_t)size > SLAB_SIZE) { 161 + Slab* slab = slab_create(pool, dev, (uint64_t)size, domain, flags, 162 + true); 163 + if (!slab) return out; 164 + 165 + AllocationImpl* impl = new AllocationImpl; 166 + impl->slab = slab; 167 + 168 + out.gpu = slab->gpu_base; 169 + out.cpu = slab->cpu_base; 170 + out.size = slab->size; 171 + out._internal = impl; 172 + return out; 173 + } 174 + 175 + std::lock_guard<std::mutex> lk(pool->mutex); 176 + 177 + TlsfAllocator::Allocation tlsf; 178 + Slab* chosen = nullptr; 179 + 180 + // Try existing pooled slabs, newest first. 
181 + for (int i = (int)pool->slabs.size() - 1; i >= 0; --i) { 182 + if (pool->slabs[i]->tlsf->allocate(size, align, tlsf)) { 183 + chosen = pool->slabs[i]; 184 + break; 185 + } 186 + } 187 + 188 + if (!chosen) { 189 + Slab* slab = slab_create(pool, dev, SLAB_SIZE, domain, flags, false); 190 + if (!slab) return out; 191 + 192 + pool->slabs.push_back(slab); 193 + 194 + if (!slab->tlsf->allocate(size, align, tlsf)) { 195 + assert(false && "fresh slab could not satisfy allocation"); 196 + return out; 197 + } 198 + chosen = slab; 199 + } 200 + 201 + AllocationImpl* impl = new AllocationImpl{ chosen, tlsf }; 202 + 203 + out.gpu = chosen->gpu_base + tlsf.offset; 204 + out.cpu = chosen->cpu_base 205 + ? (uint8_t*)chosen->cpu_base + tlsf.offset 206 + : nullptr; 207 + out.size = tlsf.size; 208 + out._internal = impl; 209 + return out; 210 + } 211 + 212 + void pool_free(void *internal) 213 + { 214 + if (!internal) return; 215 + 216 + AllocationImpl* impl = static_cast<AllocationImpl*>(internal); 217 + Slab* slab = impl->slab; 218 + 219 + if (slab->dedicated) { 220 + // No pool lock needed — dedicated slabs are not shared. 221 + // Destroy the slab immediately; its only allocation is gone. 222 + slab_destroy(slab); 223 + } else { 224 + Pool* pool = slab->pool; 225 + std::lock_guard<std::mutex> lk(pool->mutex); 226 + slab->tlsf->free(impl->tlsf); 227 + } 228 + 229 + delete impl; 230 + }
+26
drivers/amdgpu/memory/mem_pool.h
··· 1 + #pragma once 2 + 3 + #include "impl.h" 4 + #include <amdgpu.h> 5 + #include <amdgpu_drm.h> 6 + 7 + #include <cstddef> 8 + #include <cstdint> 9 + 10 + // @todo: come up with a better name. 11 + // @todo: move out the pool concept outside; we may want to manage our own pools 12 + // sometimes (like one for shaders etc, not interlaced with user-allocations). 13 + struct PoolAllocation { 14 + uint64_t gpu; 15 + void* cpu; 16 + uint64_t size; 17 + void* _internal; 18 + }; 19 + 20 + PoolAllocation pool_alloc(DeviceImpl *dev, 21 + size_t size, 22 + size_t align, 23 + uint32_t domain, 24 + uint32_t flags); 25 + 26 + void pool_free(void *internal);
+567
drivers/amdgpu/memory/tlsf.h
··· 1 + #pragma once 2 + #include <cstdint> 3 + #include <cassert> 4 + #include <cstring> 5 + #include <iostream> 6 + #include "alloc.h" 7 + 8 + // Two-Level Segregated Fit allocator. 9 + // 10 + // Architecture (based on VMA's VmaBlockMetadata_TLSF): 11 + // 12 + // Null block — a permanent sentinel that always occupies the tail of the 13 + // address space. It is never placed in the free lists; instead it is 14 + // checked explicitly as a last resort. Its size shrinks as allocations 15 + // consume it and grows as adjacent freed blocks are merged back into it. 16 + // Having a null block means nextPhys is never nullptr for any real (non- 17 + // null) block, which simplifies every split/merge path. 18 + // 19 + // Free/taken encoding — a block's free state is stored inside prevFree_: 20 + // prevFree_ == this → block is live ("taken") 21 + // prevFree_ == nullptr → block is free AND is the head of its bucket list 22 + // prevFree_ == other → block is free AND has a predecessor in the bucket 23 + // nextFree_ and userData_ share a union: a taken block carries caller 24 + // user-data; a free block carries its next-in-list pointer. This keeps 25 + // sizeof(Block) minimal with no separate bool field. 26 + // 27 + // Alignment / front-padding (from VMA's Alloc()): 28 + // When a chosen block starts before the required aligned offset, the gap 29 + // bytes are handled by preferentially growing the previous free block into 30 + // the gap (cheap, no new block needed). Only if the previous block is not 31 + // free is a new taken "gap block" created. This avoids filling the free 32 + // lists with tiny padding blocks. 33 + 34 + class TlsfAllocator { 35 + public: 36 + // ------------------------------------------------------------------------- 37 + // Public types 38 + // ------------------------------------------------------------------------- 39 + 40 + // Allocation is the sole currency: keep it alive for the lifetime of the 41 + // allocation and pass it to free(). 
_block is opaque — do not touch. 42 + struct Allocation { 43 + uint64_t offset = 0; 44 + uint64_t size = 0; 45 + void* userData = nullptr; 46 + void* _block = nullptr; // opaque Block*, do not use directly 47 + }; 48 + 49 + struct Stats { 50 + uint64_t totalSize = 0; 51 + uint64_t freeBytes = 0; // free blocks, excludes null block 52 + uint64_t usedBytes = 0; 53 + uint64_t nullBlockSize = 0; // uncommitted tail 54 + uint64_t largestFreeBlock = 0; // includes null block 55 + uint32_t freeBlockCount = 0; // excludes null block 56 + uint32_t usedBlockCount = 0; 57 + }; 58 + 59 + // ------------------------------------------------------------------------- 60 + // Construction 61 + // ------------------------------------------------------------------------- 62 + 63 + explicit TlsfAllocator(uint64_t size) 64 + : m_totalSize(size) 65 + { 66 + assert(size > 0 && "heap size must be > 0"); 67 + 68 + memset(&m_flBitmap, 0, sizeof(m_flBitmap)); 69 + memset(m_slBitmap, 0, sizeof(m_slBitmap)); 70 + memset(m_freeLists, 0, sizeof(m_freeLists)); 71 + 72 + // The null block starts as the entire heap. 
73 + m_nullBlock = m_blockPool.alloc(); 74 + *m_nullBlock = Block{}; 75 + m_nullBlock->offset = 0; 76 + m_nullBlock->size = size; 77 + m_nullBlock->markFree(); 78 + m_nullBlock->nextFree() = nullptr; 79 + m_nullBlock->prevFree() = nullptr; 80 + 81 + #ifdef DEBUG 82 + validate(); 83 + #endif 84 + } 85 + 86 + // ------------------------------------------------------------------------- 87 + // allocate 88 + // ------------------------------------------------------------------------- 89 + 90 + bool allocate(uint64_t size, uint64_t alignment, Allocation& out) 91 + { 92 + assert(size >= 1 && "size must be > 0"); 93 + assert(isPow2(alignment) && "alignment must be a power of two"); 94 + if (alignment < 1) alignment = 1; 95 + 96 + uint64_t sizeNeeded = size + alignment - 1; 97 + 98 + uint32_t listIdx = 0; 99 + Block* block = findFreeBlock(sizeNeeded, listIdx); 100 + 101 + if (!block) { 102 + if (m_nullBlock->size >= sizeNeeded) 103 + block = m_nullBlock; 104 + else 105 + return false; 106 + } 107 + 108 + commitAlloc(block, size, alignment, out); 109 + return true; 110 + } 111 + 112 + // ------------------------------------------------------------------------- 113 + // free 114 + // ------------------------------------------------------------------------- 115 + 116 + void free(const Allocation& alloc) 117 + { 118 + assert(alloc._block && "null allocation"); 119 + Block* block = static_cast<Block*>(alloc._block); 120 + assert(!block->isFree() && "double free"); 121 + 122 + Block* next = block->nextPhys; // always non-null (null block is last) 123 + Block* prev = block->prevPhys; 124 + 125 + // Merge with previous neighbour if free. 126 + if (prev && prev->isFree()) { 127 + removeFreeBlock(prev); 128 + mergeInto(block, prev); // block grows backward, prev is freed 129 + } 130 + 131 + // Merge with next neighbour. 
132 + if (next == m_nullBlock) { 133 + mergeInto(m_nullBlock, block); // null block grows backward 134 + } else if (next->isFree()) { 135 + removeFreeBlock(next); 136 + mergeInto(next, block); // next grows backward, block is freed 137 + insertFreeBlock(next); 138 + } else { 139 + block->markTaken(); // required before insertFreeBlock 140 + insertFreeBlock(block); 141 + } 142 + 143 + #ifdef DEBUG 144 + validate(); 145 + #endif 146 + } 147 + 148 + // ------------------------------------------------------------------------- 149 + // stats 150 + // ------------------------------------------------------------------------- 151 + 152 + Stats stats() const 153 + { 154 + Stats s{}; 155 + s.totalSize = m_totalSize; 156 + s.nullBlockSize = m_nullBlock->size; 157 + s.largestFreeBlock = m_nullBlock->size; 158 + 159 + for (const Block* b = m_nullBlock->prevPhys; b; b = b->prevPhys) { 160 + if (b->isFree()) { 161 + ++s.freeBlockCount; 162 + s.freeBytes += b->size; 163 + if (b->size > s.largestFreeBlock) 164 + s.largestFreeBlock = b->size; 165 + } else { 166 + ++s.usedBlockCount; 167 + s.usedBytes += b->size; 168 + } 169 + } 170 + return s; 171 + } 172 + 173 + // ------------------------------------------------------------------------- 174 + // validate / debugVisual 175 + // ------------------------------------------------------------------------- 176 + 177 + void validate() const 178 + { 179 + validateFreeLists(); 180 + validateBitmaps(); 181 + validatePhysicalList(); 182 + } 183 + 184 + void debugVisual() const 185 + { 186 + static constexpr int W = 80; 187 + char line[W]; 188 + memset(line, '.', W); 189 + 190 + for (const Block* b = m_nullBlock; b; b = b->prevPhys) { 191 + int s = (int)((b->offset * W) / m_totalSize); 192 + int e = (int)(((b->offset + b->size) * W) / m_totalSize); 193 + char c = (b == m_nullBlock) ? '~' : (b->isFree() ? 
'_' : '#'); 194 + for (int i = s; i < e && i < W; ++i) line[i] = c; 195 + } 196 + std::cout << std::string(line, W) << "\n"; 197 + } 198 + 199 + private: 200 + // ------------------------------------------------------------------------- 201 + // Constants 202 + // ------------------------------------------------------------------------- 203 + 204 + static constexpr int SL_LOG2 = 5; 205 + static constexpr int SL_COUNT = 1 << SL_LOG2; // 32 206 + static constexpr int FL_COUNT = 32; 207 + 208 + // ------------------------------------------------------------------------- 209 + // Block 210 + // ------------------------------------------------------------------------- 211 + 212 + struct Block { 213 + uint64_t offset = 0; 214 + uint64_t size = 0; 215 + 216 + Block* prevPhys = nullptr; 217 + Block* nextPhys = nullptr; 218 + 219 + // Free/taken encoded in prevFree_ (see class comment). 220 + Block* prevFree_ = nullptr; 221 + union { 222 + Block* nextFree_; 223 + void* userData_; 224 + }; 225 + 226 + void markFree() { prevFree_ = nullptr; } 227 + void markTaken() { prevFree_ = this; userData_ = nullptr; } 228 + bool isFree() const { return prevFree_ != this; } 229 + 230 + Block*& prevFree() { return prevFree_; } 231 + Block*& nextFree() { assert(isFree()); return nextFree_; } 232 + void*& userData() { assert(!isFree()); return userData_; } 233 + }; 234 + 235 + // ------------------------------------------------------------------------- 236 + // Members 237 + // ------------------------------------------------------------------------- 238 + 239 + uint64_t m_totalSize; 240 + Block* m_nullBlock = nullptr; 241 + 242 + uint32_t m_flBitmap = 0; 243 + uint32_t m_slBitmap[FL_COUNT]{}; 244 + Block* m_freeLists[FL_COUNT][SL_COUNT]{}; 245 + 246 + SlabAllocator<Block> m_blockPool; 247 + 248 + // ------------------------------------------------------------------------- 249 + // Helpers 250 + // ------------------------------------------------------------------------- 251 + 252 + static 
bool isPow2(uint64_t x) { return x && !(x & (x - 1)); } 253 + 254 + static uint64_t alignUp(uint64_t x, uint64_t a) 255 + { 256 + return (x + a - 1) & ~(a - 1); 257 + } 258 + 259 + static int bsf(uint32_t x) { assert(x); return __builtin_ctz(x); } 260 + static int msb(uint64_t x) { assert(x); return 63 - __builtin_clzll(x); } 261 + 262 + // Map size to a (fl, sl) index. Valid for any size >= 1. 263 + static void sizeToIndex(uint64_t size, int& fl, int& sl) 264 + { 265 + assert(size >= 1); 266 + if (size <= (uint64_t)SL_COUNT) { 267 + fl = 0; 268 + sl = (int)size - 1; 269 + } else { 270 + fl = msb(size); 271 + if (fl >= FL_COUNT) fl = FL_COUNT - 1; 272 + sl = (int)((size >> (fl - SL_LOG2)) & (SL_COUNT - 1)); 273 + } 274 + } 275 + 276 + // ------------------------------------------------------------------------- 277 + // Free-list operations 278 + // ------------------------------------------------------------------------- 279 + 280 + // Insert block into the appropriate free list. 281 + // Precondition: block->markTaken() has been called (isFree() == false). 282 + // The function transitions it to free as part of the operation. 283 + void insertFreeBlock(Block* b) 284 + { 285 + assert(!b->isFree() && "call markTaken() before insertFreeBlock()"); 286 + assert(b != m_nullBlock); 287 + 288 + int fl, sl; 289 + sizeToIndex(b->size, fl, sl); 290 + 291 + b->prevFree() = nullptr; 292 + b->nextFree() = m_freeLists[fl][sl]; 293 + m_freeLists[fl][sl] = b; 294 + 295 + if (b->nextFree()) 296 + b->nextFree()->prevFree() = b; 297 + else { 298 + m_slBitmap[fl] |= (1u << sl); 299 + m_flBitmap |= (1u << fl); 300 + } 301 + } 302 + 303 + // Remove block from its free list and mark it taken. 
304 + void removeFreeBlock(Block* b) 305 + { 306 + assert(b->isFree()); 307 + assert(b != m_nullBlock); 308 + 309 + int fl, sl; 310 + sizeToIndex(b->size, fl, sl); 311 + 312 + if (b->nextFree()) 313 + b->nextFree()->prevFree() = b->prevFree(); 314 + 315 + if (b->prevFree()) 316 + b->prevFree()->nextFree() = b->nextFree(); 317 + else { 318 + // b was the list head 319 + m_freeLists[fl][sl] = b->nextFree(); 320 + if (!m_freeLists[fl][sl]) { 321 + m_slBitmap[fl] &= ~(1u << sl); 322 + if (!m_slBitmap[fl]) 323 + m_flBitmap &= ~(1u << fl); 324 + } 325 + } 326 + 327 + b->markTaken(); 328 + } 329 + 330 + // Find the smallest free block whose size >= 'size'. 331 + // Never returns the null block. 332 + Block* findFreeBlock(uint64_t size, uint32_t& outListIdx) const 333 + { 334 + int fl, sl; 335 + sizeToIndex(size, fl, sl); 336 + 337 + // Look for a suitable SL bin within this FL class. 338 + uint32_t slMap = m_slBitmap[fl] & (~0u << sl); 339 + if (slMap) { 340 + sl = bsf(slMap); 341 + outListIdx = fl * SL_COUNT + sl; 342 + return m_freeLists[fl][sl]; 343 + } 344 + 345 + // No match in this FL class — search higher FL classes. 346 + uint32_t flMap = m_flBitmap & (~0u << (fl + 1)); 347 + if (!flMap) return nullptr; 348 + 349 + fl = bsf(flMap); 350 + sl = bsf(m_slBitmap[fl]); 351 + outListIdx = fl * SL_COUNT + sl; 352 + return m_freeLists[fl][sl]; 353 + } 354 + 355 + // ------------------------------------------------------------------------- 356 + // mergeInto(dst, src) 357 + // 358 + // Absorbs 'src' into 'dst'. src must be the physical predecessor of dst 359 + // (src->nextPhys == dst). src must already be removed from the free list 360 + // (i.e. markTaken()). dst grows backward to cover src's range. 361 + // src is returned to the block pool. 
362 + // ------------------------------------------------------------------------- 363 + 364 + void mergeInto(Block* dst, Block* src) 365 + { 366 + assert(src->nextPhys == dst); 367 + assert(!src->isFree() && "remove src from free list before merging"); 368 + 369 + dst->offset = src->offset; 370 + dst->size += src->size; 371 + dst->prevPhys = src->prevPhys; 372 + if (dst->prevPhys) 373 + dst->prevPhys->nextPhys = dst; 374 + 375 + m_blockPool.free(src); 376 + } 377 + 378 + // ------------------------------------------------------------------------- 379 + // commitAlloc 380 + // 381 + // Carve an allocation of 'size' bytes at an aligned offset out of 'block'. 382 + // 'block' may be a regular free block (already removed from free lists, so 383 + // marked taken) or the null block (never in free lists, stays marked free 384 + // until we explicitly markTaken it below). 385 + // ------------------------------------------------------------------------- 386 + 387 + void commitAlloc(Block* block, uint64_t size, uint64_t alignment, 388 + Allocation& out) 389 + { 390 + const bool isNull = (block == m_nullBlock); 391 + 392 + if (!isNull) 393 + removeFreeBlock(block); // transitions to taken 394 + // null block is not in free lists; we markTaken it at the end 395 + 396 + // --- front padding --------------------------------------------------- 397 + 398 + uint64_t alignedOffset = alignUp(block->offset, alignment); 399 + uint64_t padding = alignedOffset - block->offset; 400 + 401 + if (padding > 0) { 402 + Block* prev = block->prevPhys; 403 + 404 + if (prev && prev->isFree()) { 405 + // Grow the existing previous free block into the gap. 406 + // If the size increase crosses a bin boundary, re-bucket it. 
407 + int oldFl, oldSl; 408 + sizeToIndex(prev->size, oldFl, oldSl); 409 + prev->size += padding; 410 + int newFl, newSl; 411 + sizeToIndex(prev->size, newFl, newSl); 412 + 413 + if (oldFl != newFl || oldSl != newSl) { 414 + removeFreeBlock(prev); // marks taken 415 + insertFreeBlock(prev); // marks free again in new bin 416 + } 417 + // else: same bin, in-place size update is sufficient 418 + 419 + } else { 420 + // No usable previous free block — insert a taken gap block. 421 + Block* gap = m_blockPool.alloc(); 422 + *gap = Block{}; 423 + gap->offset = block->offset; 424 + gap->size = padding; 425 + gap->prevPhys = block->prevPhys; 426 + gap->nextPhys = block; 427 + gap->markTaken(); 428 + 429 + if (gap->prevPhys) 430 + gap->prevPhys->nextPhys = gap; 431 + 432 + block->prevPhys = gap; 433 + } 434 + 435 + block->offset = alignedOffset; 436 + block->size -= padding; 437 + } 438 + 439 + // --- tail split ------------------------------------------------------ 440 + 441 + if (block->size > size) { 442 + Block* tail = m_blockPool.alloc(); 443 + *tail = Block{}; 444 + tail->offset = block->offset + size; 445 + tail->size = block->size - size; 446 + tail->prevPhys = block; 447 + tail->nextPhys = block->nextPhys; 448 + 449 + if (tail->nextPhys) 450 + tail->nextPhys->prevPhys = tail; 451 + 452 + block->nextPhys = tail; 453 + block->size = size; 454 + 455 + if (isNull) { 456 + // Tail becomes the new null block. 457 + tail->markFree(); 458 + tail->nextFree() = nullptr; 459 + tail->prevFree() = nullptr; 460 + m_nullBlock = tail; 461 + } else { 462 + // Regular tail — free it. 463 + tail->markTaken(); 464 + insertFreeBlock(tail); 465 + } 466 + 467 + } else if (isNull) { 468 + // Consumed the entire null block — create a zero-size null sentinel. 
469 + Block* newNull = m_blockPool.alloc(); 470 + *newNull = Block{}; 471 + newNull->offset = block->offset + size; 472 + newNull->size = 0; 473 + newNull->prevPhys = block; 474 + newNull->nextPhys = nullptr; 475 + newNull->markFree(); 476 + newNull->nextFree() = nullptr; 477 + newNull->prevFree() = nullptr; 478 + 479 + block->nextPhys = newNull; 480 + m_nullBlock = newNull; 481 + } 482 + 483 + // Finalise the allocated block. 484 + block->markTaken(); 485 + block->userData() = nullptr; 486 + 487 + out.offset = block->offset; 488 + out.size = block->size; 489 + out.userData = nullptr; 490 + out._block = block; 491 + 492 + #ifdef DEBUG 493 + validate(); 494 + #endif 495 + } 496 + 497 + // ------------------------------------------------------------------------- 498 + // Validation helpers 499 + // ------------------------------------------------------------------------- 500 + 501 + void validatePhysicalList() const 502 + { 503 + assert(m_nullBlock); 504 + assert(m_nullBlock->nextPhys == nullptr && "null block must be last"); 505 + 506 + uint64_t total = m_nullBlock->size; 507 + uint64_t expected = m_nullBlock->offset; 508 + 509 + for (const Block* b = m_nullBlock->prevPhys; b; b = b->prevPhys) { 510 + assert(b->size > 0 && "zero-size block in physical list"); 511 + assert(b->offset + b->size == expected && "non-contiguous blocks"); 512 + assert(b->nextPhys->prevPhys == b && "broken prevPhys link"); 513 + expected = b->offset; 514 + total += b->size; 515 + } 516 + 517 + assert(total == m_totalSize && "physical list size mismatch"); 518 + assert(expected == 0 && "physical list does not start at 0"); 519 + } 520 + 521 + void validateFreeLists() const 522 + { 523 + for (int fl = 0; fl < FL_COUNT; ++fl) { 524 + for (int sl = 0; sl < SL_COUNT; ++sl) { 525 + const Block* b = m_freeLists[fl][sl]; 526 + const Block* prev = nullptr; 527 + 528 + while (b) { 529 + assert(b != m_nullBlock && "null block must not be in free lists"); 530 + assert(b->isFree() && "non-free block in 
free list"); 531 + assert(b->prevFree_ == prev && "broken prevFree link"); 532 + 533 + int fl2, sl2; 534 + sizeToIndex(b->size, fl2, sl2); 535 + 536 + if (fl != fl2 || sl != sl2) { 537 + std::cout << "Free list mismatch!\n" 538 + << " Expected bin: (" << fl << ", " << sl << ")\n" 539 + << " Actual bin: (" << fl2 << ", " << sl2 << ")\n" 540 + << " Block size: " << b->size << "\n" 541 + << " Offset: " << b->offset << "\n"; 542 + debugVisual(); 543 + assert(false); 544 + } 545 + 546 + prev = b; 547 + b = b->nextFree_; 548 + } 549 + } 550 + } 551 + } 552 + 553 + void validateBitmaps() const 554 + { 555 + for (int fl = 0; fl < FL_COUNT; ++fl) { 556 + bool flExp = (m_slBitmap[fl] != 0); 557 + bool flAct = (m_flBitmap & (1u << fl)) != 0; 558 + assert(flExp == flAct && "flBitmap inconsistent with slBitmap"); 559 + 560 + for (int sl = 0; sl < SL_COUNT; ++sl) { 561 + bool slExp = (m_freeLists[fl][sl] != nullptr); 562 + bool slAct = (m_slBitmap[fl] & (1u << sl)) != 0; 563 + assert(slExp == slAct && "slBitmap inconsistent with free list"); 564 + } 565 + } 566 + } 567 + };
+20
test/examples/08_tiny_allocations/tiny_allocations.cpp
··· 1 + #include <cstdlib> 2 + #include <unistd.h> 3 + #include <kestrel/kestrel.h> 4 + 5 + #include <stdio.h> 6 + 7 + int main(void) { 8 + auto dev = kes_create(); 9 + 10 + int n = 10; 11 + for (auto i = 0; i < n; ++i) { 12 + auto size = rand() % 32; 13 + auto align = 4; 14 + 15 + auto x = kes_malloc(dev, size, align, KesMemoryDefault); 16 + printf("cpu: %p gpu: %p sz: %d\n", (void *)x.cpu, (void *)x.gpu, x.size); 17 + } 18 + 19 + return 0; 20 + }