
drm/buddy: Improve offset-aligned allocation handling

Large alignment requests previously forced the buddy allocator to search by
alignment order, which often caused higher-order free blocks to be split even
when a suitably aligned smaller region already existed within them. This led
to excessive fragmentation, especially for workloads requesting small sizes
with large alignment constraints.

This change prioritizes the requested allocation size during the search and
uses an augmented RB-tree field (subtree_max_alignment) to efficiently locate
free blocks that satisfy both size and offset-alignment requirements. As a
result, the allocator can directly select an aligned sub-region without
splitting larger blocks unnecessarily.
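
To make the search concrete, here is a small userspace sketch of the idea; it
mirrors the descent in gpu_buddy_find_block_aligned() from the diff below, but
is illustrative only, not the kernel code: struct block, offset_align() and
find_aligned() are hypothetical stand-ins, and __builtin_ctzll() stands in for
__ffs64().

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct block {
	uint64_t offset;                /* block start offset, BST key */
	unsigned int subtree_max_align; /* max offset_align() in this subtree */
	struct block *left, *right;
};

/* Alignment of an offset = trailing zero bits; offset 0 beats everything. */
static unsigned int offset_align(uint64_t offset)
{
	return offset ? (unsigned int)__builtin_ctzll(offset) : 64 + 1;
}

/* Descend only into subtrees whose cached maximum can satisfy the request. */
static struct block *find_aligned(struct block *node, unsigned int align)
{
	while (node) {
		if (node->right && node->right->subtree_max_align >= align) {
			node = node->right;
			continue;
		}
		if (offset_align(node->offset) >= align)
			return node;
		if (node->left && node->left->subtree_max_align >= align) {
			node = node->left;
			continue;
		}
		break; /* cached maxima rule out this whole subtree */
	}
	return NULL;
}

int main(void)
{
	struct block a = { 0x2000, 0, NULL, NULL };   /* ctz = 13 */
	struct block c = { 0x42000, 0, NULL, NULL };  /* ctz = 13 */
	struct block b = { 0x40000, 0, &a, &c };      /* ctz = 18 */

	a.subtree_max_align = offset_align(a.offset);
	c.subtree_max_align = offset_align(c.offset);
	b.subtree_max_align = offset_align(b.offset); /* 18, already the max */

	/* A 2^18 (256 KiB) alignment request lands on the 0x40000 block. */
	struct block *hit = find_aligned(&b, 18);
	printf("hit offset = 0x%llx\n",
	       hit ? (unsigned long long)hit->offset : 0ULL);
	return 0;
}

Because each node caches the maximum alignment found below it, whole subtrees
that cannot satisfy the request are skipped, so the lookup stays logarithmic
in the number of free blocks per order.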

A practical example is the VKCTS test
dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000, which repeatedly
allocates 8 KiB buffers with a 256 KiB alignment. Previously, such allocations
caused large blocks to be split aggressively, despite smaller aligned regions
being sufficient. With this change, those aligned regions are reused directly,
significantly reducing fragmentation.
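
For scale, the numbers behind this test case work out as follows. This is a
back-of-envelope sketch, assuming the minimum 4 KiB chunk size; the order and
alignment math follows the allocator's fls()/ilog2() usage, with GCC builtins
standing in for the kernel helpers.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t size = 8ULL << 10;              /* 8 KiB request */
	uint64_t min_block_size = 256ULL << 10;  /* 256 KiB alignment */
	uint64_t chunk = 4ULL << 10;             /* assumed chunk size */

	/* pages = size / chunk = 2, so order = fls(pages) - 1 = 1 */
	unsigned int order = 63 - __builtin_clzll(size / chunk);
	/* alignment = ilog2(min_block_size) = 18 */
	unsigned int alignment = 63 - __builtin_clzll(min_block_size);

	/* Any free block whose offset has >= 18 trailing zero bits is
	 * directly usable, e.g. 0x1C0000 (7 * 256 KiB). */
	uint64_t offset = 0x1C0000;

	printf("order=%u alignment=%u ctz(offset)=%d\n",
	       order, alignment, __builtin_ctzll(offset));
	return 0;
}

Splitting a fresh 256 KiB block (order 6 with 4 KiB chunks) down to an order-1
allocation leaves one free buddy at each of orders 1 through 5, which is
consistent with where the fragments pile up in the "Before" histogram below.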

This improvement is visible in the amdgpu VRAM buddy allocator state
(/sys/kernel/debug/dri/1/amdgpu_vram_mm). After the change, higher-order blocks
are preserved and the number of low-order fragments is substantially reduced
(from roughly 93,000 free blocks across orders 1-5 to roughly 24,000).

Before:
order- 5 free: 1936 MiB, blocks: 15490
order- 4 free: 967 MiB, blocks: 15486
order- 3 free: 483 MiB, blocks: 15485
order- 2 free: 241 MiB, blocks: 15486
order- 1 free: 241 MiB, blocks: 30948

After:
order- 5 free: 493 MiB, blocks: 3941
order- 4 free: 246 MiB, blocks: 3943
order- 3 free: 123 MiB, blocks: 4101
order- 2 free: 61 MiB, blocks: 4101
order- 1 free: 61 MiB, blocks: 8018

By avoiding unnecessary splits, this change improves allocator efficiency and
helps maintain larger contiguous free regions under heavy offset-aligned
allocation workloads.

v2: (Matthew)
- Update the augmented information along the path to the inserted node.

v3:
- Move the patch to the gpu/buddy.c file.

v4: (Matthew)
- Use the helper instead of calling __ffs directly
- Remove the gpu_buddy_block_order(block) >= order check and drop order
- Drop the !node check, as all callers already handle this
- Return a value larger than any possible alignment for __ffs64(0)
- Replace __ffs with __ffs64

v5: (Matthew)
- Drop the subtree_max_alignment initialization in gpu_block_alloc()

Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patch.msgid.link/20260306060155.2114-1-Arunpravin.PaneerSelvam@amd.com

+230 -46

drivers/gpu/buddy.c (+228 -46)
···
 	return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT;
 }
 
+static unsigned int gpu_buddy_block_offset_alignment(struct gpu_buddy_block *block)
+{
+	u64 offset = gpu_buddy_block_offset(block);
+
+	if (!offset)
+		/*
+		 * __ffs64(0) is undefined; offset 0 is maximally aligned, so return
+		 * a value greater than any possible alignment.
+		 */
+		return 64 + 1;
+
+	return __ffs64(offset);
+}
+
+RB_DECLARE_CALLBACKS_MAX(static, gpu_buddy_augment_cb,
+			 struct gpu_buddy_block, rb,
+			 unsigned int, subtree_max_alignment,
+			 gpu_buddy_block_offset_alignment);
+
 static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm,
 					       struct gpu_buddy_block *parent,
 					       unsigned int order,
···
 	return RB_EMPTY_ROOT(root);
 }
 
-static bool gpu_buddy_block_offset_less(const struct gpu_buddy_block *block,
-					const struct gpu_buddy_block *node)
-{
-	return gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node);
-}
-
-static bool rbtree_block_offset_less(struct rb_node *block,
-				     const struct rb_node *node)
-{
-	return gpu_buddy_block_offset_less(rbtree_get_free_block(block),
-					   rbtree_get_free_block(node));
-}
-
 static void rbtree_insert(struct gpu_buddy *mm,
 			  struct gpu_buddy_block *block,
 			  enum gpu_buddy_free_tree tree)
 {
-	rb_add(&block->rb,
-	       &mm->free_trees[tree][gpu_buddy_block_order(block)],
-	       rbtree_block_offset_less);
+	struct rb_node **link, *parent = NULL;
+	unsigned int block_alignment, order;
+	struct gpu_buddy_block *node;
+	struct rb_root *root;
+
+	order = gpu_buddy_block_order(block);
+	block_alignment = gpu_buddy_block_offset_alignment(block);
+
+	root = &mm->free_trees[tree][order];
+	link = &root->rb_node;
+
+	while (*link) {
+		parent = *link;
+		node = rbtree_get_free_block(parent);
+		/*
+		 * Manual augmentation update during insertion traversal. Required
+		 * because rb_insert_augmented() only calls rotate callback during
+		 * rotations. This ensures all ancestors on the insertion path have
+		 * correct subtree_max_alignment values.
+		 */
+		if (node->subtree_max_alignment < block_alignment)
+			node->subtree_max_alignment = block_alignment;
+
+		if (gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	block->subtree_max_alignment = block_alignment;
+	rb_link_node(&block->rb, parent, link);
+	rb_insert_augmented(&block->rb, root, &gpu_buddy_augment_cb);
 }
 
 static void rbtree_remove(struct gpu_buddy *mm,
···
 	tree = get_block_tree(block);
 	root = &mm->free_trees[tree][order];
 
-	rb_erase(&block->rb, root);
+	rb_erase_augmented(&block->rb, root, &gpu_buddy_augment_cb);
 	RB_CLEAR_NODE(&block->rb);
 }
···
 	return ERR_PTR(err);
 }
 
+static bool
+gpu_buddy_can_offset_align(u64 size, u64 min_block_size)
+{
+	return size < min_block_size && is_power_of_2(size);
+}
+
+static bool gpu_buddy_subtree_can_satisfy(struct rb_node *node,
+					  unsigned int alignment)
+{
+	struct gpu_buddy_block *block;
+
+	block = rbtree_get_free_block(node);
+	return block->subtree_max_alignment >= alignment;
+}
+
+static struct gpu_buddy_block *
+gpu_buddy_find_block_aligned(struct gpu_buddy *mm,
+			     enum gpu_buddy_free_tree tree,
+			     unsigned int order,
+			     unsigned int alignment,
+			     unsigned long flags)
+{
+	struct rb_root *root = &mm->free_trees[tree][order];
+	struct rb_node *rb = root->rb_node;
+
+	while (rb) {
+		struct gpu_buddy_block *block = rbtree_get_free_block(rb);
+		struct rb_node *left_node = rb->rb_left, *right_node = rb->rb_right;
+
+		if (right_node) {
+			if (gpu_buddy_subtree_can_satisfy(right_node, alignment)) {
+				rb = right_node;
+				continue;
+			}
+		}
+
+		if (gpu_buddy_block_offset_alignment(block) >= alignment)
+			return block;
+
+		if (left_node) {
+			if (gpu_buddy_subtree_can_satisfy(left_node, alignment)) {
+				rb = left_node;
+				continue;
+			}
+		}
+
+		break;
+	}
+
+	return NULL;
+}
+
+static struct gpu_buddy_block *
+gpu_buddy_offset_aligned_allocation(struct gpu_buddy *mm,
+				    u64 size,
+				    u64 min_block_size,
+				    unsigned long flags)
+{
+	struct gpu_buddy_block *block = NULL;
+	unsigned int order, tmp, alignment;
+	struct gpu_buddy_block *buddy;
+	enum gpu_buddy_free_tree tree;
+	unsigned long pages;
+	int err;
+
+	alignment = ilog2(min_block_size);
+	pages = size >> ilog2(mm->chunk_size);
+	order = fls(pages) - 1;
+
+	tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ?
+		GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE;
+
+	for (tmp = order; tmp <= mm->max_order; ++tmp) {
+		block = gpu_buddy_find_block_aligned(mm, tree, tmp,
+						     alignment, flags);
+		if (!block) {
+			tree = (tree == GPU_BUDDY_CLEAR_TREE) ?
+				GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE;
+			block = gpu_buddy_find_block_aligned(mm, tree, tmp,
+							     alignment, flags);
+		}
+
+		if (block)
+			break;
+	}
+
+	if (!block)
+		return ERR_PTR(-ENOSPC);
+
+	while (gpu_buddy_block_order(block) > order) {
+		struct gpu_buddy_block *left, *right;
+
+		err = split_block(mm, block);
+		if (unlikely(err))
+			goto err_undo;
+
+		left = block->left;
+		right = block->right;
+
+		if (gpu_buddy_block_offset_alignment(right) >= alignment)
+			block = right;
+		else
+			block = left;
+	}
+
+	return block;
+
+err_undo:
+	/*
+	 * We really don't want to leave around a bunch of split blocks, since
+	 * bigger is better, so make sure we merge everything back before we
+	 * free the allocated blocks.
+	 */
+	buddy = __get_buddy(block);
+	if (buddy &&
+	    (gpu_buddy_block_is_free(block) &&
+	     gpu_buddy_block_is_free(buddy)))
+		__gpu_buddy_free(mm, block, false);
+	return ERR_PTR(err);
+}
+
 static int __alloc_range(struct gpu_buddy *mm,
 			 struct list_head *dfs,
 			 u64 start, u64 size,
···
 static struct gpu_buddy_block *
 __gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
 			 u64 start, u64 end,
+			 u64 size, u64 min_block_size,
 			 unsigned int order,
 			 unsigned long flags)
 {
···
 		/* Allocate traversing within the range */
 		return __gpu_buddy_alloc_range_bias(mm, start, end,
 						    order, flags);
+	else if (size < min_block_size)
+		/* Allocate from an offset-aligned region without size rounding */
+		return gpu_buddy_offset_aligned_allocation(mm, size,
+							   min_block_size,
+							   flags);
 	else
 		/* Allocate from freetree */
 		return alloc_from_freetree(mm, order, flags);
···
 	if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) {
 		size = roundup_pow_of_two(size);
 		min_block_size = size;
-	/* Align size value to min_block_size */
-	} else if (!IS_ALIGNED(size, min_block_size)) {
+	/*
+	 * Normalize the requested size to min_block_size for regular allocations.
+	 * Offset-aligned allocations intentionally skip size rounding.
+	 */
+	} else if (!gpu_buddy_can_offset_align(size, min_block_size)) {
 		size = round_up(size, min_block_size);
 	}
···
 	do {
 		order = min(order, (unsigned int)fls(pages) - 1);
 		BUG_ON(order > mm->max_order);
-		BUG_ON(order < min_order);
+		/*
+		 * Regular allocations must not allocate blocks smaller than
+		 * min_block_size. Offset-aligned allocations deliberately
+		 * bypass this constraint.
+		 */
+		BUG_ON(size >= min_block_size && order < min_order);
 
 		do {
+			unsigned int fallback_order;
+
 			block = __gpu_buddy_alloc_blocks(mm, start,
 							 end,
+							 size,
+							 min_block_size,
 							 order,
 							 flags);
 			if (!IS_ERR(block))
 				break;
 
-			if (order-- == min_order) {
-				/* Try allocation through force merge method */
-				if (mm->clear_avail &&
-				    !__force_merge(mm, start, end, min_order)) {
-					block = __gpu_buddy_alloc_blocks(mm, start,
-									 end,
-									 min_order,
-									 flags);
-					if (!IS_ERR(block)) {
-						order = min_order;
-						break;
-					}
-				}
-
-				/*
-				 * Try contiguous block allocation through
-				 * try harder method.
-				 */
-				if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION &&
-				    !(flags & GPU_BUDDY_RANGE_ALLOCATION))
-					return __alloc_contig_try_harder(mm,
-									 original_size,
-									 original_min_size,
-									 blocks);
-				err = -ENOSPC;
-				goto err_free;
+			if (size < min_block_size) {
+				fallback_order = order;
+			} else if (order == min_order) {
+				fallback_order = min_order;
+			} else {
+				order--;
+				continue;
 			}
+
+			/* Try allocation through force merge method */
+			if (mm->clear_avail &&
+			    !__force_merge(mm, start, end, fallback_order)) {
+				block = __gpu_buddy_alloc_blocks(mm, start,
+								 end,
+								 size,
+								 min_block_size,
+								 fallback_order,
+								 flags);
+				if (!IS_ERR(block)) {
+					order = fallback_order;
+					break;
+				}
+			}
+
+			/*
+			 * Try contiguous block allocation through
+			 * try harder method.
+			 */
+			if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION &&
+			    !(flags & GPU_BUDDY_RANGE_ALLOCATION))
+				return __alloc_contig_try_harder(mm,
+								 original_size,
+								 original_min_size,
+								 blocks);
+			err = -ENOSPC;
+			goto err_free;
 		} while (1);
 
 		mark_allocated(mm, block);
include/linux/gpu_buddy.h (+2)

···
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
 
 /**
  * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range
···
 	};
 	/* private: */
 	struct list_head tmp_link;
+	unsigned int subtree_max_alignment;
 };
 
 /* Order-zero must be at least SZ_4K */