Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
"18 fixes"

[ The 18 fixes turned into 17 commits, because one of the fixes was a
fix for another patch in the series that I just folded in by editing
the patch manually - hopefully correctly - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm: fix memory leak in copy_huge_pmd()
drivers/hwspinlock: fix race between radix tree insertion and lookup
radix-tree: fix race in gang lookup
mm/vmpressure.c: fix subtree pressure detection
mm: polish virtual memory accounting
mm: warn about VmData over RLIMIT_DATA
Documentation: cgroup-v2: add memory.stat::sock description
mm: memcontrol: drop superfluous entry in the per-memcg stats array
drivers/scsi/sg.c: mark VMA as VM_IO to prevent migration
proc: revert /proc/<pid>/maps [stack:TID] annotation
numa: fix /proc/<pid>/numa_maps for hugetlbfs on s390
MAINTAINERS: update Seth email
ocfs2/cluster: fix memory leak in o2hb_region_release
lib/test-string_helpers.c: fix and improve string_get_size() tests
thp: limit number of object to scan on deferred_split_scan()
thp: change deferred_split_count() to return number of THP in queue
thp: make split_queue per-node

+268 -192
+4
Documentation/cgroup-v2.txt
··· 843 843 Amount of memory used to cache filesystem data, 844 844 including tmpfs and shared memory. 845 845 846 + sock 847 + 848 + Amount of memory used in network transmission buffers 849 + 846 850 file_mapped 847 851 848 852 Amount of cached filesystem data mapped with mmap()
+5 -8
Documentation/filesystems/proc.txt
··· 240 240 RssFile size of resident file mappings 241 241 RssShmem size of resident shmem memory (includes SysV shm, 242 242 mapping of tmpfs and shared anonymous mappings) 243 - VmData size of data, stack, and text segments 244 - VmStk size of data, stack, and text segments 243 + VmData size of private data segments 244 + VmStk size of stack segments 245 245 VmExe size of text segment 246 246 VmLib size of shared library code 247 247 VmPTE size of page table entries ··· 356 356 a7cb1000-a7cb2000 ---p 00000000 00:00 0 357 357 a7cb2000-a7eb2000 rw-p 00000000 00:00 0 358 358 a7eb2000-a7eb3000 ---p 00000000 00:00 0 359 - a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001] 359 + a7eb3000-a7ed5000 rw-p 00000000 00:00 0 360 360 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 361 361 a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 362 362 a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 ··· 388 388 389 389 [heap] = the heap of the program 390 390 [stack] = the stack of the main process 391 - [stack:1001] = the stack of the thread with tid 1001 392 391 [vdso] = the "virtual dynamic shared object", 393 392 the kernel system call handler 394 393 ··· 395 396 396 397 The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint 397 398 of the individual tasks of a process. In this file you will see a mapping marked 398 - as [stack] if that task sees it as a stack. This is a key difference from the 399 - content of /proc/PID/maps, where you will see all mappings that are being used 400 - as stack by all of those tasks. Hence, for the example above, the task-level 401 - map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: 399 + as [stack] if that task sees it as a stack. Hence, for the example above, the 400 + task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: 402 401 403 402 08048000-08049000 r-xp 00000000 03:00 8312 /opt/test 404 403 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
+5
Documentation/kernel-parameters.txt
··· 1496 1496 could change it dynamically, usually by 1497 1497 /sys/module/printk/parameters/ignore_loglevel. 1498 1498 1499 + ignore_rlimit_data 1500 + Ignore RLIMIT_DATA setting for data mappings, 1501 + print warning at first misuse. Can be changed via 1502 + /sys/module/kernel/parameters/ignore_rlimit_data. 1503 + 1499 1504 ihash_entries= [KNL] 1500 1505 Set number of hash buckets for inode cache. 1501 1506
+2 -2
MAINTAINERS
··· 12150 12150 F: drivers/net/hamradio/z8530.h 12151 12151 12152 12152 ZBUD COMPRESSED PAGE ALLOCATOR 12153 - M: Seth Jennings <sjennings@variantweb.net> 12153 + M: Seth Jennings <sjenning@redhat.com> 12154 12154 L: linux-mm@kvack.org 12155 12155 S: Maintained 12156 12156 F: mm/zbud.c ··· 12205 12205 F: Documentation/vm/zsmalloc.txt 12206 12206 12207 12207 ZSWAP COMPRESSED SWAP CACHING 12208 - M: Seth Jennings <sjennings@variantweb.net> 12208 + M: Seth Jennings <sjenning@redhat.com> 12209 12209 L: linux-mm@kvack.org 12210 12210 S: Maintained 12211 12211 F: mm/zswap.c
+4
drivers/hwspinlock/hwspinlock_core.c
··· 313 313 hwlock = radix_tree_deref_slot(slot); 314 314 if (unlikely(!hwlock)) 315 315 continue; 316 + if (radix_tree_is_indirect_ptr(hwlock)) { 317 + slot = radix_tree_iter_retry(&iter); 318 + continue; 319 + } 316 320 317 321 if (hwlock->bank->dev->of_node == args.np) { 318 322 ret = 0;
+1 -1
drivers/scsi/sg.c
··· 1261 1261 } 1262 1262 1263 1263 sfp->mmap_called = 1; 1264 - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 1264 + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; 1265 1265 vma->vm_private_data = sfp; 1266 1266 vma->vm_ops = &sg_mmap_vm_ops; 1267 1267 return 0;
+8 -6
fs/ocfs2/cluster/heartbeat.c
··· 1254 1254 1255 1255 void o2hb_exit(void) 1256 1256 { 1257 - kfree(o2hb_db_livenodes); 1258 - kfree(o2hb_db_liveregions); 1259 - kfree(o2hb_db_quorumregions); 1260 - kfree(o2hb_db_failedregions); 1261 1257 debugfs_remove(o2hb_debug_failedregions); 1262 1258 debugfs_remove(o2hb_debug_quorumregions); 1263 1259 debugfs_remove(o2hb_debug_liveregions); 1264 1260 debugfs_remove(o2hb_debug_livenodes); 1265 1261 debugfs_remove(o2hb_debug_dir); 1262 + kfree(o2hb_db_livenodes); 1263 + kfree(o2hb_db_liveregions); 1264 + kfree(o2hb_db_quorumregions); 1265 + kfree(o2hb_db_failedregions); 1266 1266 } 1267 1267 1268 1268 static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, ··· 1438 1438 1439 1439 kfree(reg->hr_slots); 1440 1440 1441 - kfree(reg->hr_db_regnum); 1442 - kfree(reg->hr_db_livenodes); 1443 1441 debugfs_remove(reg->hr_debug_livenodes); 1444 1442 debugfs_remove(reg->hr_debug_regnum); 1445 1443 debugfs_remove(reg->hr_debug_elapsed_time); 1446 1444 debugfs_remove(reg->hr_debug_pinned); 1447 1445 debugfs_remove(reg->hr_debug_dir); 1446 + kfree(reg->hr_db_livenodes); 1447 + kfree(reg->hr_db_regnum); 1448 + kfree(reg->hr_debug_elapsed_time); 1449 + kfree(reg->hr_debug_pinned); 1448 1450 1449 1451 spin_lock(&o2hb_live_lock); 1450 1452 list_del(&reg->hr_all_item);
+27 -46
fs/proc/task_mmu.c
··· 259 259 sizeof(struct proc_maps_private)); 260 260 } 261 261 262 - static pid_t pid_of_stack(struct proc_maps_private *priv, 263 - struct vm_area_struct *vma, bool is_pid) 262 + /* 263 + * Indicate if the VMA is a stack for the given task; for 264 + * /proc/PID/maps that is the stack of the main task. 265 + */ 266 + static int is_stack(struct proc_maps_private *priv, 267 + struct vm_area_struct *vma, int is_pid) 264 268 { 265 - struct inode *inode = priv->inode; 266 - struct task_struct *task; 267 - pid_t ret = 0; 269 + int stack = 0; 268 270 269 - rcu_read_lock(); 270 - task = pid_task(proc_pid(inode), PIDTYPE_PID); 271 - if (task) { 272 - task = task_of_stack(task, vma, is_pid); 271 + if (is_pid) { 272 + stack = vma->vm_start <= vma->vm_mm->start_stack && 273 + vma->vm_end >= vma->vm_mm->start_stack; 274 + } else { 275 + struct inode *inode = priv->inode; 276 + struct task_struct *task; 277 + 278 + rcu_read_lock(); 279 + task = pid_task(proc_pid(inode), PIDTYPE_PID); 273 280 if (task) 274 - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); 281 + stack = vma_is_stack_for_task(vma, task); 282 + rcu_read_unlock(); 275 283 } 276 - rcu_read_unlock(); 277 - 278 - return ret; 284 + return stack; 279 285 } 280 286 281 287 static void ··· 341 335 342 336 name = arch_vma_name(vma); 343 337 if (!name) { 344 - pid_t tid; 345 - 346 338 if (!mm) { 347 339 name = "[vdso]"; 348 340 goto done; ··· 352 348 goto done; 353 349 } 354 350 355 - tid = pid_of_stack(priv, vma, is_pid); 356 - if (tid != 0) { 357 - /* 358 - * Thread stack in /proc/PID/task/TID/maps or 359 - * the main process stack. 360 - */ 361 - if (!is_pid || (vma->vm_start <= mm->start_stack && 362 - vma->vm_end >= mm->start_stack)) { 363 - name = "[stack]"; 364 - } else { 365 - /* Thread stack in /proc/PID/maps */ 366 - seq_pad(m, ' '); 367 - seq_printf(m, "[stack:%d]", tid); 368 - } 369 - } 351 + if (is_stack(priv, vma, is_pid)) 352 + name = "[stack]"; 370 353 } 371 354 372 355 done: ··· 1543 1552 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, 1544 1553 unsigned long addr, unsigned long end, struct mm_walk *walk) 1545 1554 { 1555 + pte_t huge_pte = huge_ptep_get(pte); 1546 1556 struct numa_maps *md; 1547 1557 struct page *page; 1548 1558 1549 - if (!pte_present(*pte)) 1559 + if (!pte_present(huge_pte)) 1550 1560 return 0; 1551 1561 1552 - page = pte_page(*pte); 1562 + page = pte_page(huge_pte); 1553 1563 if (!page) 1554 1564 return 0; 1555 1565 1556 1566 md = walk->private; 1557 - gather_stats(page, md, pte_dirty(*pte), 1); 1567 + gather_stats(page, md, pte_dirty(huge_pte), 1); 1558 1568 return 0; 1559 1569 } 1560 1570 ··· 1609 1617 seq_file_path(m, file, "\n\t= "); 1610 1618 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1611 1619 seq_puts(m, " heap"); 1612 - } else { 1613 - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); 1614 - if (tid != 0) { 1615 - /* 1616 - * Thread stack in /proc/PID/task/TID/maps or 1617 - * the main process stack. 1618 - */ 1619 - if (!is_pid || (vma->vm_start <= mm->start_stack && 1620 - vma->vm_end >= mm->start_stack)) 1621 - seq_puts(m, " stack"); 1622 - else 1623 - seq_printf(m, " stack:%d", tid); 1624 - } 1620 + } else if (is_stack(proc_priv, vma, is_pid)) { 1621 + seq_puts(m, " stack"); 1625 1622 } 1626 1623 1627 1624 if (is_vm_hugetlb_page(vma))
+19 -28
fs/proc/task_nommu.c
··· 123 123 return size; 124 124 } 125 125 126 - static pid_t pid_of_stack(struct proc_maps_private *priv, 127 - struct vm_area_struct *vma, bool is_pid) 126 + static int is_stack(struct proc_maps_private *priv, 127 + struct vm_area_struct *vma, int is_pid) 128 128 { 129 - struct inode *inode = priv->inode; 130 - struct task_struct *task; 131 - pid_t ret = 0; 129 + struct mm_struct *mm = vma->vm_mm; 130 + int stack = 0; 132 131 133 - rcu_read_lock(); 134 - task = pid_task(proc_pid(inode), PIDTYPE_PID); 135 - if (task) { 136 - task = task_of_stack(task, vma, is_pid); 132 + if (is_pid) { 133 + stack = vma->vm_start <= mm->start_stack && 134 + vma->vm_end >= mm->start_stack; 135 + } else { 136 + struct inode *inode = priv->inode; 137 + struct task_struct *task; 138 + 139 + rcu_read_lock(); 140 + task = pid_task(proc_pid(inode), PIDTYPE_PID); 137 141 if (task) 138 - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); 142 + stack = vma_is_stack_for_task(vma, task); 143 + rcu_read_unlock(); 139 144 } 140 - rcu_read_unlock(); 141 - 142 - return ret; 145 + return stack; 143 146 } 144 147 145 148 /* ··· 184 181 if (file) { 185 182 seq_pad(m, ' '); 186 183 seq_file_path(m, file, ""); 187 - } else if (mm) { 188 - pid_t tid = pid_of_stack(priv, vma, is_pid); 189 - 190 - if (tid != 0) { 191 - seq_pad(m, ' '); 192 - /* 193 - * Thread stack in /proc/PID/task/TID/maps or 194 - * the main process stack. 195 - */ 196 - if (!is_pid || (vma->vm_start <= mm->start_stack && 197 - vma->vm_end >= mm->start_stack)) 198 - seq_printf(m, "[stack]"); 199 - else 200 - seq_printf(m, "[stack:%d]", tid); 201 - } 184 + } else if (mm && is_stack(priv, vma, is_pid)) { 185 + seq_pad(m, ' '); 186 + seq_printf(m, "[stack]"); 202 187 } 203 188 204 189 seq_putc(m, '\n');
+1 -1
include/linux/memcontrol.h
··· 51 51 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 52 52 MEM_CGROUP_STAT_NSTATS, 53 53 /* default hierarchy stats */ 54 - MEMCG_SOCK, 54 + MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, 55 55 MEMCG_NR_STAT, 56 56 }; 57 57
+5 -4
include/linux/mm.h
··· 201 201 #endif 202 202 203 203 #ifdef CONFIG_STACK_GROWSUP 204 - #define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) 204 + #define VM_STACK VM_GROWSUP 205 205 #else 206 - #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) 206 + #define VM_STACK VM_GROWSDOWN 207 207 #endif 208 + 209 + #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) 208 210 209 211 /* 210 212 * Special vmas that are non-mergable, non-mlock()able. ··· 1343 1341 !vma_growsup(vma->vm_next, addr); 1344 1342 } 1345 1343 1346 - extern struct task_struct *task_of_stack(struct task_struct *task, 1347 - struct vm_area_struct *vma, bool in_group); 1344 + int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t); 1348 1345 1349 1346 extern unsigned long move_page_tables(struct vm_area_struct *vma, 1350 1347 unsigned long old_addr, struct vm_area_struct *new_vma,
+3 -3
include/linux/mm_types.h
··· 424 424 unsigned long total_vm; /* Total pages mapped */ 425 425 unsigned long locked_vm; /* Pages that have PG_mlocked set */ 426 426 unsigned long pinned_vm; /* Refcount permanently increased */ 427 - unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */ 428 - unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ 429 - unsigned long stack_vm; /* VM_GROWSUP/DOWN */ 427 + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ 428 + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ 429 + unsigned long stack_vm; /* VM_STACK */ 430 430 unsigned long def_flags; 431 431 unsigned long start_code, end_code, start_data, end_data; 432 432 unsigned long start_brk, brk, start_stack;
+6
include/linux/mmzone.h
··· 682 682 */ 683 683 unsigned long first_deferred_pfn; 684 684 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 685 + 686 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 687 + spinlock_t split_queue_lock; 688 + struct list_head split_queue; 689 + unsigned long split_queue_len; 690 + #endif 685 691 } pg_data_t; 686 692 687 693 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
+16
include/linux/radix-tree.h
··· 379 379 struct radix_tree_iter *iter, unsigned flags); 380 380 381 381 /** 382 + * radix_tree_iter_retry - retry this chunk of the iteration 383 + * @iter: iterator state 384 + * 385 + * If we iterate over a tree protected only by the RCU lock, a race 386 + * against deletion or creation may result in seeing a slot for which 387 + * radix_tree_deref_retry() returns true. If so, call this function 388 + * and continue the iteration. 389 + */ 390 + static inline __must_check 391 + void **radix_tree_iter_retry(struct radix_tree_iter *iter) 392 + { 393 + iter->next_index = iter->index; 394 + return NULL; 395 + } 396 + 397 + /** 382 398 * radix_tree_chunk_size - get current chunk size 383 399 * 384 400 * @iter: pointer to radix tree iterator
+10 -2
lib/radix-tree.c
··· 1019 1019 return 0; 1020 1020 1021 1021 radix_tree_for_each_slot(slot, root, &iter, first_index) { 1022 - results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); 1022 + results[ret] = rcu_dereference_raw(*slot); 1023 1023 if (!results[ret]) 1024 1024 continue; 1025 + if (radix_tree_is_indirect_ptr(results[ret])) { 1026 + slot = radix_tree_iter_retry(&iter); 1027 + continue; 1028 + } 1025 1029 if (++ret == max_items) 1026 1030 break; 1027 1031 } ··· 1102 1098 return 0; 1103 1099 1104 1100 radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { 1105 - results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); 1101 + results[ret] = rcu_dereference_raw(*slot); 1106 1102 if (!results[ret]) 1107 1103 continue; 1104 + if (radix_tree_is_indirect_ptr(results[ret])) { 1105 + slot = radix_tree_iter_retry(&iter); 1106 + continue; 1107 + } 1108 1108 if (++ret == max_items) 1109 1109 break; 1110 1110 }
+49 -18
lib/test-string_helpers.c
··· 327 327 } 328 328 329 329 #define string_get_size_maxbuf 16 330 - #define test_string_get_size_one(size, blk_size, units, exp_result) \ 330 + #define test_string_get_size_one(size, blk_size, exp_result10, exp_result2) \ 331 331 do { \ 332 - BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf); \ 333 - __test_string_get_size((size), (blk_size), (units), \ 334 - (exp_result)); \ 332 + BUILD_BUG_ON(sizeof(exp_result10) >= string_get_size_maxbuf); \ 333 + BUILD_BUG_ON(sizeof(exp_result2) >= string_get_size_maxbuf); \ 334 + __test_string_get_size((size), (blk_size), (exp_result10), \ 335 + (exp_result2)); \ 335 336 } while (0) 336 337 337 338 338 - static __init void __test_string_get_size(const u64 size, const u64 blk_size, 339 - const enum string_size_units units, 340 - const char *exp_result) 339 + static __init void test_string_get_size_check(const char *units, 340 + const char *exp, 341 + char *res, 342 + const u64 size, 343 + const u64 blk_size) 341 344 { 342 - char buf[string_get_size_maxbuf]; 343 - 344 - string_get_size(size, blk_size, units, buf, sizeof(buf)); 345 - if (!memcmp(buf, exp_result, strlen(exp_result) + 1)) 345 + if (!memcmp(res, exp, strlen(exp) + 1)) 346 346 return; 347 347 348 - buf[sizeof(buf) - 1] = '\0'; 349 - pr_warn("Test 'test_string_get_size_one' failed!\n"); 350 - pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n", 348 + res[string_get_size_maxbuf - 1] = '\0'; 349 + 350 + pr_warn("Test 'test_string_get_size' failed!\n"); 351 + pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %s)\n", 351 352 size, blk_size, units); 352 - pr_warn("expected: '%s', got '%s'\n", exp_result, buf); 353 + pr_warn("expected: '%s', got '%s'\n", exp, res); 354 + } 355 + 356 + static __init void __test_string_get_size(const u64 size, const u64 blk_size, 357 + const char *exp_result10, 358 + const char *exp_result2) 359 + { 360 + char buf10[string_get_size_maxbuf]; 361 + char buf2[string_get_size_maxbuf]; 362 + 363 + string_get_size(size, blk_size, STRING_UNITS_10, buf10, sizeof(buf10)); 364 + string_get_size(size, blk_size, STRING_UNITS_2, buf2, sizeof(buf2)); 365 + 366 + test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10, 367 + size, blk_size); 368 + 369 + test_string_get_size_check("STRING_UNITS_2", exp_result2, buf2, 370 + size, blk_size); 353 371 } 354 372 355 373 static __init void test_string_get_size(void) 356 374 { 357 - test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB"); 358 - test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB"); 359 - test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B"); 375 + /* small values */ 376 + test_string_get_size_one(0, 512, "0 B", "0 B"); 377 + test_string_get_size_one(1, 512, "512 B", "512 B"); 378 + test_string_get_size_one(1100, 1, "1.10 kB", "1.07 KiB"); 379 + 380 + /* normal values */ 381 + test_string_get_size_one(16384, 512, "8.39 MB", "8.00 MiB"); 382 + test_string_get_size_one(500118192, 512, "256 GB", "238 GiB"); 383 + test_string_get_size_one(8192, 4096, "33.6 MB", "32.0 MiB"); 384 + 385 + /* weird block sizes */ 386 + test_string_get_size_one(3000, 1900, "5.70 MB", "5.44 MiB"); 387 + 388 + /* huge values */ 389 + test_string_get_size_one(U64_MAX, 4096, "75.6 ZB", "64.0 ZiB"); 390 + test_string_get_size_one(4096, U64_MAX, "75.6 ZB", "64.0 ZiB"); 360 391 } 361 392 362 393 static int __init test_string_helpers_init(void)
+48 -39
mm/huge_memory.c
··· 138 138 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 139 139 }; 140 140 141 - static DEFINE_SPINLOCK(split_queue_lock); 142 - static LIST_HEAD(split_queue); 143 - static unsigned long split_queue_len; 144 141 static struct shrinker deferred_split_shrinker; 145 142 146 143 static void set_recommended_min_free_kbytes(void) ··· 858 861 return false; 859 862 entry = mk_pmd(zero_page, vma->vm_page_prot); 860 863 entry = pmd_mkhuge(entry); 861 - pgtable_trans_huge_deposit(mm, pmd, pgtable); 864 + if (pgtable) 865 + pgtable_trans_huge_deposit(mm, pmd, pgtable); 862 866 set_pmd_at(mm, haddr, pmd, entry); 863 867 atomic_long_inc(&mm->nr_ptes); 864 868 return true; ··· 1037 1039 spinlock_t *dst_ptl, *src_ptl; 1038 1040 struct page *src_page; 1039 1041 pmd_t pmd; 1040 - pgtable_t pgtable; 1042 + pgtable_t pgtable = NULL; 1041 1043 int ret; 1042 1044 1043 - ret = -ENOMEM; 1044 - pgtable = pte_alloc_one(dst_mm, addr); 1045 - if (unlikely(!pgtable)) 1046 - goto out; 1045 + if (!vma_is_dax(vma)) { 1046 + ret = -ENOMEM; 1047 + pgtable = pte_alloc_one(dst_mm, addr); 1048 + if (unlikely(!pgtable)) 1049 + goto out; 1050 + } 1047 1051 1048 1052 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1049 1053 src_ptl = pmd_lockptr(src_mm, src_pmd); ··· 1076 1076 goto out_unlock; 1077 1077 } 1078 1078 1079 - if (pmd_trans_huge(pmd)) { 1079 + if (!vma_is_dax(vma)) { 1080 1080 /* thp accounting separate from pmd_devmap accounting */ 1081 1081 src_page = pmd_page(pmd); 1082 1082 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); ··· 3358 3358 int split_huge_page_to_list(struct page *page, struct list_head *list) 3359 3359 { 3360 3360 struct page *head = compound_head(page); 3361 + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 3361 3362 struct anon_vma *anon_vma; 3362 3363 int count, mapcount, ret; 3363 3364 bool mlocked; ··· 3402 3401 lru_add_drain(); 3403 3402 3404 3403 /* Prevent deferred_split_scan() touching ->_count */ 3405 - spin_lock_irqsave(&split_queue_lock, flags); 3404 + spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3406 3405 count = page_count(head); 3407 3406 mapcount = total_mapcount(head); 3408 3407 if (!mapcount && count == 1) { 3409 3408 if (!list_empty(page_deferred_list(head))) { 3410 - split_queue_len--; 3409 + pgdata->split_queue_len--; 3411 3410 list_del(page_deferred_list(head)); 3412 3411 } 3413 - spin_unlock_irqrestore(&split_queue_lock, flags); 3412 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3414 3413 __split_huge_page(page, list); 3415 3414 ret = 0; 3416 3415 } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 3417 - spin_unlock_irqrestore(&split_queue_lock, flags); 3416 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3418 3417 pr_alert("total_mapcount: %u, page_count(): %u\n", 3419 3418 mapcount, count); 3420 3419 if (PageTail(page)) ··· 3422 3421 dump_page(page, "total_mapcount(head) > 0"); 3423 3422 BUG(); 3424 3423 } else { 3425 - spin_unlock_irqrestore(&split_queue_lock, flags); 3424 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3426 3425 unfreeze_page(anon_vma, head); 3427 3426 ret = -EBUSY; 3428 3427 } ··· 3437 3436 3438 3437 void free_transhuge_page(struct page *page) 3439 3438 { 3439 + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3440 3440 unsigned long flags; 3441 3441 3442 - spin_lock_irqsave(&split_queue_lock, flags); 3442 + spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3443 3443 if (!list_empty(page_deferred_list(page))) { 3444 - split_queue_len--; 3444 + pgdata->split_queue_len--; 3445 3445 list_del(page_deferred_list(page)); 3446 3446 } 3447 - spin_unlock_irqrestore(&split_queue_lock, flags); 3447 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3448 3448 free_compound_page(page); 3449 3449 } 3450 3450 3451 3451 void deferred_split_huge_page(struct page *page) 3452 3452 { 3453 + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3453 3454 unsigned long flags; 3454 3455 3455 3456 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3456 3457 3457 - spin_lock_irqsave(&split_queue_lock, flags); 3458 + spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3458 3459 if (list_empty(page_deferred_list(page))) { 3459 - list_add_tail(page_deferred_list(page), &split_queue); 3460 - split_queue_len++; 3460 + list_add_tail(page_deferred_list(page), &pgdata->split_queue); 3461 + pgdata->split_queue_len++; 3461 3462 } 3462 - spin_unlock_irqrestore(&split_queue_lock, flags); 3463 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3463 3464 } 3464 3465 3465 3466 static unsigned long deferred_split_count(struct shrinker *shrink, 3466 3467 struct shrink_control *sc) 3467 3468 { 3468 - /* 3469 - * Split a page from split_queue will free up at least one page, 3470 - * at most HPAGE_PMD_NR - 1. We don't track exact number. 3471 - * Let's use HPAGE_PMD_NR / 2 as ballpark. 3472 - */ 3473 - return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; 3469 + struct pglist_data *pgdata = NODE_DATA(sc->nid); 3470 + return ACCESS_ONCE(pgdata->split_queue_len); 3474 3471 } 3475 3472 3476 3473 static unsigned long deferred_split_scan(struct shrinker *shrink, 3477 3474 struct shrink_control *sc) 3478 3475 { 3476 + struct pglist_data *pgdata = NODE_DATA(sc->nid); 3479 3477 unsigned long flags; 3480 3478 LIST_HEAD(list), *pos, *next; 3481 3479 struct page *page; 3482 3480 int split = 0; 3483 3481 3484 - spin_lock_irqsave(&split_queue_lock, flags); 3485 - list_splice_init(&split_queue, &list); 3486 - 3482 + spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3487 3483 /* Take pin on all head pages to avoid freeing them under us */ 3488 3484 list_for_each_safe(pos, next, &list) { 3489 3485 page = list_entry((void *)pos, struct page, mapping); 3490 3486 page = compound_head(page); 3491 - /* race with put_compound_page() */ 3492 - if (!get_page_unless_zero(page)) { 3487 + if (get_page_unless_zero(page)) { 3488 + list_move(page_deferred_list(page), &list); 3489 + } else { 3490 + /* We lost race with put_compound_page() */ 3493 3491 list_del_init(page_deferred_list(page)); 3494 - split_queue_len--; 3492 + pgdata->split_queue_len--; 3495 3493 } 3494 + if (!--sc->nr_to_scan) 3495 + break; 3496 3496 } 3497 - spin_unlock_irqrestore(&split_queue_lock, flags); 3497 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3498 3498 3499 3499 list_for_each_safe(pos, next, &list) { 3500 3500 page = list_entry((void *)pos, struct page, mapping); ··· 3507 3505 put_page(page); 3508 3506 } 3509 3507 3510 - spin_lock_irqsave(&split_queue_lock, flags); 3511 - list_splice_tail(&list, &split_queue); 3512 - spin_unlock_irqrestore(&split_queue_lock, flags); 3508 + spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3509 + list_splice_tail(&list, &pgdata->split_queue); 3510 + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3513 3511 3514 - return split * HPAGE_PMD_NR / 2; 3512 + /* 3513 + * Stop shrinker if we didn't split any page, but the queue is empty. 3514 + * This can happen if pages were freed under us. 3515 + */ 3516 + if (!split && list_empty(&pgdata->split_queue)) 3517 + return SHRINK_STOP; 3518 + return split; 3515 3519 } 3516 3520 3517 3521 static struct shrinker deferred_split_shrinker = { 3518 3522 .count_objects = deferred_split_count, 3519 3523 .scan_objects = deferred_split_scan, 3520 3524 .seeks = DEFAULT_SEEKS, 3525 + .flags = SHRINKER_NUMA_AWARE, 3521 3526 }; 3522 3527 3523 3528 #ifdef CONFIG_DEBUG_FS
+31
mm/internal.h
··· 216 216 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 217 217 } 218 218 219 + /* 220 + * These three helpers classifies VMAs for virtual memory accounting. 221 + */ 222 + 223 + /* 224 + * Executable code area - executable, not writable, not stack 225 + */ 226 + static inline bool is_exec_mapping(vm_flags_t flags) 227 + { 228 + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; 229 + } 230 + 231 + /* 232 + * Stack area - atomatically grows in one direction 233 + * 234 + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: 235 + * do_mmap() forbids all other combinations. 236 + */ 237 + static inline bool is_stack_mapping(vm_flags_t flags) 238 + { 239 + return (flags & VM_STACK) == VM_STACK; 240 + } 241 + 242 + /* 243 + * Data area - private, writable, not stack 244 + */ 245 + static inline bool is_data_mapping(vm_flags_t flags) 246 + { 247 + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; 248 + } 249 + 219 250 /* mm/util.c */ 220 251 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 221 252 struct vm_area_struct *prev, struct rb_node *rb_parent);
+17 -6
mm/mmap.c
··· 42 42 #include <linux/memory.h> 43 43 #include <linux/printk.h> 44 44 #include <linux/userfaultfd_k.h> 45 + #include <linux/moduleparam.h> 45 46 46 47 #include <asm/uaccess.h> 47 48 #include <asm/cacheflush.h> ··· 70 69 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; 71 70 #endif 72 71 72 + static bool ignore_rlimit_data = true; 73 + core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); 73 74 74 75 static void unmap_region(struct mm_struct *mm, 75 76 struct vm_area_struct *vma, struct vm_area_struct *prev, ··· 2985 2982 if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) 2986 2983 return false; 2987 2984 2988 - if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS & 2989 - (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE) 2990 - return mm->data_vm + npages <= rlimit(RLIMIT_DATA); 2985 + if (is_data_mapping(flags) && 2986 + mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { 2987 + if (ignore_rlimit_data) 2988 + pr_warn_once("%s (%d): VmData %lu exceed data ulimit " 2989 + "%lu. Will be forbidden soon.\n", 2990 + current->comm, current->pid, 2991 + (mm->data_vm + npages) << PAGE_SHIFT, 2992 + rlimit(RLIMIT_DATA)); 2993 + else 2994 + return false; 2995 + } 2991 2996 2992 2997 return true; 2993 2998 } ··· 3004 2993 { 3005 2994 mm->total_vm += npages; 3006 2995 3007 - if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC) 2996 + if (is_exec_mapping(flags)) 3008 2997 mm->exec_vm += npages; 3009 - else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) 2998 + else if (is_stack_mapping(flags)) 3010 2999 mm->stack_vm += npages; 3011 - else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) 3000 + else if (is_data_mapping(flags)) 3012 3001 mm->data_vm += npages; 3013 3002 } 3014 3003
+5
mm/page_alloc.c
··· 5210 5210 pgdat->numabalancing_migrate_nr_pages = 0; 5211 5211 pgdat->numabalancing_migrate_next_window = jiffies; 5212 5212 #endif 5213 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5214 + spin_lock_init(&pgdat->split_queue_lock); 5215 + INIT_LIST_HEAD(&pgdat->split_queue); 5216 + pgdat->split_queue_len = 0; 5217 + #endif 5213 5218 init_waitqueue_head(&pgdat->kswapd_wait); 5214 5219 init_waitqueue_head(&pgdat->pfmemalloc_wait); 5215 5220 pgdat_page_ext_init(pgdat);
+1 -26
mm/util.c
··· 230 230 } 231 231 232 232 /* Check if the vma is being used as a stack by this task */ 233 - static int vm_is_stack_for_task(struct task_struct *t, 234 - struct vm_area_struct *vma) 233 + int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) 235 234 { 236 235 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); 237 - } 238 - 239 - /* 240 - * Check if the vma is being used as a stack. 241 - * If is_group is non-zero, check in the entire thread group or else 242 - * just check in the current task. Returns the task_struct of the task 243 - * that the vma is stack for. Must be called under rcu_read_lock(). 244 - */ 245 - struct task_struct *task_of_stack(struct task_struct *task, 246 - struct vm_area_struct *vma, bool in_group) 247 - { 248 - if (vm_is_stack_for_task(task, vma)) 249 - return task; 250 - 251 - if (in_group) { 252 - struct task_struct *t; 253 - 254 - for_each_thread(task, t) { 255 - if (vm_is_stack_for_task(t, vma)) 256 - return t; 257 - } 258 - } 259 - 260 - return NULL; 261 236 } 262 237 263 238 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+1 -2
mm/vmpressure.c
··· 248 248 249 249 if (tree) { 250 250 spin_lock(&vmpr->sr_lock); 251 - vmpr->tree_scanned += scanned; 251 + scanned = vmpr->tree_scanned += scanned; 252 252 vmpr->tree_reclaimed += reclaimed; 253 - scanned = vmpr->scanned; 254 253 spin_unlock(&vmpr->sr_lock); 255 254 256 255 if (scanned < vmpressure_win)