Merge branch 'akpm' (fixes from Andrew Morton)

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (fixes from Andrew Morton)

Merge misc fixes from Andrew Morton.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (21 commits)
mm: revert mremap pud_free anti-fix
mm: fix BUG in __split_huge_page_pmd
swap: fix set_blocksize race during swapon/swapoff
procfs: call default get_unmapped_area on MMU-present architectures
procfs: fix unintended truncation of returned mapped address
writeback: fix negative bdi max pause
percpu_refcount: export symbols
fs: buffer: move allocation failure loop into the allocator
mm: memcg: handle non-error OOM situations more gracefully
tools/testing/selftests: fix uninitialized variable
block/partitions/efi.c: treat size mismatch as a warning, not an error
mm: hugetlb: initialize PG_reserved for tail pages of gigantic compound pages
mm/zswap: bugfix: memory leak when re-swapon
mm: /proc/pid/pagemap: inspect _PAGE_SOFT_DIRTY only on present pages
mm: migration: do not lose soft dirty bit if page is in migration state
gcov: MAINTAINERS: Add an entry for gcov
mm/hugetlb.c: correct missing private flag clearing
mm/vmscan.c: don't forget to free shrinker->nr_deferred
ipc/sem.c: synchronize semop and semctl with IPC_RMID
ipc: update locking scheme comments
...

Linus Torvalds 12 years ago 056cdce0 0056019d

+236 -206

24 changed files

expand all collapse all

MAINTAINERS

block

partitions

efi.c

buffer.c

proc

inode.c

task_mmu.c

include

linux

memcontrol.h

sched.h

ipc

sem.c

util.c

lib

percpu-refcount.c

filemap.c

huge_memory.c

hugetlb.c

memcontrol.c

memory.c

migrate.c

mprotect.c

mremap.c

oom_kill.c

page-writeback.c

swapfile.c

vmscan.c

zswap.c

tools

testing

selftests

timers

posix_timers.c

MAINTAINERS

reviewed

··· 3624 3624 S: Odd Fixes (e.g., new signatures) 3625 3625 F: drivers/scsi/fdomain.* 3626 3626 3627 3627 + GCOV BASED KERNEL PROFILING 3628 3628 + M: Peter Oberparleiter <oberpar@linux.vnet.ibm.com> 3629 3629 + S: Maintained 3630 3630 + F: kernel/gcov/ 3631 3631 + F: Documentation/gcov.txt 3632 3632 + 3627 3633 GDT SCSI DISK ARRAY CONTROLLER DRIVER 3628 3634 M: Achim Leubner <achim_leubner@adaptec.com> 3629 3635 L: linux-scsi@vger.kernel.org

+6 -1

block/partitions/efi.c

reviewed

··· 222 222 * the disk size. 223 223 * 224 224 * Hybrid MBRs do not necessarily comply with this. 225 225 + * 226 226 + * Consider a bad value here to be a warning to support dd'ing 227 227 + * an image from a smaller disk to a larger disk. 225 228 */ 226 229 if (ret == GPT_MBR_PROTECTIVE) { 227 230 sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); 228 231 if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) 229 229 - ret = 0; 232 232 + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", 233 233 + sz, min_t(uint32_t, 234 234 + total_sectors - 1, 0xFFFFFFFF)); 230 235 } 231 236 done: 232 237 return ret;

+12 -2

fs/buffer.c

reviewed

··· 1005 1005 struct buffer_head *bh; 1006 1006 sector_t end_block; 1007 1007 int ret = 0; /* Will call free_more_memory() */ 1008 1008 + gfp_t gfp_mask; 1008 1009 1009 1009 - page = find_or_create_page(inode->i_mapping, index, 1010 1010 - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 1010 1010 + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; 1011 1011 + gfp_mask |= __GFP_MOVABLE; 1012 1012 + /* 1013 1013 + * XXX: __getblk_slow() can not really deal with failure and 1014 1014 + * will endlessly loop on improvised global reclaim. Prefer 1015 1015 + * looping in the allocator rather than here, at least that 1016 1016 + * code knows what it's doing. 1017 1017 + */ 1018 1018 + gfp_mask |= __GFP_NOFAIL; 1019 1019 + 1020 1020 + page = find_or_create_page(inode->i_mapping, index, gfp_mask); 1011 1021 if (!page) 1012 1022 return ret; 1013 1023

+7 -3

fs/proc/inode.c

reviewed

··· 288 288 static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) 289 289 { 290 290 struct proc_dir_entry *pde = PDE(file_inode(file)); 291 291 - int rv = -EIO; 292 292 - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 291 291 + unsigned long rv = -EIO; 292 292 + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; 293 293 if (use_pde(pde)) { 294 294 - get_unmapped_area = pde->proc_fops->get_unmapped_area; 294 294 + #ifdef CONFIG_MMU 295 295 + get_unmapped_area = current->mm->get_unmapped_area; 296 296 + #endif 297 297 + if (pde->proc_fops->get_unmapped_area) 298 298 + get_unmapped_area = pde->proc_fops->get_unmapped_area; 295 299 if (get_unmapped_area) 296 300 rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); 297 301 unuse_pde(pde);

+3 -1

fs/proc/task_mmu.c

reviewed

··· 941 941 frame = pte_pfn(pte); 942 942 flags = PM_PRESENT; 943 943 page = vm_normal_page(vma, addr, pte); 944 944 + if (pte_soft_dirty(pte)) 945 945 + flags2 |= __PM_SOFT_DIRTY; 944 946 } else if (is_swap_pte(pte)) { 945 947 swp_entry_t entry; 946 948 if (pte_swp_soft_dirty(pte)) ··· 962 960 963 961 if (page && !PageAnon(page)) 964 962 flags |= PM_FILE; 965 965 - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) 963 963 + if ((vma->vm_flags & VM_SOFTDIRTY)) 966 964 flags2 |= __PM_SOFT_DIRTY; 967 965 968 966 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);

+11 -39

include/linux/memcontrol.h

reviewed

··· 137 137 extern void mem_cgroup_replace_page_cache(struct page *oldpage, 138 138 struct page *newpage); 139 139 140 140 - /** 141 141 - * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task 142 142 - * @new: true to enable, false to disable 143 143 - * 144 144 - * Toggle whether a failed memcg charge should invoke the OOM killer 145 145 - * or just return -ENOMEM. Returns the previous toggle state. 146 146 - * 147 147 - * NOTE: Any path that enables the OOM killer before charging must 148 148 - * call mem_cgroup_oom_synchronize() afterward to finalize the 149 149 - * OOM handling and clean up. 150 150 - */ 151 151 - static inline bool mem_cgroup_toggle_oom(bool new) 140 140 + static inline void mem_cgroup_oom_enable(void) 152 141 { 153 153 - bool old; 154 154 - 155 155 - old = current->memcg_oom.may_oom; 156 156 - current->memcg_oom.may_oom = new; 157 157 - 158 158 - return old; 142 142 + WARN_ON(current->memcg_oom.may_oom); 143 143 + current->memcg_oom.may_oom = 1; 159 144 } 160 145 161 161 - static inline void mem_cgroup_enable_oom(void) 146 146 + static inline void mem_cgroup_oom_disable(void) 162 147 { 163 163 - bool old = mem_cgroup_toggle_oom(true); 164 164 - 165 165 - WARN_ON(old == true); 166 166 - } 167 167 - 168 168 - static inline void mem_cgroup_disable_oom(void) 169 169 - { 170 170 - bool old = mem_cgroup_toggle_oom(false); 171 171 - 172 172 - WARN_ON(old == false); 148 148 + WARN_ON(!current->memcg_oom.may_oom); 149 149 + current->memcg_oom.may_oom = 0; 173 150 } 174 151 175 152 static inline bool task_in_memcg_oom(struct task_struct *p) 176 153 { 177 177 - return p->memcg_oom.in_memcg_oom; 154 154 + return p->memcg_oom.memcg; 178 155 } 179 156 180 180 - bool mem_cgroup_oom_synchronize(void); 157 157 + bool mem_cgroup_oom_synchronize(bool wait); 181 158 182 159 #ifdef CONFIG_MEMCG_SWAP 183 160 extern int do_swap_account; ··· 379 402 { 380 403 } 381 404 382 382 - static inline bool mem_cgroup_toggle_oom(bool new) 383 383 - { 384 384 - return false; 385 385 - } 386 386 - 387 387 - static inline void mem_cgroup_enable_oom(void) 405 405 + static inline void mem_cgroup_oom_enable(void) 388 406 { 389 407 } 390 408 391 391 - static inline void mem_cgroup_disable_oom(void) 409 409 + static inline void mem_cgroup_oom_disable(void) 392 410 { 393 411 } 394 412 ··· 392 420 return false; 393 421 } 394 422 395 395 - static inline bool mem_cgroup_oom_synchronize(void) 423 423 + static inline bool mem_cgroup_oom_synchronize(bool wait) 396 424 { 397 425 return false; 398 426 }

+3 -4

include/linux/sched.h

reviewed

··· 1394 1394 } memcg_batch; 1395 1395 unsigned int memcg_kmem_skip_account; 1396 1396 struct memcg_oom_info { 1397 1397 + struct mem_cgroup *memcg; 1398 1398 + gfp_t gfp_mask; 1399 1399 + int order; 1397 1400 unsigned int may_oom:1; 1398 1398 - unsigned int in_memcg_oom:1; 1399 1399 - unsigned int oom_locked:1; 1400 1400 - int wakeups; 1401 1401 - struct mem_cgroup *wait_on_memcg; 1402 1401 } memcg_oom; 1403 1402 #endif 1404 1403 #ifdef CONFIG_UPROBES

+29 -13

ipc/sem.c

reviewed

··· 1282 1282 1283 1283 sem_lock(sma, NULL, -1); 1284 1284 1285 1285 + if (sma->sem_perm.deleted) { 1286 1286 + sem_unlock(sma, -1); 1287 1287 + rcu_read_unlock(); 1288 1288 + return -EIDRM; 1289 1289 + } 1290 1290 + 1285 1291 curr = &sma->sem_base[semnum]; 1286 1292 1287 1293 ipc_assert_locked_object(&sma->sem_perm); ··· 1342 1336 int i; 1343 1337 1344 1338 sem_lock(sma, NULL, -1); 1339 1339 + if (sma->sem_perm.deleted) { 1340 1340 + err = -EIDRM; 1341 1341 + goto out_unlock; 1342 1342 + } 1345 1343 if(nsems > SEMMSL_FAST) { 1346 1344 if (!ipc_rcu_getref(sma)) { 1347 1347 - sem_unlock(sma, -1); 1348 1348 - rcu_read_unlock(); 1349 1345 err = -EIDRM; 1350 1350 - goto out_free; 1346 1346 + goto out_unlock; 1351 1347 } 1352 1348 sem_unlock(sma, -1); 1353 1349 rcu_read_unlock(); ··· 1362 1354 rcu_read_lock(); 1363 1355 sem_lock_and_putref(sma); 1364 1356 if (sma->sem_perm.deleted) { 1365 1365 - sem_unlock(sma, -1); 1366 1366 - rcu_read_unlock(); 1367 1357 err = -EIDRM; 1368 1368 - goto out_free; 1358 1358 + goto out_unlock; 1369 1359 } 1370 1360 } 1371 1361 for (i = 0; i < sma->sem_nsems; i++) ··· 1381 1375 struct sem_undo *un; 1382 1376 1383 1377 if (!ipc_rcu_getref(sma)) { 1384 1384 - rcu_read_unlock(); 1385 1385 - return -EIDRM; 1378 1378 + err = -EIDRM; 1379 1379 + goto out_rcu_wakeup; 1386 1380 } 1387 1381 rcu_read_unlock(); 1388 1382 ··· 1410 1404 rcu_read_lock(); 1411 1405 sem_lock_and_putref(sma); 1412 1406 if (sma->sem_perm.deleted) { 1413 1413 - sem_unlock(sma, -1); 1414 1414 - rcu_read_unlock(); 1415 1407 err = -EIDRM; 1416 1416 - goto out_free; 1408 1408 + goto out_unlock; 1417 1409 } 1418 1410 1419 1411 for (i = 0; i < nsems; i++) ··· 1435 1431 goto out_rcu_wakeup; 1436 1432 1437 1433 sem_lock(sma, NULL, -1); 1434 1434 + if (sma->sem_perm.deleted) { 1435 1435 + err = -EIDRM; 1436 1436 + goto out_unlock; 1437 1437 + } 1438 1438 curr = &sma->sem_base[semnum]; 1439 1439 1440 1440 switch (cmd) { ··· 1844 1836 if (error) 1845 1837 goto out_rcu_wakeup; 1846 1838 1839 1839 + error = -EIDRM; 1840 1840 + locknum = sem_lock(sma, sops, nsops); 1841 1841 + if (sma->sem_perm.deleted) 1842 1842 + goto out_unlock_free; 1847 1843 /* 1848 1844 * semid identifiers are not unique - find_alloc_undo may have 1849 1845 * allocated an undo structure, it was invalidated by an RMID ··· 1855 1843 * This case can be detected checking un->semid. The existence of 1856 1844 * "un" itself is guaranteed by rcu. 1857 1845 */ 1858 1858 - error = -EIDRM; 1859 1859 - locknum = sem_lock(sma, sops, nsops); 1860 1846 if (un && un->semid == -1) 1861 1847 goto out_unlock_free; 1862 1848 ··· 2067 2057 } 2068 2058 2069 2059 sem_lock(sma, NULL, -1); 2060 2060 + /* exit_sem raced with IPC_RMID, nothing to do */ 2061 2061 + if (sma->sem_perm.deleted) { 2062 2062 + sem_unlock(sma, -1); 2063 2063 + rcu_read_unlock(); 2064 2064 + continue; 2065 2065 + } 2070 2066 un = __lookup_undo(ulp, semid); 2071 2067 if (un == NULL) { 2072 2068 /* exit_sem raced with IPC_RMID+semget() that created

+21 -6

ipc/util.c

reviewed

··· 17 17 * Pavel Emelianov <xemul@openvz.org> 18 18 * 19 19 * General sysv ipc locking scheme: 20 20 - * when doing ipc id lookups, take the ids->rwsem 21 21 - * rcu_read_lock() 22 22 - * obtain the ipc object (kern_ipc_perm) 23 23 - * perform security, capabilities, auditing and permission checks, etc. 24 24 - * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() 25 25 - * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) 20 20 + * rcu_read_lock() 21 21 + * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr 22 22 + * tree. 23 23 + * - perform initial checks (capabilities, auditing and permission, 24 24 + * etc). 25 25 + * - perform read-only operations, such as STAT, INFO commands. 26 26 + * acquire the ipc lock (kern_ipc_perm.lock) through 27 27 + * ipc_lock_object() 28 28 + * - perform data updates, such as SET, RMID commands and 29 29 + * mechanism-specific operations (semop/semtimedop, 30 30 + * msgsnd/msgrcv, shmat/shmdt). 31 31 + * drop the ipc lock, through ipc_unlock_object(). 32 32 + * rcu_read_unlock() 33 33 + * 34 34 + * The ids->rwsem must be taken when: 35 35 + * - creating, removing and iterating the existing entries in ipc 36 36 + * identifier sets. 37 37 + * - iterating through files under /proc/sysvipc/ 38 38 + * 39 39 + * Note that sems have a special fast path that avoids kern_ipc_perm.lock - 40 40 + * see sem_lock(). 26 41 */ 27 42 28 43 #include <linux/mm.h>

lib/percpu-refcount.c

reviewed

··· 53 53 ref->release = release; 54 54 return 0; 55 55 } 56 56 + EXPORT_SYMBOL_GPL(percpu_ref_init); 56 57 57 58 /** 58 59 * percpu_ref_cancel_init - cancel percpu_ref_init() ··· 85 84 free_percpu(ref->pcpu_count); 86 85 } 87 86 } 87 87 + EXPORT_SYMBOL_GPL(percpu_ref_cancel_init); 88 88 89 89 static void percpu_ref_kill_rcu(struct rcu_head *rcu) 90 90 { ··· 158 156 159 157 call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); 160 158 } 159 159 + EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);

+1 -10

mm/filemap.c

reviewed

··· 1616 1616 struct inode *inode = mapping->host; 1617 1617 pgoff_t offset = vmf->pgoff; 1618 1618 struct page *page; 1619 1619 - bool memcg_oom; 1620 1619 pgoff_t size; 1621 1620 int ret = 0; 1622 1621 ··· 1624 1625 return VM_FAULT_SIGBUS; 1625 1626 1626 1627 /* 1627 1627 - * Do we have something in the page cache already? Either 1628 1628 - * way, try readahead, but disable the memcg OOM killer for it 1629 1629 - * as readahead is optional and no errors are propagated up 1630 1630 - * the fault stack. The OOM killer is enabled while trying to 1631 1631 - * instantiate the faulting page individually below. 1628 1628 + * Do we have something in the page cache already? 1632 1629 */ 1633 1630 page = find_get_page(mapping, offset); 1634 1631 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { ··· 1632 1637 * We found the page, so try async readahead before 1633 1638 * waiting for the lock. 1634 1639 */ 1635 1635 - memcg_oom = mem_cgroup_toggle_oom(false); 1636 1640 do_async_mmap_readahead(vma, ra, file, page, offset); 1637 1637 - mem_cgroup_toggle_oom(memcg_oom); 1638 1641 } else if (!page) { 1639 1642 /* No page in the page cache at all */ 1640 1640 - memcg_oom = mem_cgroup_toggle_oom(false); 1641 1643 do_sync_mmap_readahead(vma, ra, file, offset); 1642 1642 - mem_cgroup_toggle_oom(memcg_oom); 1643 1644 count_vm_event(PGMAJFAULT); 1644 1645 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1645 1646 ret = VM_FAULT_MAJOR;

+9 -1

mm/huge_memory.c

reviewed

··· 2697 2697 2698 2698 mmun_start = haddr; 2699 2699 mmun_end = haddr + HPAGE_PMD_SIZE; 2700 2700 + again: 2700 2701 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2701 2702 spin_lock(&mm->page_table_lock); 2702 2703 if (unlikely(!pmd_trans_huge(*pmd))) { ··· 2720 2719 split_huge_page(page); 2721 2720 2722 2721 put_page(page); 2723 2723 - BUG_ON(pmd_trans_huge(*pmd)); 2722 2722 + 2723 2723 + /* 2724 2724 + * We don't always have down_write of mmap_sem here: a racing 2725 2725 + * do_huge_pmd_wp_page() might have copied-on-write to another 2726 2726 + * huge page before our split_huge_page() got the anon_vma lock. 2727 2727 + */ 2728 2728 + if (unlikely(pmd_trans_huge(*pmd))) 2729 2729 + goto again; 2724 2730 } 2725 2731 2726 2732 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,

+16 -1

mm/hugetlb.c

reviewed

··· 653 653 BUG_ON(page_count(page)); 654 654 BUG_ON(page_mapcount(page)); 655 655 restore_reserve = PagePrivate(page); 656 656 + ClearPagePrivate(page); 656 657 657 658 spin_lock(&hugetlb_lock); 658 659 hugetlb_cgroup_uncharge_page(hstate_index(h), ··· 696 695 /* we rely on prep_new_huge_page to set the destructor */ 697 696 set_compound_order(page, order); 698 697 __SetPageHead(page); 698 698 + __ClearPageReserved(page); 699 699 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 700 700 __SetPageTail(p); 701 701 + /* 702 702 + * For gigantic hugepages allocated through bootmem at 703 703 + * boot, it's safer to be consistent with the not-gigantic 704 704 + * hugepages and clear the PG_reserved bit from all tail pages 705 705 + * too. Otherwse drivers using get_user_pages() to access tail 706 706 + * pages may get the reference counting wrong if they see 707 707 + * PG_reserved set on a tail page (despite the head page not 708 708 + * having PG_reserved set). Enforcing this consistency between 709 709 + * head and tail pages allows drivers to optimize away a check 710 710 + * on the head page when they need know if put_page() is needed 711 711 + * after get_user_pages(). 712 712 + */ 713 713 + __ClearPageReserved(p); 701 714 set_page_count(p, 0); 702 715 p->first_page = page; 703 716 } ··· 1344 1329 #else 1345 1330 page = virt_to_page(m); 1346 1331 #endif 1347 1347 - __ClearPageReserved(page); 1348 1332 WARN_ON(page_count(page) != 1); 1349 1333 prep_compound_huge_page(page, h->order); 1334 1334 + WARN_ON(PageReserved(page)); 1350 1335 prep_new_huge_page(h, page, page_to_nid(page)); 1351 1336 /* 1352 1337 * If we had gigantic hugepages allocated at boot time, we need

+72 -105

mm/memcontrol.c

reviewed

··· 866 866 unsigned long val = 0; 867 867 int cpu; 868 868 869 869 + get_online_cpus(); 869 870 for_each_online_cpu(cpu) 870 871 val += per_cpu(memcg->stat->events[idx], cpu); 871 872 #ifdef CONFIG_HOTPLUG_CPU ··· 874 873 val += memcg->nocpu_base.events[idx]; 875 874 spin_unlock(&memcg->pcp_counter_lock); 876 875 #endif 876 876 + put_online_cpus(); 877 877 return val; 878 878 } 879 879 ··· 2161 2159 memcg_wakeup_oom(memcg); 2162 2160 } 2163 2161 2164 2164 - /* 2165 2165 - * try to call OOM killer 2166 2166 - */ 2167 2162 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2168 2163 { 2169 2169 - bool locked; 2170 2170 - int wakeups; 2171 2171 - 2172 2164 if (!current->memcg_oom.may_oom) 2173 2165 return; 2174 2174 - 2175 2175 - current->memcg_oom.in_memcg_oom = 1; 2176 2176 - 2177 2166 /* 2178 2178 - * As with any blocking lock, a contender needs to start 2179 2179 - * listening for wakeups before attempting the trylock, 2180 2180 - * otherwise it can miss the wakeup from the unlock and sleep 2181 2181 - * indefinitely. This is just open-coded because our locking 2182 2182 - * is so particular to memcg hierarchies. 2167 2167 + * We are in the middle of the charge context here, so we 2168 2168 + * don't want to block when potentially sitting on a callstack 2169 2169 + * that holds all kinds of filesystem and mm locks. 2170 2170 + * 2171 2171 + * Also, the caller may handle a failed allocation gracefully 2172 2172 + * (like optional page cache readahead) and so an OOM killer 2173 2173 + * invocation might not even be necessary. 2174 2174 + * 2175 2175 + * That's why we don't do anything here except remember the 2176 2176 + * OOM context and then deal with it at the end of the page 2177 2177 + * fault when the stack is unwound, the locks are released, 2178 2178 + * and when we know whether the fault was overall successful. 2183 2179 */ 2184 2184 - wakeups = atomic_read(&memcg->oom_wakeups); 2180 2180 + css_get(&memcg->css); 2181 2181 + current->memcg_oom.memcg = memcg; 2182 2182 + current->memcg_oom.gfp_mask = mask; 2183 2183 + current->memcg_oom.order = order; 2184 2184 + } 2185 2185 + 2186 2186 + /** 2187 2187 + * mem_cgroup_oom_synchronize - complete memcg OOM handling 2188 2188 + * @handle: actually kill/wait or just clean up the OOM state 2189 2189 + * 2190 2190 + * This has to be called at the end of a page fault if the memcg OOM 2191 2191 + * handler was enabled. 2192 2192 + * 2193 2193 + * Memcg supports userspace OOM handling where failed allocations must 2194 2194 + * sleep on a waitqueue until the userspace task resolves the 2195 2195 + * situation. Sleeping directly in the charge context with all kinds 2196 2196 + * of locks held is not a good idea, instead we remember an OOM state 2197 2197 + * in the task and mem_cgroup_oom_synchronize() has to be called at 2198 2198 + * the end of the page fault to complete the OOM handling. 2199 2199 + * 2200 2200 + * Returns %true if an ongoing memcg OOM situation was detected and 2201 2201 + * completed, %false otherwise. 2202 2202 + */ 2203 2203 + bool mem_cgroup_oom_synchronize(bool handle) 2204 2204 + { 2205 2205 + struct mem_cgroup *memcg = current->memcg_oom.memcg; 2206 2206 + struct oom_wait_info owait; 2207 2207 + bool locked; 2208 2208 + 2209 2209 + /* OOM is global, do not handle */ 2210 2210 + if (!memcg) 2211 2211 + return false; 2212 2212 + 2213 2213 + if (!handle) 2214 2214 + goto cleanup; 2215 2215 + 2216 2216 + owait.memcg = memcg; 2217 2217 + owait.wait.flags = 0; 2218 2218 + owait.wait.func = memcg_oom_wake_function; 2219 2219 + owait.wait.private = current; 2220 2220 + INIT_LIST_HEAD(&owait.wait.task_list); 2221 2221 + 2222 2222 + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2185 2223 mem_cgroup_mark_under_oom(memcg); 2186 2224 2187 2225 locked = mem_cgroup_oom_trylock(memcg); ··· 2231 2189 2232 2190 if (locked && !memcg->oom_kill_disable) { 2233 2191 mem_cgroup_unmark_under_oom(memcg); 2234 2234 - mem_cgroup_out_of_memory(memcg, mask, order); 2235 2235 - mem_cgroup_oom_unlock(memcg); 2236 2236 - /* 2237 2237 - * There is no guarantee that an OOM-lock contender 2238 2238 - * sees the wakeups triggered by the OOM kill 2239 2239 - * uncharges. Wake any sleepers explicitely. 2240 2240 - */ 2241 2241 - memcg_oom_recover(memcg); 2192 2192 + finish_wait(&memcg_oom_waitq, &owait.wait); 2193 2193 + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2194 2194 + current->memcg_oom.order); 2242 2195 } else { 2243 2243 - /* 2244 2244 - * A system call can just return -ENOMEM, but if this 2245 2245 - * is a page fault and somebody else is handling the 2246 2246 - * OOM already, we need to sleep on the OOM waitqueue 2247 2247 - * for this memcg until the situation is resolved. 2248 2248 - * Which can take some time because it might be 2249 2249 - * handled by a userspace task. 2250 2250 - * 2251 2251 - * However, this is the charge context, which means 2252 2252 - * that we may sit on a large call stack and hold 2253 2253 - * various filesystem locks, the mmap_sem etc. and we 2254 2254 - * don't want the OOM handler to deadlock on them 2255 2255 - * while we sit here and wait. Store the current OOM 2256 2256 - * context in the task_struct, then return -ENOMEM. 2257 2257 - * At the end of the page fault handler, with the 2258 2258 - * stack unwound, pagefault_out_of_memory() will check 2259 2259 - * back with us by calling 2260 2260 - * mem_cgroup_oom_synchronize(), possibly putting the 2261 2261 - * task to sleep. 2262 2262 - */ 2263 2263 - current->memcg_oom.oom_locked = locked; 2264 2264 - current->memcg_oom.wakeups = wakeups; 2265 2265 - css_get(&memcg->css); 2266 2266 - current->memcg_oom.wait_on_memcg = memcg; 2267 2267 - } 2268 2268 - } 2269 2269 - 2270 2270 - /** 2271 2271 - * mem_cgroup_oom_synchronize - complete memcg OOM handling 2272 2272 - * 2273 2273 - * This has to be called at the end of a page fault if the the memcg 2274 2274 - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. 2275 2275 - * 2276 2276 - * Memcg supports userspace OOM handling, so failed allocations must 2277 2277 - * sleep on a waitqueue until the userspace task resolves the 2278 2278 - * situation. Sleeping directly in the charge context with all kinds 2279 2279 - * of locks held is not a good idea, instead we remember an OOM state 2280 2280 - * in the task and mem_cgroup_oom_synchronize() has to be called at 2281 2281 - * the end of the page fault to put the task to sleep and clean up the 2282 2282 - * OOM state. 2283 2283 - * 2284 2284 - * Returns %true if an ongoing memcg OOM situation was detected and 2285 2285 - * finalized, %false otherwise. 2286 2286 - */ 2287 2287 - bool mem_cgroup_oom_synchronize(void) 2288 2288 - { 2289 2289 - struct oom_wait_info owait; 2290 2290 - struct mem_cgroup *memcg; 2291 2291 - 2292 2292 - /* OOM is global, do not handle */ 2293 2293 - if (!current->memcg_oom.in_memcg_oom) 2294 2294 - return false; 2295 2295 - 2296 2296 - /* 2297 2297 - * We invoked the OOM killer but there is a chance that a kill 2298 2298 - * did not free up any charges. Everybody else might already 2299 2299 - * be sleeping, so restart the fault and keep the rampage 2300 2300 - * going until some charges are released. 2301 2301 - */ 2302 2302 - memcg = current->memcg_oom.wait_on_memcg; 2303 2303 - if (!memcg) 2304 2304 - goto out; 2305 2305 - 2306 2306 - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2307 2307 - goto out_memcg; 2308 2308 - 2309 2309 - owait.memcg = memcg; 2310 2310 - owait.wait.flags = 0; 2311 2311 - owait.wait.func = memcg_oom_wake_function; 2312 2312 - owait.wait.private = current; 2313 2313 - INIT_LIST_HEAD(&owait.wait.task_list); 2314 2314 - 2315 2315 - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2316 2316 - /* Only sleep if we didn't miss any wakeups since OOM */ 2317 2317 - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) 2318 2196 schedule(); 2319 2319 - finish_wait(&memcg_oom_waitq, &owait.wait); 2320 2320 - out_memcg: 2321 2321 - mem_cgroup_unmark_under_oom(memcg); 2322 2322 - if (current->memcg_oom.oom_locked) { 2197 2197 + mem_cgroup_unmark_under_oom(memcg); 2198 2198 + finish_wait(&memcg_oom_waitq, &owait.wait); 2199 2199 + } 2200 2200 + 2201 2201 + if (locked) { 2323 2202 mem_cgroup_oom_unlock(memcg); 2324 2203 /* 2325 2204 * There is no guarantee that an OOM-lock contender ··· 2249 2286 */ 2250 2287 memcg_oom_recover(memcg); 2251 2288 } 2289 2289 + cleanup: 2290 2290 + current->memcg_oom.memcg = NULL; 2252 2291 css_put(&memcg->css); 2253 2253 - current->memcg_oom.wait_on_memcg = NULL; 2254 2254 - out: 2255 2255 - current->memcg_oom.in_memcg_oom = 0; 2256 2292 return true; 2257 2293 } 2258 2294 ··· 2665 2703 || fatal_signal_pending(current))) 2666 2704 goto bypass; 2667 2705 2706 2706 + if (unlikely(task_in_memcg_oom(current))) 2707 2707 + goto bypass; 2708 2708 + 2668 2709 /* 2669 2710 * We always charge the cgroup the mm_struct belongs to. 2670 2711 * The mm_struct's mem_cgroup changes on task migration if the ··· 2766 2801 return 0; 2767 2802 nomem: 2768 2803 *ptr = NULL; 2804 2804 + if (gfp_mask & __GFP_NOFAIL) 2805 2805 + return 0; 2769 2806 return -ENOMEM; 2770 2807 bypass: 2771 2808 *ptr = root_mem_cgroup;

+14 -6

mm/memory.c

reviewed

··· 837 837 */ 838 838 make_migration_entry_read(&entry); 839 839 pte = swp_entry_to_pte(entry); 840 840 + if (pte_swp_soft_dirty(*src_pte)) 841 841 + pte = pte_swp_mksoft_dirty(pte); 840 842 set_pte_at(src_mm, addr, src_pte, pte); 841 843 } 842 844 } ··· 3865 3863 * space. Kernel faults are handled more gracefully. 3866 3864 */ 3867 3865 if (flags & FAULT_FLAG_USER) 3868 3868 - mem_cgroup_enable_oom(); 3866 3866 + mem_cgroup_oom_enable(); 3869 3867 3870 3868 ret = __handle_mm_fault(mm, vma, address, flags); 3871 3869 3872 3872 - if (flags & FAULT_FLAG_USER) 3873 3873 - mem_cgroup_disable_oom(); 3874 3874 - 3875 3875 - if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) 3876 3876 - mem_cgroup_oom_synchronize(); 3870 3870 + if (flags & FAULT_FLAG_USER) { 3871 3871 + mem_cgroup_oom_disable(); 3872 3872 + /* 3873 3873 + * The task may have entered a memcg OOM situation but 3874 3874 + * if the allocation error was handled gracefully (no 3875 3875 + * VM_FAULT_OOM), there is no need to kill anything. 3876 3876 + * Just clean up the OOM state peacefully. 3877 3877 + */ 3878 3878 + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) 3879 3879 + mem_cgroup_oom_synchronize(false); 3880 3880 + } 3877 3881 3878 3882 return ret; 3879 3883 }

mm/migrate.c

reviewed

··· 161 161 162 162 get_page(new); 163 163 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 164 164 + if (pte_swp_soft_dirty(*ptep)) 165 165 + pte = pte_mksoft_dirty(pte); 164 166 if (is_write_migration_entry(entry)) 165 167 pte = pte_mkwrite(pte); 166 168 #ifdef CONFIG_HUGETLB_PAGE

+5 -2

mm/mprotect.c

reviewed

··· 94 94 swp_entry_t entry = pte_to_swp_entry(oldpte); 95 95 96 96 if (is_write_migration_entry(entry)) { 97 97 + pte_t newpte; 97 98 /* 98 99 * A protection check is difficult so 99 100 * just be safe and disable write 100 101 */ 101 102 make_migration_entry_read(&entry); 102 102 - set_pte_at(mm, addr, pte, 103 103 - swp_entry_to_pte(entry)); 103 103 + newpte = swp_entry_to_pte(entry); 104 104 + if (pte_swp_soft_dirty(oldpte)) 105 105 + newpte = pte_swp_mksoft_dirty(newpte); 106 106 + set_pte_at(mm, addr, pte, newpte); 104 107 } 105 108 pages++; 106 109 }

+1 -4

mm/mremap.c

reviewed

··· 25 25 #include <asm/uaccess.h> 26 26 #include <asm/cacheflush.h> 27 27 #include <asm/tlbflush.h> 28 28 - #include <asm/pgalloc.h> 29 28 30 29 #include "internal.h" 31 30 ··· 62 63 return NULL; 63 64 64 65 pmd = pmd_alloc(mm, pud, addr); 65 65 - if (!pmd) { 66 66 - pud_free(mm, pud); 66 66 + if (!pmd) 67 67 return NULL; 68 68 - } 69 68 70 69 VM_BUG_ON(pmd_trans_huge(*pmd)); 71 70

+1 -1

mm/oom_kill.c

reviewed

··· 680 680 { 681 681 struct zonelist *zonelist; 682 682 683 683 - if (mem_cgroup_oom_synchronize()) 683 683 + if (mem_cgroup_oom_synchronize(true)) 684 684 return; 685 685 686 686 zonelist = node_zonelist(first_online_node, GFP_KERNEL);

+5 -5

mm/page-writeback.c

reviewed

··· 1210 1210 return 1; 1211 1211 } 1212 1212 1213 1213 - static long bdi_max_pause(struct backing_dev_info *bdi, 1214 1214 - unsigned long bdi_dirty) 1213 1213 + static unsigned long bdi_max_pause(struct backing_dev_info *bdi, 1214 1214 + unsigned long bdi_dirty) 1215 1215 { 1216 1216 - long bw = bdi->avg_write_bandwidth; 1217 1217 - long t; 1216 1216 + unsigned long bw = bdi->avg_write_bandwidth; 1217 1217 + unsigned long t; 1218 1218 1219 1219 /* 1220 1220 * Limit pause time for small memory systems. If sleeping for too long ··· 1226 1226 t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); 1227 1227 t++; 1228 1228 1229 1229 - return min_t(long, t, MAX_PAUSE); 1229 1229 + return min_t(unsigned long, t, MAX_PAUSE); 1230 1230 } 1231 1231 1232 1232 static long bdi_min_pause(struct backing_dev_info *bdi,

+3 -1

mm/swapfile.c

reviewed

··· 1824 1824 struct filename *pathname; 1825 1825 int i, type, prev; 1826 1826 int err; 1827 1827 + unsigned int old_block_size; 1827 1828 1828 1829 if (!capable(CAP_SYS_ADMIN)) 1829 1830 return -EPERM; ··· 1915 1914 } 1916 1915 1917 1916 swap_file = p->swap_file; 1917 1917 + old_block_size = p->old_block_size; 1918 1918 p->swap_file = NULL; 1919 1919 p->max = 0; 1920 1920 swap_map = p->swap_map; ··· 1940 1938 inode = mapping->host; 1941 1939 if (S_ISBLK(inode->i_mode)) { 1942 1940 struct block_device *bdev = I_BDEV(inode); 1943 1943 - set_blocksize(bdev, p->old_block_size); 1941 1941 + set_blocksize(bdev, old_block_size); 1944 1942 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 1945 1943 } else { 1946 1944 mutex_lock(&inode->i_mutex);

mm/vmscan.c

reviewed

··· 211 211 down_write(&shrinker_rwsem); 212 212 list_del(&shrinker->list); 213 213 up_write(&shrinker_rwsem); 214 214 + kfree(shrinker->nr_deferred); 214 215 } 215 216 EXPORT_SYMBOL(unregister_shrinker); 216 217

mm/zswap.c

reviewed

··· 804 804 } 805 805 tree->rbroot = RB_ROOT; 806 806 spin_unlock(&tree->lock); 807 807 + 808 808 + zbud_destroy_pool(tree->pool); 809 809 + kfree(tree); 810 810 + zswap_trees[type] = NULL; 807 811 } 808 812 809 813 static struct zbud_ops zswap_zbud_ops = {

+1 -1

tools/testing/selftests/timers/posix_timers.c

reviewed

··· 151 151 fflush(stdout); 152 152 153 153 done = 0; 154 154 - timer_create(which, NULL, &id); 154 154 + err = timer_create(which, NULL, &id); 155 155 if (err < 0) { 156 156 perror("Can't create timer\n"); 157 157 return -1;