Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (fixes from Andrew Morton)

Merge misc fixes from Andrew Morton.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (21 commits)
mm: revert mremap pud_free anti-fix
mm: fix BUG in __split_huge_page_pmd
swap: fix set_blocksize race during swapon/swapoff
procfs: call default get_unmapped_area on MMU-present architectures
procfs: fix unintended truncation of returned mapped address
writeback: fix negative bdi max pause
percpu_refcount: export symbols
fs: buffer: move allocation failure loop into the allocator
mm: memcg: handle non-error OOM situations more gracefully
tools/testing/selftests: fix uninitialized variable
block/partitions/efi.c: treat size mismatch as a warning, not an error
mm: hugetlb: initialize PG_reserved for tail pages of gigantic compound pages
mm/zswap: bugfix: memory leak when re-swapon
mm: /proc/pid/pagemap: inspect _PAGE_SOFT_DIRTY only on present pages
mm: migration: do not lose soft dirty bit if page is in migration state
gcov: MAINTAINERS: Add an entry for gcov
mm/hugetlb.c: correct missing private flag clearing
mm/vmscan.c: don't forget to free shrinker->nr_deferred
ipc/sem.c: synchronize semop and semctl with IPC_RMID
ipc: update locking scheme comments
...

+236 -206
+6
MAINTAINERS
··· 3624 3624 S: Odd Fixes (e.g., new signatures) 3625 3625 F: drivers/scsi/fdomain.* 3626 3626 3627 + GCOV BASED KERNEL PROFILING 3628 + M: Peter Oberparleiter <oberpar@linux.vnet.ibm.com> 3629 + S: Maintained 3630 + F: kernel/gcov/ 3631 + F: Documentation/gcov.txt 3632 + 3627 3633 GDT SCSI DISK ARRAY CONTROLLER DRIVER 3628 3634 M: Achim Leubner <achim_leubner@adaptec.com> 3629 3635 L: linux-scsi@vger.kernel.org
+6 -1
block/partitions/efi.c
··· 222 222 * the disk size. 223 223 * 224 224 * Hybrid MBRs do not necessarily comply with this. 225 + * 226 + * Consider a bad value here to be a warning to support dd'ing 227 + * an image from a smaller disk to a larger disk. 225 228 */ 226 229 if (ret == GPT_MBR_PROTECTIVE) { 227 230 sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); 228 231 if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) 229 - ret = 0; 232 + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", 233 + sz, min_t(uint32_t, 234 + total_sectors - 1, 0xFFFFFFFF)); 230 235 } 231 236 done: 232 237 return ret;
+12 -2
fs/buffer.c
··· 1005 1005 struct buffer_head *bh; 1006 1006 sector_t end_block; 1007 1007 int ret = 0; /* Will call free_more_memory() */ 1008 + gfp_t gfp_mask; 1008 1009 1009 - page = find_or_create_page(inode->i_mapping, index, 1010 - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 1010 + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; 1011 + gfp_mask |= __GFP_MOVABLE; 1012 + /* 1013 + * XXX: __getblk_slow() can not really deal with failure and 1014 + * will endlessly loop on improvised global reclaim. Prefer 1015 + * looping in the allocator rather than here, at least that 1016 + * code knows what it's doing. 1017 + */ 1018 + gfp_mask |= __GFP_NOFAIL; 1019 + 1020 + page = find_or_create_page(inode->i_mapping, index, gfp_mask); 1011 1021 if (!page) 1012 1022 return ret; 1013 1023
+7 -3
fs/proc/inode.c
··· 288 288 static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) 289 289 { 290 290 struct proc_dir_entry *pde = PDE(file_inode(file)); 291 - int rv = -EIO; 292 - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 291 + unsigned long rv = -EIO; 292 + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; 293 293 if (use_pde(pde)) { 294 - get_unmapped_area = pde->proc_fops->get_unmapped_area; 294 + #ifdef CONFIG_MMU 295 + get_unmapped_area = current->mm->get_unmapped_area; 296 + #endif 297 + if (pde->proc_fops->get_unmapped_area) 298 + get_unmapped_area = pde->proc_fops->get_unmapped_area; 295 299 if (get_unmapped_area) 296 300 rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); 297 301 unuse_pde(pde);
+3 -1
fs/proc/task_mmu.c
··· 941 941 frame = pte_pfn(pte); 942 942 flags = PM_PRESENT; 943 943 page = vm_normal_page(vma, addr, pte); 944 + if (pte_soft_dirty(pte)) 945 + flags2 |= __PM_SOFT_DIRTY; 944 946 } else if (is_swap_pte(pte)) { 945 947 swp_entry_t entry; 946 948 if (pte_swp_soft_dirty(pte)) ··· 962 960 963 961 if (page && !PageAnon(page)) 964 962 flags |= PM_FILE; 965 - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) 963 + if ((vma->vm_flags & VM_SOFTDIRTY)) 966 964 flags2 |= __PM_SOFT_DIRTY; 967 965 968 966 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
+11 -39
include/linux/memcontrol.h
··· 137 137 extern void mem_cgroup_replace_page_cache(struct page *oldpage, 138 138 struct page *newpage); 139 139 140 - /** 141 - * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task 142 - * @new: true to enable, false to disable 143 - * 144 - * Toggle whether a failed memcg charge should invoke the OOM killer 145 - * or just return -ENOMEM. Returns the previous toggle state. 146 - * 147 - * NOTE: Any path that enables the OOM killer before charging must 148 - * call mem_cgroup_oom_synchronize() afterward to finalize the 149 - * OOM handling and clean up. 150 - */ 151 - static inline bool mem_cgroup_toggle_oom(bool new) 140 + static inline void mem_cgroup_oom_enable(void) 152 141 { 153 - bool old; 154 - 155 - old = current->memcg_oom.may_oom; 156 - current->memcg_oom.may_oom = new; 157 - 158 - return old; 142 + WARN_ON(current->memcg_oom.may_oom); 143 + current->memcg_oom.may_oom = 1; 159 144 } 160 145 161 - static inline void mem_cgroup_enable_oom(void) 146 + static inline void mem_cgroup_oom_disable(void) 162 147 { 163 - bool old = mem_cgroup_toggle_oom(true); 164 - 165 - WARN_ON(old == true); 166 - } 167 - 168 - static inline void mem_cgroup_disable_oom(void) 169 - { 170 - bool old = mem_cgroup_toggle_oom(false); 171 - 172 - WARN_ON(old == false); 148 + WARN_ON(!current->memcg_oom.may_oom); 149 + current->memcg_oom.may_oom = 0; 173 150 } 174 151 175 152 static inline bool task_in_memcg_oom(struct task_struct *p) 176 153 { 177 - return p->memcg_oom.in_memcg_oom; 154 + return p->memcg_oom.memcg; 178 155 } 179 156 180 - bool mem_cgroup_oom_synchronize(void); 157 + bool mem_cgroup_oom_synchronize(bool wait); 181 158 182 159 #ifdef CONFIG_MEMCG_SWAP 183 160 extern int do_swap_account; ··· 379 402 { 380 403 } 381 404 382 - static inline bool mem_cgroup_toggle_oom(bool new) 383 - { 384 - return false; 385 - } 386 - 387 - static inline void mem_cgroup_enable_oom(void) 405 + static inline void mem_cgroup_oom_enable(void) 388 406 { 389 407 } 390 408 
391 - static inline void mem_cgroup_disable_oom(void) 409 + static inline void mem_cgroup_oom_disable(void) 392 410 { 393 411 } 394 412 ··· 392 420 return false; 393 421 } 394 422 395 - static inline bool mem_cgroup_oom_synchronize(void) 423 + static inline bool mem_cgroup_oom_synchronize(bool wait) 396 424 { 397 425 return false; 398 426 }
+3 -4
include/linux/sched.h
··· 1394 1394 } memcg_batch; 1395 1395 unsigned int memcg_kmem_skip_account; 1396 1396 struct memcg_oom_info { 1397 + struct mem_cgroup *memcg; 1398 + gfp_t gfp_mask; 1399 + int order; 1397 1400 unsigned int may_oom:1; 1398 - unsigned int in_memcg_oom:1; 1399 - unsigned int oom_locked:1; 1400 - int wakeups; 1401 - struct mem_cgroup *wait_on_memcg; 1402 1401 } memcg_oom; 1403 1402 #endif 1404 1403 #ifdef CONFIG_UPROBES
+29 -13
ipc/sem.c
··· 1282 1282 1283 1283 sem_lock(sma, NULL, -1); 1284 1284 1285 + if (sma->sem_perm.deleted) { 1286 + sem_unlock(sma, -1); 1287 + rcu_read_unlock(); 1288 + return -EIDRM; 1289 + } 1290 + 1285 1291 curr = &sma->sem_base[semnum]; 1286 1292 1287 1293 ipc_assert_locked_object(&sma->sem_perm); ··· 1342 1336 int i; 1343 1337 1344 1338 sem_lock(sma, NULL, -1); 1339 + if (sma->sem_perm.deleted) { 1340 + err = -EIDRM; 1341 + goto out_unlock; 1342 + } 1345 1343 if(nsems > SEMMSL_FAST) { 1346 1344 if (!ipc_rcu_getref(sma)) { 1347 - sem_unlock(sma, -1); 1348 - rcu_read_unlock(); 1349 1345 err = -EIDRM; 1350 - goto out_free; 1346 + goto out_unlock; 1351 1347 } 1352 1348 sem_unlock(sma, -1); 1353 1349 rcu_read_unlock(); ··· 1362 1354 rcu_read_lock(); 1363 1355 sem_lock_and_putref(sma); 1364 1356 if (sma->sem_perm.deleted) { 1365 - sem_unlock(sma, -1); 1366 - rcu_read_unlock(); 1367 1357 err = -EIDRM; 1368 - goto out_free; 1358 + goto out_unlock; 1369 1359 } 1370 1360 } 1371 1361 for (i = 0; i < sma->sem_nsems; i++) ··· 1381 1375 struct sem_undo *un; 1382 1376 1383 1377 if (!ipc_rcu_getref(sma)) { 1384 - rcu_read_unlock(); 1385 - return -EIDRM; 1378 + err = -EIDRM; 1379 + goto out_rcu_wakeup; 1386 1380 } 1387 1381 rcu_read_unlock(); 1388 1382 ··· 1410 1404 rcu_read_lock(); 1411 1405 sem_lock_and_putref(sma); 1412 1406 if (sma->sem_perm.deleted) { 1413 - sem_unlock(sma, -1); 1414 - rcu_read_unlock(); 1415 1407 err = -EIDRM; 1416 - goto out_free; 1408 + goto out_unlock; 1417 1409 } 1418 1410 1419 1411 for (i = 0; i < nsems; i++) ··· 1435 1431 goto out_rcu_wakeup; 1436 1432 1437 1433 sem_lock(sma, NULL, -1); 1434 + if (sma->sem_perm.deleted) { 1435 + err = -EIDRM; 1436 + goto out_unlock; 1437 + } 1438 1438 curr = &sma->sem_base[semnum]; 1439 1439 1440 1440 switch (cmd) { ··· 1844 1836 if (error) 1845 1837 goto out_rcu_wakeup; 1846 1838 1839 + error = -EIDRM; 1840 + locknum = sem_lock(sma, sops, nsops); 1841 + if (sma->sem_perm.deleted) 1842 + goto out_unlock_free; 1847 1843 /* 1848 
1844 * semid identifiers are not unique - find_alloc_undo may have 1849 1845 * allocated an undo structure, it was invalidated by an RMID ··· 1855 1843 * This case can be detected checking un->semid. The existence of 1856 1844 * "un" itself is guaranteed by rcu. 1857 1845 */ 1858 - error = -EIDRM; 1859 - locknum = sem_lock(sma, sops, nsops); 1860 1846 if (un && un->semid == -1) 1861 1847 goto out_unlock_free; 1862 1848 ··· 2067 2057 } 2068 2058 2069 2059 sem_lock(sma, NULL, -1); 2060 + /* exit_sem raced with IPC_RMID, nothing to do */ 2061 + if (sma->sem_perm.deleted) { 2062 + sem_unlock(sma, -1); 2063 + rcu_read_unlock(); 2064 + continue; 2065 + } 2070 2066 un = __lookup_undo(ulp, semid); 2071 2067 if (un == NULL) { 2072 2068 /* exit_sem raced with IPC_RMID+semget() that created
+21 -6
ipc/util.c
··· 17 17 * Pavel Emelianov <xemul@openvz.org> 18 18 * 19 19 * General sysv ipc locking scheme: 20 - * when doing ipc id lookups, take the ids->rwsem 21 - * rcu_read_lock() 22 - * obtain the ipc object (kern_ipc_perm) 23 - * perform security, capabilities, auditing and permission checks, etc. 24 - * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() 25 - * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) 20 + * rcu_read_lock() 21 + * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr 22 + * tree. 23 + * - perform initial checks (capabilities, auditing and permission, 24 + * etc). 25 + * - perform read-only operations, such as STAT, INFO commands. 26 + * acquire the ipc lock (kern_ipc_perm.lock) through 27 + * ipc_lock_object() 28 + * - perform data updates, such as SET, RMID commands and 29 + * mechanism-specific operations (semop/semtimedop, 30 + * msgsnd/msgrcv, shmat/shmdt). 31 + * drop the ipc lock, through ipc_unlock_object(). 32 + * rcu_read_unlock() 33 + * 34 + * The ids->rwsem must be taken when: 35 + * - creating, removing and iterating the existing entries in ipc 36 + * identifier sets. 37 + * - iterating through files under /proc/sysvipc/ 38 + * 39 + * Note that sems have a special fast path that avoids kern_ipc_perm.lock - 40 + * see sem_lock(). 26 41 */ 27 42 28 43 #include <linux/mm.h>
+3
lib/percpu-refcount.c
··· 53 53 ref->release = release; 54 54 return 0; 55 55 } 56 + EXPORT_SYMBOL_GPL(percpu_ref_init); 56 57 57 58 /** 58 59 * percpu_ref_cancel_init - cancel percpu_ref_init() ··· 85 84 free_percpu(ref->pcpu_count); 86 85 } 87 86 } 87 + EXPORT_SYMBOL_GPL(percpu_ref_cancel_init); 88 88 89 89 static void percpu_ref_kill_rcu(struct rcu_head *rcu) 90 90 { ··· 158 156 159 157 call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); 160 158 } 159 + EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
+1 -10
mm/filemap.c
··· 1616 1616 struct inode *inode = mapping->host; 1617 1617 pgoff_t offset = vmf->pgoff; 1618 1618 struct page *page; 1619 - bool memcg_oom; 1620 1619 pgoff_t size; 1621 1620 int ret = 0; 1622 1621 ··· 1624 1625 return VM_FAULT_SIGBUS; 1625 1626 1626 1627 /* 1627 - * Do we have something in the page cache already? Either 1628 - * way, try readahead, but disable the memcg OOM killer for it 1629 - * as readahead is optional and no errors are propagated up 1630 - * the fault stack. The OOM killer is enabled while trying to 1631 - * instantiate the faulting page individually below. 1628 + * Do we have something in the page cache already? 1632 1629 */ 1633 1630 page = find_get_page(mapping, offset); 1634 1631 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { ··· 1632 1637 * We found the page, so try async readahead before 1633 1638 * waiting for the lock. 1634 1639 */ 1635 - memcg_oom = mem_cgroup_toggle_oom(false); 1636 1640 do_async_mmap_readahead(vma, ra, file, page, offset); 1637 - mem_cgroup_toggle_oom(memcg_oom); 1638 1641 } else if (!page) { 1639 1642 /* No page in the page cache at all */ 1640 - memcg_oom = mem_cgroup_toggle_oom(false); 1641 1643 do_sync_mmap_readahead(vma, ra, file, offset); 1642 - mem_cgroup_toggle_oom(memcg_oom); 1643 1644 count_vm_event(PGMAJFAULT); 1644 1645 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1645 1646 ret = VM_FAULT_MAJOR;
+9 -1
mm/huge_memory.c
··· 2697 2697 2698 2698 mmun_start = haddr; 2699 2699 mmun_end = haddr + HPAGE_PMD_SIZE; 2700 + again: 2700 2701 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2701 2702 spin_lock(&mm->page_table_lock); 2702 2703 if (unlikely(!pmd_trans_huge(*pmd))) { ··· 2720 2719 split_huge_page(page); 2721 2720 2722 2721 put_page(page); 2723 - BUG_ON(pmd_trans_huge(*pmd)); 2722 + 2723 + /* 2724 + * We don't always have down_write of mmap_sem here: a racing 2725 + * do_huge_pmd_wp_page() might have copied-on-write to another 2726 + * huge page before our split_huge_page() got the anon_vma lock. 2727 + */ 2728 + if (unlikely(pmd_trans_huge(*pmd))) 2729 + goto again; 2724 2730 } 2725 2731 2726 2732 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+16 -1
mm/hugetlb.c
··· 653 653 BUG_ON(page_count(page)); 654 654 BUG_ON(page_mapcount(page)); 655 655 restore_reserve = PagePrivate(page); 656 + ClearPagePrivate(page); 656 657 657 658 spin_lock(&hugetlb_lock); 658 659 hugetlb_cgroup_uncharge_page(hstate_index(h), ··· 696 695 /* we rely on prep_new_huge_page to set the destructor */ 697 696 set_compound_order(page, order); 698 697 __SetPageHead(page); 698 + __ClearPageReserved(page); 699 699 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 700 700 __SetPageTail(p); 701 + /* 702 + * For gigantic hugepages allocated through bootmem at 703 + * boot, it's safer to be consistent with the not-gigantic 704 + * hugepages and clear the PG_reserved bit from all tail pages 705 + * too. Otherwise drivers using get_user_pages() to access tail 706 + * pages may get the reference counting wrong if they see 707 + * PG_reserved set on a tail page (despite the head page not 708 + * having PG_reserved set). Enforcing this consistency between 709 + * head and tail pages allows drivers to optimize away a check 710 + * on the head page when they need to know if put_page() is needed 711 + * after get_user_pages(). 712 + */ 713 + __ClearPageReserved(p); 701 714 set_page_count(p, 0); 702 715 p->first_page = page; 703 716 } ··· 1344 1329 #else 1345 1330 page = virt_to_page(m); 1346 1331 #endif 1347 - __ClearPageReserved(page); 1348 1332 WARN_ON(page_count(page) != 1); 1349 1333 prep_compound_huge_page(page, h->order); 1334 + WARN_ON(PageReserved(page)); 1350 1335 prep_new_huge_page(h, page, page_to_nid(page)); 1351 1336 /* 1352 1337 * If we had gigantic hugepages allocated at boot time, we need
+72 -105
mm/memcontrol.c
··· 866 866 unsigned long val = 0; 867 867 int cpu; 868 868 869 + get_online_cpus(); 869 870 for_each_online_cpu(cpu) 870 871 val += per_cpu(memcg->stat->events[idx], cpu); 871 872 #ifdef CONFIG_HOTPLUG_CPU ··· 874 873 val += memcg->nocpu_base.events[idx]; 875 874 spin_unlock(&memcg->pcp_counter_lock); 876 875 #endif 876 + put_online_cpus(); 877 877 return val; 878 878 } 879 879 ··· 2161 2159 memcg_wakeup_oom(memcg); 2162 2160 } 2163 2161 2164 - /* 2165 - * try to call OOM killer 2166 - */ 2167 2162 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2168 2163 { 2169 - bool locked; 2170 - int wakeups; 2171 - 2172 2164 if (!current->memcg_oom.may_oom) 2173 2165 return; 2174 - 2175 - current->memcg_oom.in_memcg_oom = 1; 2176 - 2177 2166 /* 2178 - * As with any blocking lock, a contender needs to start 2179 - * listening for wakeups before attempting the trylock, 2180 - * otherwise it can miss the wakeup from the unlock and sleep 2181 - * indefinitely. This is just open-coded because our locking 2182 - * is so particular to memcg hierarchies. 2167 + * We are in the middle of the charge context here, so we 2168 + * don't want to block when potentially sitting on a callstack 2169 + * that holds all kinds of filesystem and mm locks. 2170 + * 2171 + * Also, the caller may handle a failed allocation gracefully 2172 + * (like optional page cache readahead) and so an OOM killer 2173 + * invocation might not even be necessary. 2174 + * 2175 + * That's why we don't do anything here except remember the 2176 + * OOM context and then deal with it at the end of the page 2177 + * fault when the stack is unwound, the locks are released, 2178 + * and when we know whether the fault was overall successful. 
2183 2179 */ 2184 - wakeups = atomic_read(&memcg->oom_wakeups); 2180 + css_get(&memcg->css); 2181 + current->memcg_oom.memcg = memcg; 2182 + current->memcg_oom.gfp_mask = mask; 2183 + current->memcg_oom.order = order; 2184 + } 2185 + 2186 + /** 2187 + * mem_cgroup_oom_synchronize - complete memcg OOM handling 2188 + * @handle: actually kill/wait or just clean up the OOM state 2189 + * 2190 + * This has to be called at the end of a page fault if the memcg OOM 2191 + * handler was enabled. 2192 + * 2193 + * Memcg supports userspace OOM handling where failed allocations must 2194 + * sleep on a waitqueue until the userspace task resolves the 2195 + * situation. Sleeping directly in the charge context with all kinds 2196 + * of locks held is not a good idea, instead we remember an OOM state 2197 + * in the task and mem_cgroup_oom_synchronize() has to be called at 2198 + * the end of the page fault to complete the OOM handling. 2199 + * 2200 + * Returns %true if an ongoing memcg OOM situation was detected and 2201 + * completed, %false otherwise. 
2202 + */ 2203 + bool mem_cgroup_oom_synchronize(bool handle) 2204 + { 2205 + struct mem_cgroup *memcg = current->memcg_oom.memcg; 2206 + struct oom_wait_info owait; 2207 + bool locked; 2208 + 2209 + /* OOM is global, do not handle */ 2210 + if (!memcg) 2211 + return false; 2212 + 2213 + if (!handle) 2214 + goto cleanup; 2215 + 2216 + owait.memcg = memcg; 2217 + owait.wait.flags = 0; 2218 + owait.wait.func = memcg_oom_wake_function; 2219 + owait.wait.private = current; 2220 + INIT_LIST_HEAD(&owait.wait.task_list); 2221 + 2222 + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2185 2223 mem_cgroup_mark_under_oom(memcg); 2186 2224 2187 2225 locked = mem_cgroup_oom_trylock(memcg); ··· 2231 2189 2232 2190 if (locked && !memcg->oom_kill_disable) { 2233 2191 mem_cgroup_unmark_under_oom(memcg); 2234 - mem_cgroup_out_of_memory(memcg, mask, order); 2235 - mem_cgroup_oom_unlock(memcg); 2236 - /* 2237 - * There is no guarantee that an OOM-lock contender 2238 - * sees the wakeups triggered by the OOM kill 2239 - * uncharges. Wake any sleepers explicitely. 2240 - */ 2241 - memcg_oom_recover(memcg); 2192 + finish_wait(&memcg_oom_waitq, &owait.wait); 2193 + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2194 + current->memcg_oom.order); 2242 2195 } else { 2243 - /* 2244 - * A system call can just return -ENOMEM, but if this 2245 - * is a page fault and somebody else is handling the 2246 - * OOM already, we need to sleep on the OOM waitqueue 2247 - * for this memcg until the situation is resolved. 2248 - * Which can take some time because it might be 2249 - * handled by a userspace task. 2250 - * 2251 - * However, this is the charge context, which means 2252 - * that we may sit on a large call stack and hold 2253 - * various filesystem locks, the mmap_sem etc. and we 2254 - * don't want the OOM handler to deadlock on them 2255 - * while we sit here and wait. Store the current OOM 2256 - * context in the task_struct, then return -ENOMEM. 
2257 - * At the end of the page fault handler, with the 2258 - * stack unwound, pagefault_out_of_memory() will check 2259 - * back with us by calling 2260 - * mem_cgroup_oom_synchronize(), possibly putting the 2261 - * task to sleep. 2262 - */ 2263 - current->memcg_oom.oom_locked = locked; 2264 - current->memcg_oom.wakeups = wakeups; 2265 - css_get(&memcg->css); 2266 - current->memcg_oom.wait_on_memcg = memcg; 2267 - } 2268 - } 2269 - 2270 - /** 2271 - * mem_cgroup_oom_synchronize - complete memcg OOM handling 2272 - * 2273 - * This has to be called at the end of a page fault if the the memcg 2274 - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. 2275 - * 2276 - * Memcg supports userspace OOM handling, so failed allocations must 2277 - * sleep on a waitqueue until the userspace task resolves the 2278 - * situation. Sleeping directly in the charge context with all kinds 2279 - * of locks held is not a good idea, instead we remember an OOM state 2280 - * in the task and mem_cgroup_oom_synchronize() has to be called at 2281 - * the end of the page fault to put the task to sleep and clean up the 2282 - * OOM state. 2283 - * 2284 - * Returns %true if an ongoing memcg OOM situation was detected and 2285 - * finalized, %false otherwise. 2286 - */ 2287 - bool mem_cgroup_oom_synchronize(void) 2288 - { 2289 - struct oom_wait_info owait; 2290 - struct mem_cgroup *memcg; 2291 - 2292 - /* OOM is global, do not handle */ 2293 - if (!current->memcg_oom.in_memcg_oom) 2294 - return false; 2295 - 2296 - /* 2297 - * We invoked the OOM killer but there is a chance that a kill 2298 - * did not free up any charges. Everybody else might already 2299 - * be sleeping, so restart the fault and keep the rampage 2300 - * going until some charges are released. 
2301 - */ 2302 - memcg = current->memcg_oom.wait_on_memcg; 2303 - if (!memcg) 2304 - goto out; 2305 - 2306 - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2307 - goto out_memcg; 2308 - 2309 - owait.memcg = memcg; 2310 - owait.wait.flags = 0; 2311 - owait.wait.func = memcg_oom_wake_function; 2312 - owait.wait.private = current; 2313 - INIT_LIST_HEAD(&owait.wait.task_list); 2314 - 2315 - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2316 - /* Only sleep if we didn't miss any wakeups since OOM */ 2317 - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) 2318 2196 schedule(); 2319 - finish_wait(&memcg_oom_waitq, &owait.wait); 2320 - out_memcg: 2321 - mem_cgroup_unmark_under_oom(memcg); 2322 - if (current->memcg_oom.oom_locked) { 2197 + mem_cgroup_unmark_under_oom(memcg); 2198 + finish_wait(&memcg_oom_waitq, &owait.wait); 2199 + } 2200 + 2201 + if (locked) { 2323 2202 mem_cgroup_oom_unlock(memcg); 2324 2203 /* 2325 2204 * There is no guarantee that an OOM-lock contender ··· 2249 2286 */ 2250 2287 memcg_oom_recover(memcg); 2251 2288 } 2289 + cleanup: 2290 + current->memcg_oom.memcg = NULL; 2252 2291 css_put(&memcg->css); 2253 - current->memcg_oom.wait_on_memcg = NULL; 2254 - out: 2255 - current->memcg_oom.in_memcg_oom = 0; 2256 2292 return true; 2257 2293 } 2258 2294 ··· 2665 2703 || fatal_signal_pending(current))) 2666 2704 goto bypass; 2667 2705 2706 + if (unlikely(task_in_memcg_oom(current))) 2707 + goto bypass; 2708 + 2668 2709 /* 2669 2710 * We always charge the cgroup the mm_struct belongs to. 2670 2711 * The mm_struct's mem_cgroup changes on task migration if the ··· 2766 2801 return 0; 2767 2802 nomem: 2768 2803 *ptr = NULL; 2804 + if (gfp_mask & __GFP_NOFAIL) 2805 + return 0; 2769 2806 return -ENOMEM; 2770 2807 bypass: 2771 2808 *ptr = root_mem_cgroup;
+14 -6
mm/memory.c
··· 837 837 */ 838 838 make_migration_entry_read(&entry); 839 839 pte = swp_entry_to_pte(entry); 840 + if (pte_swp_soft_dirty(*src_pte)) 841 + pte = pte_swp_mksoft_dirty(pte); 840 842 set_pte_at(src_mm, addr, src_pte, pte); 841 843 } 842 844 } ··· 3865 3863 * space. Kernel faults are handled more gracefully. 3866 3864 */ 3867 3865 if (flags & FAULT_FLAG_USER) 3868 - mem_cgroup_enable_oom(); 3866 + mem_cgroup_oom_enable(); 3869 3867 3870 3868 ret = __handle_mm_fault(mm, vma, address, flags); 3871 3869 3872 - if (flags & FAULT_FLAG_USER) 3873 - mem_cgroup_disable_oom(); 3874 - 3875 - if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) 3876 - mem_cgroup_oom_synchronize(); 3870 + if (flags & FAULT_FLAG_USER) { 3871 + mem_cgroup_oom_disable(); 3872 + /* 3873 + * The task may have entered a memcg OOM situation but 3874 + * if the allocation error was handled gracefully (no 3875 + * VM_FAULT_OOM), there is no need to kill anything. 3876 + * Just clean up the OOM state peacefully. 3877 + */ 3878 + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) 3879 + mem_cgroup_oom_synchronize(false); 3880 + } 3877 3881 3878 3882 return ret; 3879 3883 }
+2
mm/migrate.c
··· 161 161 162 162 get_page(new); 163 163 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 164 + if (pte_swp_soft_dirty(*ptep)) 165 + pte = pte_mksoft_dirty(pte); 164 166 if (is_write_migration_entry(entry)) 165 167 pte = pte_mkwrite(pte); 166 168 #ifdef CONFIG_HUGETLB_PAGE
+5 -2
mm/mprotect.c
··· 94 94 swp_entry_t entry = pte_to_swp_entry(oldpte); 95 95 96 96 if (is_write_migration_entry(entry)) { 97 + pte_t newpte; 97 98 /* 98 99 * A protection check is difficult so 99 100 * just be safe and disable write 100 101 */ 101 102 make_migration_entry_read(&entry); 102 - set_pte_at(mm, addr, pte, 103 - swp_entry_to_pte(entry)); 103 + newpte = swp_entry_to_pte(entry); 104 + if (pte_swp_soft_dirty(oldpte)) 105 + newpte = pte_swp_mksoft_dirty(newpte); 106 + set_pte_at(mm, addr, pte, newpte); 104 107 } 105 108 pages++; 106 109 }
+1 -4
mm/mremap.c
··· 25 25 #include <asm/uaccess.h> 26 26 #include <asm/cacheflush.h> 27 27 #include <asm/tlbflush.h> 28 - #include <asm/pgalloc.h> 29 28 30 29 #include "internal.h" 31 30 ··· 62 63 return NULL; 63 64 64 65 pmd = pmd_alloc(mm, pud, addr); 65 - if (!pmd) { 66 - pud_free(mm, pud); 66 + if (!pmd) 67 67 return NULL; 68 - } 69 68 70 69 VM_BUG_ON(pmd_trans_huge(*pmd)); 71 70
+1 -1
mm/oom_kill.c
··· 680 680 { 681 681 struct zonelist *zonelist; 682 682 683 - if (mem_cgroup_oom_synchronize()) 683 + if (mem_cgroup_oom_synchronize(true)) 684 684 return; 685 685 686 686 zonelist = node_zonelist(first_online_node, GFP_KERNEL);
+5 -5
mm/page-writeback.c
··· 1210 1210 return 1; 1211 1211 } 1212 1212 1213 - static long bdi_max_pause(struct backing_dev_info *bdi, 1214 - unsigned long bdi_dirty) 1213 + static unsigned long bdi_max_pause(struct backing_dev_info *bdi, 1214 + unsigned long bdi_dirty) 1215 1215 { 1216 - long bw = bdi->avg_write_bandwidth; 1217 - long t; 1216 + unsigned long bw = bdi->avg_write_bandwidth; 1217 + unsigned long t; 1218 1218 1219 1219 /* 1220 1220 * Limit pause time for small memory systems. If sleeping for too long ··· 1226 1226 t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); 1227 1227 t++; 1228 1228 1229 - return min_t(long, t, MAX_PAUSE); 1229 + return min_t(unsigned long, t, MAX_PAUSE); 1230 1230 } 1231 1231 1232 1232 static long bdi_min_pause(struct backing_dev_info *bdi,
+3 -1
mm/swapfile.c
··· 1824 1824 struct filename *pathname; 1825 1825 int i, type, prev; 1826 1826 int err; 1827 + unsigned int old_block_size; 1827 1828 1828 1829 if (!capable(CAP_SYS_ADMIN)) 1829 1830 return -EPERM; ··· 1915 1914 } 1916 1915 1917 1916 swap_file = p->swap_file; 1917 + old_block_size = p->old_block_size; 1918 1918 p->swap_file = NULL; 1919 1919 p->max = 0; 1920 1920 swap_map = p->swap_map; ··· 1940 1938 inode = mapping->host; 1941 1939 if (S_ISBLK(inode->i_mode)) { 1942 1940 struct block_device *bdev = I_BDEV(inode); 1943 - set_blocksize(bdev, p->old_block_size); 1941 + set_blocksize(bdev, old_block_size); 1944 1942 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 1945 1943 } else { 1946 1944 mutex_lock(&inode->i_mutex);
+1
mm/vmscan.c
··· 211 211 down_write(&shrinker_rwsem); 212 212 list_del(&shrinker->list); 213 213 up_write(&shrinker_rwsem); 214 + kfree(shrinker->nr_deferred); 214 215 } 215 216 EXPORT_SYMBOL(unregister_shrinker); 216 217
+4
mm/zswap.c
··· 804 804 } 805 805 tree->rbroot = RB_ROOT; 806 806 spin_unlock(&tree->lock); 807 + 808 + zbud_destroy_pool(tree->pool); 809 + kfree(tree); 810 + zswap_trees[type] = NULL; 807 811 } 808 812 809 813 static struct zbud_ops zswap_zbud_ops = {
+1 -1
tools/testing/selftests/timers/posix_timers.c
··· 151 151 fflush(stdout); 152 152 153 153 done = 0; 154 - timer_create(which, NULL, &id); 154 + err = timer_create(which, NULL, &id); 155 155 if (err < 0) { 156 156 perror("Can't create timer\n"); 157 157 return -1;