Merge branch 'akpm' (patches from Andrew)

+26 -16

fs/userfaultfd.c

···
40 40 /*
41 41  * Start with fault_pending_wqh and fault_wqh so they're more likely
42 42  * to be in the same cacheline.
43 +  *
44 +  * Locking order:
45 +  *	fd_wqh.lock
46 +  *		fault_pending_wqh.lock
47 +  *			fault_wqh.lock
48 +  *		event_wqh.lock
49 +  *
50 +  * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
51 +  * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
52 +  * also taken in IRQ context.
43 53  */
44 54 struct userfaultfd_ctx {
45 55 	/* waitqueue head for the pending (i.e. not read) userfaults */
···
468 458 	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
469 459 			 TASK_KILLABLE;
470 460
471 - 	spin_lock(&ctx->fault_pending_wqh.lock);
461 + 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
472 462 	/*
473 463 	 * After the __add_wait_queue the uwq is visible to userland
474 464 	 * through poll/read().
···
480 470 	 * __add_wait_queue.
481 471 	 */
482 472 	set_current_state(blocking_state);
483 - 	spin_unlock(&ctx->fault_pending_wqh.lock);
473 + 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
484 474
485 475 	if (!is_vm_hugetlb_page(vmf->vma))
486 476 		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
···
562 552 	 * kernel stack can be released after the list_del_init.
563 553 	 */
564 554 	if (!list_empty_careful(&uwq.wq.entry)) {
565 - 		spin_lock(&ctx->fault_pending_wqh.lock);
555 + 		spin_lock_irq(&ctx->fault_pending_wqh.lock);
566 556 		/*
567 557 		 * No need of list_del_init(), the uwq on the stack
568 558 		 * will be freed shortly anyway.
569 559 		 */
570 560 		list_del(&uwq.wq.entry);
571 - 		spin_unlock(&ctx->fault_pending_wqh.lock);
561 + 		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
572 562 	}
573 563
574 564 	/*
···
593 583 	init_waitqueue_entry(&ewq->wq, current);
594 584 	release_new_ctx = NULL;
595 585
596 - 	spin_lock(&ctx->event_wqh.lock);
586 + 	spin_lock_irq(&ctx->event_wqh.lock);
597 587 	/*
598 588 	 * After the __add_wait_queue the uwq is visible to userland
599 589 	 * through poll/read().
···
623 613 			break;
624 614 		}
625 615
626 - 		spin_unlock(&ctx->event_wqh.lock);
616 + 		spin_unlock_irq(&ctx->event_wqh.lock);
627 617
628 618 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
629 619 		schedule();
630 620
631 - 		spin_lock(&ctx->event_wqh.lock);
621 + 		spin_lock_irq(&ctx->event_wqh.lock);
632 622 	}
633 623 	__set_current_state(TASK_RUNNING);
634 - 	spin_unlock(&ctx->event_wqh.lock);
624 + 	spin_unlock_irq(&ctx->event_wqh.lock);
635 625
636 626 	if (release_new_ctx) {
637 627 		struct vm_area_struct *vma;
···
928 918 	 * the last page faults that may have been already waiting on
929 919 	 * the fault_*wqh.
930 920 	 */
931 - 	spin_lock(&ctx->fault_pending_wqh.lock);
921 + 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
932 922 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
933 923 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
934 - 	spin_unlock(&ctx->fault_pending_wqh.lock);
924 + 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
935 925
936 926 	/* Flush pending events that may still wait on event_wqh */
937 927 	wake_up_all(&ctx->event_wqh);
···
1144 1134
1145 1135 	if (!ret && msg->event == UFFD_EVENT_FORK) {
1146 1136 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
1147 - 		spin_lock(&ctx->event_wqh.lock);
1137 + 		spin_lock_irq(&ctx->event_wqh.lock);
1148 1138 		if (!list_empty(&fork_event)) {
1149 1139 			/*
1150 1140 			 * The fork thread didn't abort, so we can
···
1190 1180 			if (ret)
1191 1181 				userfaultfd_ctx_put(fork_nctx);
1192 1182 		}
1193 - 		spin_unlock(&ctx->event_wqh.lock);
1183 + 		spin_unlock_irq(&ctx->event_wqh.lock);
1194 1184 	}
1195 1185
1196 1186 	return ret;
···
1229 1219 static void __wake_userfault(struct userfaultfd_ctx *ctx,
1230 1220 			     struct userfaultfd_wake_range *range)
1231 1221 {
1232 - 	spin_lock(&ctx->fault_pending_wqh.lock);
1222 + 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
1233 1223 	/* wake all in the range and autoremove */
1234 1224 	if (waitqueue_active(&ctx->fault_pending_wqh))
1235 1225 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1236 1226 				     range);
1237 1227 	if (waitqueue_active(&ctx->fault_wqh))
1238 1228 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1239 - 	spin_unlock(&ctx->fault_pending_wqh.lock);
1229 + 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1240 1230 }
1241 1231
1242 1232 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
···
1891 1881 	wait_queue_entry_t *wq;
1892 1882 	unsigned long pending = 0, total = 0;
1893 1883
1894 - 	spin_lock(&ctx->fault_pending_wqh.lock);
1884 + 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
1895 1885 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
1896 1886 		pending++;
1897 1887 		total++;
···
1899 1889 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
1900 1890 		total++;
1901 1891 	}
1902 - 	spin_unlock(&ctx->fault_pending_wqh.lock);
1892 + 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1903 1893
1904 1894 	/*
1905 1895 	 * If more protocols will be added, there will be all shown

+2 -1

include/linux/device.h

···
704 704 					 gfp_t gfp_mask, unsigned int order);
705 705 extern void devm_free_pages(struct device *dev, unsigned long addr);
706 706
707 - void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res);
707 + void __iomem *devm_ioremap_resource(struct device *dev,
708 + 				    const struct resource *res);
708 709
709 710 void __iomem *devm_of_iomap(struct device *dev,
710 711 			    struct device_node *node, int index,

+2 -1

lib/devres.c

···
131 131  *	if (IS_ERR(base))
132 132  *		return PTR_ERR(base);
133 133  */
134 - void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res)
134 + void __iomem *devm_ioremap_resource(struct device *dev,
135 + 				    const struct resource *res)
135 136 {
136 137 	resource_size_t size;
137 138 	void __iomem *dest_ptr;

+2 -1

mm/page_alloc.c

···
1826 1826 					first_deferred_pfn)) {
1827 1827 		pgdat->first_deferred_pfn = ULONG_MAX;
1828 1828 		pgdat_resize_unlock(pgdat, &flags);
1829 - 		return true;
1829 + 		/* Retry only once. */
1830 + 		return first_deferred_pfn != ULONG_MAX;
1830 1831 	}
1831 1832
1832 1833 	/*

+8 -5

mm/page_io.c

···
137 137 	unlock_page(page);
138 138 	WRITE_ONCE(bio->bi_private, NULL);
139 139 	bio_put(bio);
140 - 	blk_wake_io_task(waiter);
141 - 	put_task_struct(waiter);
140 + 	if (waiter) {
141 + 		blk_wake_io_task(waiter);
142 + 		put_task_struct(waiter);
143 + 	}
142 144 }
143 145
144 146 int generic_swapfile_activate(struct swap_info_struct *sis,
···
397 395 	 * Keep this task valid during swap readpage because the oom killer may
398 396 	 * attempt to access it in the page fault retry time check.
399 397 	 */
400 - 	get_task_struct(current);
401 - 	bio->bi_private = current;
402 398 	bio_set_op_attrs(bio, REQ_OP_READ, 0);
403 - 	if (synchronous)
399 + 	if (synchronous) {
404 400 		bio->bi_opf |= REQ_HIPRI;
401 + 		get_task_struct(current);
402 + 		bio->bi_private = current;
403 + 	}
405 404 	count_vm_event(PSWPIN);
406 405 	bio_get(bio);
407 406 	qc = submit_bio(bio);

+15 -12

mm/vmscan.c

···
3644 3644 }
3645 3645
3646 3646 /*
3647 -  * pgdat->kswapd_classzone_idx is the highest zone index that a recent
3648 -  * allocation request woke kswapd for. When kswapd has not woken recently,
3649 -  * the value is MAX_NR_ZONES which is not a valid index. This compares a
3650 -  * given classzone and returns it or the highest classzone index kswapd
3651 -  * was recently woke for.
3647 +  * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
3648 +  * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
3649 +  * a valid index then either kswapd runs for first time or kswapd couldn't sleep
3650 +  * after previous reclaim attempt (node is still unbalanced). In that case
3651 +  * return the zone index of the previous kswapd reclaim cycle.
3652 3652  */
3653 3653 static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3654 - 					   enum zone_type classzone_idx)
3654 + 					   enum zone_type prev_classzone_idx)
3655 3655 {
3656 3656 	if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3657 - 		return classzone_idx;
3658 -
3659 - 	return max(pgdat->kswapd_classzone_idx, classzone_idx);
3657 + 		return prev_classzone_idx;
3658 + 	return pgdat->kswapd_classzone_idx;
3660 3659 }
3661 3660
3662 3661 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
···
3796 3797
3797 3798 		/* Read the new order and classzone_idx */
3798 3799 		alloc_order = reclaim_order = pgdat->kswapd_order;
3799 - 		classzone_idx = kswapd_classzone_idx(pgdat, 0);
3800 + 		classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3800 3801 		pgdat->kswapd_order = 0;
3801 3802 		pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3802 3803
···
3850 3851 	if (!cpuset_zone_allowed(zone, gfp_flags))
3851 3852 		return;
3852 3853 	pgdat = zone->zone_pgdat;
3853 - 	pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
3854 - 							   classzone_idx);
3854 +
3855 + 	if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3856 + 		pgdat->kswapd_classzone_idx = classzone_idx;
3857 + 	else
3858 + 		pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
3859 + 						  classzone_idx);
3855 3860 	pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3856 3861 	if (!waitqueue_active(&pgdat->kswapd_wait))
3857 3862 		return;

Configure Feed

Configure Feed