Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block layer fixes from Jens Axboe:
"A final set of fixes for 4.3.

It is (again) bigger than I would have liked, but it's all been
through the testing mill and has been carefully reviewed by multiple
parties. Each fix is either a regression fix for this cycle, or is
marked stable. You can scold me at KS. The pull request contains:

- Three simple fixes for NVMe, fixing regressions since 4.3. From
Arnd, Christoph, and Keith.

- A single xen-blkfront fix from Cathy, fixing a NULL dereference if
an error is returned through the state change callback.

- Fixup for some bad/sloppy code in nbd that got introduced earlier
in this cycle. From Markus Pargmann.

- A blk-mq tagset use-after-free fix from Junichi.

- A backing device lifetime fix from Tejun, fixing a crash.

- And finally, a set of regression/stable fixes for cgroup writeback
from Tejun"

* 'for-linus' of git://git.kernel.dk/linux-block:
writeback: remove broken rbtree_postorder_for_each_entry_safe() usage in cgwb_bdi_destroy()
NVMe: Fix memory leak on retried commands
block: don't release bdi while request_queue has live references
nvme: use an integer value to Linux errno values
blk-mq: fix use-after-free in blk_mq_free_tag_set()
nvme: fix 32-bit build warning
writeback: fix incorrect calculation of available memory for memcg domains
writeback: memcg dirty_throttle_control should be initialized with wb->memcg_completions
writeback: bdi_writeback iteration must not skip dying ones
writeback: fix bdi_writeback iteration in wakeup_dirtytime_writeback()
writeback: laptop_mode_timer_fn() needs rcu_read_lock() around bdi_writeback iteration
nbd: Add locking for tasks
xen-blkfront: check for null drvdata in blkback_changed (XenbusStateClosing)

+167 -144
+1 -1
block/blk-core.c
··· 576 576 q->queue_lock = &q->__queue_lock; 577 577 spin_unlock_irq(lock); 578 578 579 - bdi_destroy(&q->backing_dev_info); 579 + bdi_unregister(&q->backing_dev_info); 580 580 581 581 /* @q is and will stay empty, shutdown and put */ 582 582 blk_put_queue(q);
+1
block/blk-mq-tag.c
··· 641 641 { 642 642 bt_free(&tags->bitmap_tags); 643 643 bt_free(&tags->breserved_tags); 644 + free_cpumask_var(tags->cpumask); 644 645 kfree(tags); 645 646 } 646 647
+1 -3
block/blk-mq.c
··· 2296 2296 int i; 2297 2297 2298 2298 for (i = 0; i < set->nr_hw_queues; i++) { 2299 - if (set->tags[i]) { 2299 + if (set->tags[i]) 2300 2300 blk_mq_free_rq_map(set, set->tags[i], i); 2301 - free_cpumask_var(set->tags[i]->cpumask); 2302 - } 2303 2301 } 2304 2302 2305 2303 kfree(set->tags);
+1
block/blk-sysfs.c
··· 540 540 struct request_queue *q = 541 541 container_of(kobj, struct request_queue, kobj); 542 542 543 + bdi_exit(&q->backing_dev_info); 543 544 blkcg_exit_queue(q); 544 545 545 546 if (q->elevator) {
+30 -6
drivers/block/nbd.c
··· 60 60 bool disconnect; /* a disconnect has been requested by user */ 61 61 62 62 struct timer_list timeout_timer; 63 + spinlock_t tasks_lock; 63 64 struct task_struct *task_recv; 64 65 struct task_struct *task_send; 65 66 ··· 141 140 static void nbd_xmit_timeout(unsigned long arg) 142 141 { 143 142 struct nbd_device *nbd = (struct nbd_device *)arg; 144 - struct task_struct *task; 143 + unsigned long flags; 145 144 146 145 if (list_empty(&nbd->queue_head)) 147 146 return; 148 147 149 148 nbd->disconnect = true; 150 149 151 - task = READ_ONCE(nbd->task_recv); 152 - if (task) 153 - force_sig(SIGKILL, task); 150 + spin_lock_irqsave(&nbd->tasks_lock, flags); 154 151 155 - task = READ_ONCE(nbd->task_send); 156 - if (task) 152 + if (nbd->task_recv) 153 + force_sig(SIGKILL, nbd->task_recv); 154 + 155 + if (nbd->task_send) 157 156 force_sig(SIGKILL, nbd->task_send); 157 + 158 + spin_unlock_irqrestore(&nbd->tasks_lock, flags); 158 159 159 160 dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n"); 160 161 } ··· 406 403 { 407 404 struct request *req; 408 405 int ret; 406 + unsigned long flags; 409 407 410 408 BUG_ON(nbd->magic != NBD_MAGIC); 411 409 412 410 sk_set_memalloc(nbd->sock->sk); 413 411 412 + spin_lock_irqsave(&nbd->tasks_lock, flags); 414 413 nbd->task_recv = current; 414 + spin_unlock_irqrestore(&nbd->tasks_lock, flags); 415 415 416 416 ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); 417 417 if (ret) { 418 418 dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); 419 + 420 + spin_lock_irqsave(&nbd->tasks_lock, flags); 419 421 nbd->task_recv = NULL; 422 + spin_unlock_irqrestore(&nbd->tasks_lock, flags); 423 + 420 424 return ret; 421 425 } 422 426 ··· 439 429 440 430 device_remove_file(disk_to_dev(nbd->disk), &pid_attr); 441 431 432 + spin_lock_irqsave(&nbd->tasks_lock, flags); 442 433 nbd->task_recv = NULL; 434 + spin_unlock_irqrestore(&nbd->tasks_lock, flags); 443 435 444 436 if 
(signal_pending(current)) { 445 437 siginfo_t info; ··· 546 534 { 547 535 struct nbd_device *nbd = data; 548 536 struct request *req; 537 + unsigned long flags; 549 538 539 + spin_lock_irqsave(&nbd->tasks_lock, flags); 550 540 nbd->task_send = current; 541 + spin_unlock_irqrestore(&nbd->tasks_lock, flags); 551 542 552 543 set_user_nice(current, MIN_NICE); 553 544 while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { ··· 587 572 nbd_handle_req(nbd, req); 588 573 } 589 574 575 + spin_lock_irqsave(&nbd->tasks_lock, flags); 590 576 nbd->task_send = NULL; 577 + spin_unlock_irqrestore(&nbd->tasks_lock, flags); 578 + 579 + /* Clear maybe pending signals */ 580 + if (signal_pending(current)) { 581 + siginfo_t info; 582 + dequeue_signal_lock(current, &current->blocked, &info); 583 + } 591 584 592 585 return 0; 593 586 } ··· 1075 1052 nbd_dev[i].magic = NBD_MAGIC; 1076 1053 INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); 1077 1054 spin_lock_init(&nbd_dev[i].queue_lock); 1055 + spin_lock_init(&nbd_dev[i].tasks_lock); 1078 1056 INIT_LIST_HEAD(&nbd_dev[i].queue_head); 1079 1057 mutex_init(&nbd_dev[i].tx_lock); 1080 1058 init_timer(&nbd_dev[i].timeout_timer);
+15 -9
drivers/block/nvme-core.c
··· 603 603 struct nvme_iod *iod = ctx; 604 604 struct request *req = iod_get_private(iod); 605 605 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 606 - 607 606 u16 status = le16_to_cpup(&cqe->status) >> 1; 607 + bool requeue = false; 608 + int error = 0; 608 609 609 610 if (unlikely(status)) { 610 611 if (!(status & NVME_SC_DNR || blk_noretry_request(req)) 611 612 && (jiffies - req->start_time) < req->timeout) { 612 613 unsigned long flags; 613 614 615 + requeue = true; 614 616 blk_mq_requeue_request(req); 615 617 spin_lock_irqsave(req->q->queue_lock, flags); 616 618 if (!blk_queue_stopped(req->q)) 617 619 blk_mq_kick_requeue_list(req->q); 618 620 spin_unlock_irqrestore(req->q->queue_lock, flags); 619 - return; 621 + goto release_iod; 620 622 } 621 623 622 624 if (req->cmd_type == REQ_TYPE_DRV_PRIV) { 623 625 if (cmd_rq->ctx == CMD_CTX_CANCELLED) 624 - status = -EINTR; 626 + error = -EINTR; 627 + else 628 + error = status; 625 629 } else { 626 - status = nvme_error_status(status); 630 + error = nvme_error_status(status); 627 631 } 628 632 } 629 633 ··· 639 635 if (cmd_rq->aborted) 640 636 dev_warn(nvmeq->dev->dev, 641 637 "completing aborted command with status:%04x\n", 642 - status); 638 + error); 643 639 640 + release_iod: 644 641 if (iod->nents) { 645 642 dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, 646 643 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); ··· 654 649 } 655 650 nvme_free_iod(nvmeq->dev, iod); 656 651 657 - blk_mq_complete_request(req, status); 652 + if (likely(!requeue)) 653 + blk_mq_complete_request(req, error); 658 654 } 659 655 660 656 /* length is in bytes. gfp flags indicates whether we may sleep. 
*/ ··· 1810 1804 1811 1805 length = (io.nblocks + 1) << ns->lba_shift; 1812 1806 meta_len = (io.nblocks + 1) * ns->ms; 1813 - metadata = (void __user *)(unsigned long)io.metadata; 1807 + metadata = (void __user *)(uintptr_t)io.metadata; 1814 1808 write = io.opcode & 1; 1815 1809 1816 1810 if (ns->ext) { ··· 1850 1844 c.rw.metadata = cpu_to_le64(meta_dma); 1851 1845 1852 1846 status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, 1853 - (void __user *)io.addr, length, NULL, 0); 1847 + (void __user *)(uintptr_t)io.addr, length, NULL, 0); 1854 1848 unmap: 1855 1849 if (meta) { 1856 1850 if (status == NVME_SC_SUCCESS && !write) { ··· 1892 1886 timeout = msecs_to_jiffies(cmd.timeout_ms); 1893 1887 1894 1888 status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c, 1895 - NULL, (void __user *)cmd.addr, cmd.data_len, 1889 + NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, 1896 1890 &cmd.result, timeout); 1897 1891 if (status >= 0) { 1898 1892 if (put_user(cmd.result, &ucmd->result))
+2 -1
drivers/block/xen-blkfront.c
··· 1956 1956 break; 1957 1957 /* Missed the backend's Closing state -- fallthrough */ 1958 1958 case XenbusStateClosing: 1959 - blkfront_closing(info); 1959 + if (info) 1960 + blkfront_closing(info); 1960 1961 break; 1961 1962 } 1962 1963 }
+24 -11
fs/fs-writeback.c
··· 778 778 struct wb_writeback_work *base_work, 779 779 bool skip_if_busy) 780 780 { 781 - int next_memcg_id = 0; 782 - struct bdi_writeback *wb; 783 - struct wb_iter iter; 781 + struct bdi_writeback *last_wb = NULL; 782 + struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list, 783 + struct bdi_writeback, bdi_node); 784 784 785 785 might_sleep(); 786 786 restart: 787 787 rcu_read_lock(); 788 - bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { 788 + list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { 789 789 DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); 790 790 struct wb_writeback_work fallback_work; 791 791 struct wb_writeback_work *work; 792 792 long nr_pages; 793 + 794 + if (last_wb) { 795 + wb_put(last_wb); 796 + last_wb = NULL; 797 + } 793 798 794 799 /* SYNC_ALL writes out I_DIRTY_TIME too */ 795 800 if (!wb_has_dirty_io(wb) && ··· 824 819 825 820 wb_queue_work(wb, work); 826 821 827 - next_memcg_id = wb->memcg_css->id + 1; 822 + /* 823 + * Pin @wb so that it stays on @bdi->wb_list. This allows 824 + * continuing iteration from @wb after dropping and 825 + * regrabbing rcu read lock. 
826 + */ 827 + wb_get(wb); 828 + last_wb = wb; 829 + 828 830 rcu_read_unlock(); 829 831 wb_wait_for_completion(bdi, &fallback_work_done); 830 832 goto restart; 831 833 } 832 834 rcu_read_unlock(); 835 + 836 + if (last_wb) 837 + wb_put(last_wb); 833 838 } 834 839 835 840 #else /* CONFIG_CGROUP_WRITEBACK */ ··· 1872 1857 rcu_read_lock(); 1873 1858 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1874 1859 struct bdi_writeback *wb; 1875 - struct wb_iter iter; 1876 1860 1877 1861 if (!bdi_has_dirty_io(bdi)) 1878 1862 continue; 1879 1863 1880 - bdi_for_each_wb(wb, bdi, &iter, 0) 1864 + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) 1881 1865 wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), 1882 1866 false, reason); 1883 1867 } ··· 1908 1894 rcu_read_lock(); 1909 1895 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1910 1896 struct bdi_writeback *wb; 1911 - struct wb_iter iter; 1912 1897 1913 - bdi_for_each_wb(wb, bdi, &iter, 0) 1914 - if (!list_empty(&bdi->wb.b_dirty_time)) 1915 - wb_wakeup(&bdi->wb); 1898 + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) 1899 + if (!list_empty(&wb->b_dirty_time)) 1900 + wb_wakeup(wb); 1916 1901 } 1917 1902 rcu_read_unlock(); 1918 1903 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+3
include/linux/backing-dev-defs.h
··· 116 116 struct list_head work_list; 117 117 struct delayed_work dwork; /* work item used for writeback */ 118 118 119 + struct list_head bdi_node; /* anchored at bdi->wb_list */ 120 + 119 121 #ifdef CONFIG_CGROUP_WRITEBACK 120 122 struct percpu_ref refcnt; /* used only for !root wb's */ 121 123 struct fprop_local_percpu memcg_completions; ··· 152 150 atomic_long_t tot_write_bandwidth; 153 151 154 152 struct bdi_writeback wb; /* the root writeback info for this bdi */ 153 + struct list_head wb_list; /* list of all wbs */ 155 154 #ifdef CONFIG_CGROUP_WRITEBACK 156 155 struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ 157 156 struct rb_root cgwb_congested_tree; /* their congested states */
+5 -64
include/linux/backing-dev.h
··· 19 19 #include <linux/slab.h> 20 20 21 21 int __must_check bdi_init(struct backing_dev_info *bdi); 22 - void bdi_destroy(struct backing_dev_info *bdi); 22 + void bdi_exit(struct backing_dev_info *bdi); 23 23 24 24 __printf(3, 4) 25 25 int bdi_register(struct backing_dev_info *bdi, struct device *parent, 26 26 const char *fmt, ...); 27 27 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); 28 + void bdi_unregister(struct backing_dev_info *bdi); 29 + 28 30 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); 31 + void bdi_destroy(struct backing_dev_info *bdi); 32 + 29 33 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, 30 34 bool range_cyclic, enum wb_reason reason); 31 35 void wb_start_background_writeback(struct bdi_writeback *wb); ··· 412 408 rcu_read_unlock(); 413 409 } 414 410 415 - struct wb_iter { 416 - int start_memcg_id; 417 - struct radix_tree_iter tree_iter; 418 - void **slot; 419 - }; 420 - 421 - static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, 422 - struct backing_dev_info *bdi) 423 - { 424 - struct radix_tree_iter *titer = &iter->tree_iter; 425 - 426 - WARN_ON_ONCE(!rcu_read_lock_held()); 427 - 428 - if (iter->start_memcg_id >= 0) { 429 - iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); 430 - iter->start_memcg_id = -1; 431 - } else { 432 - iter->slot = radix_tree_next_slot(iter->slot, titer, 0); 433 - } 434 - 435 - if (!iter->slot) 436 - iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0); 437 - if (iter->slot) 438 - return *iter->slot; 439 - return NULL; 440 - } 441 - 442 - static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, 443 - struct backing_dev_info *bdi, 444 - int start_memcg_id) 445 - { 446 - iter->start_memcg_id = start_memcg_id; 447 - 448 - if (start_memcg_id) 449 - return __wb_iter_next(iter, bdi); 450 - else 451 - return &bdi->wb; 452 - } 453 - 454 - /** 455 - * bdi_for_each_wb - walk all wb's of a bdi in ascending 
memcg ID order 456 - * @wb_cur: cursor struct bdi_writeback pointer 457 - * @bdi: bdi to walk wb's of 458 - * @iter: pointer to struct wb_iter to be used as iteration buffer 459 - * @start_memcg_id: memcg ID to start iteration from 460 - * 461 - * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending 462 - * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter 463 - * to be used as temp storage during iteration. rcu_read_lock() must be 464 - * held throughout iteration. 465 - */ 466 - #define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ 467 - for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ 468 - (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) 469 - 470 411 #else /* CONFIG_CGROUP_WRITEBACK */ 471 412 472 413 static inline bool inode_cgwb_enabled(struct inode *inode) ··· 470 521 static inline void wb_blkcg_offline(struct blkcg *blkcg) 471 522 { 472 523 } 473 - 474 - struct wb_iter { 475 - int next_id; 476 - }; 477 - 478 - #define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ 479 - for ((iter)->next_id = (start_blkcg_id); \ 480 - ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); ) 481 524 482 525 static inline int inode_congested(struct inode *inode, int cong_bits) 483 526 {
+5 -3
include/linux/memcontrol.h
··· 676 676 677 677 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); 678 678 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); 679 - void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, 680 - unsigned long *pdirty, unsigned long *pwriteback); 679 + void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 680 + unsigned long *pheadroom, unsigned long *pdirty, 681 + unsigned long *pwriteback); 681 682 682 683 #else /* CONFIG_CGROUP_WRITEBACK */ 683 684 ··· 688 687 } 689 688 690 689 static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, 691 - unsigned long *pavail, 690 + unsigned long *pfilepages, 691 + unsigned long *pheadroom, 692 692 unsigned long *pdirty, 693 693 unsigned long *pwriteback) 694 694 {
+30 -6
mm/backing-dev.c
··· 480 480 release_work); 481 481 struct backing_dev_info *bdi = wb->bdi; 482 482 483 + spin_lock_irq(&cgwb_lock); 484 + list_del_rcu(&wb->bdi_node); 485 + spin_unlock_irq(&cgwb_lock); 486 + 483 487 wb_shutdown(wb); 484 488 485 489 css_put(wb->memcg_css); ··· 579 575 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); 580 576 if (!ret) { 581 577 atomic_inc(&bdi->usage_cnt); 578 + list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); 582 579 list_add(&wb->memcg_node, memcg_cgwb_list); 583 580 list_add(&wb->blkcg_node, blkcg_cgwb_list); 584 581 css_get(memcg_css); ··· 681 676 static void cgwb_bdi_destroy(struct backing_dev_info *bdi) 682 677 { 683 678 struct radix_tree_iter iter; 684 - struct bdi_writeback_congested *congested, *congested_n; 679 + struct rb_node *rbn; 685 680 void **slot; 686 681 687 682 WARN_ON(test_bit(WB_registered, &bdi->wb.state)); ··· 691 686 radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) 692 687 cgwb_kill(*slot); 693 688 694 - rbtree_postorder_for_each_entry_safe(congested, congested_n, 695 - &bdi->cgwb_congested_tree, rb_node) { 696 - rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree); 689 + while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { 690 + struct bdi_writeback_congested *congested = 691 + rb_entry(rbn, struct bdi_writeback_congested, rb_node); 692 + 693 + rb_erase(rbn, &bdi->cgwb_congested_tree); 697 694 congested->bdi = NULL; /* mark @congested unlinked */ 698 695 } 699 696 ··· 771 764 772 765 int bdi_init(struct backing_dev_info *bdi) 773 766 { 767 + int ret; 768 + 774 769 bdi->dev = NULL; 775 770 776 771 bdi->min_ratio = 0; 777 772 bdi->max_ratio = 100; 778 773 bdi->max_prop_frac = FPROP_FRAC_BASE; 779 774 INIT_LIST_HEAD(&bdi->bdi_list); 775 + INIT_LIST_HEAD(&bdi->wb_list); 780 776 init_waitqueue_head(&bdi->wb_waitq); 781 777 782 - return cgwb_bdi_init(bdi); 778 + ret = cgwb_bdi_init(bdi); 779 + 780 + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); 781 + 782 + return ret; 783 783 } 784 784 
EXPORT_SYMBOL(bdi_init); 785 785 ··· 837 823 synchronize_rcu_expedited(); 838 824 } 839 825 840 - void bdi_destroy(struct backing_dev_info *bdi) 826 + void bdi_unregister(struct backing_dev_info *bdi) 841 827 { 842 828 /* make sure nobody finds us on the bdi_list anymore */ 843 829 bdi_remove_from_list(bdi); ··· 849 835 device_unregister(bdi->dev); 850 836 bdi->dev = NULL; 851 837 } 838 + } 852 839 840 + void bdi_exit(struct backing_dev_info *bdi) 841 + { 842 + WARN_ON_ONCE(bdi->dev); 853 843 wb_exit(&bdi->wb); 844 + } 845 + 846 + void bdi_destroy(struct backing_dev_info *bdi) 847 + { 848 + bdi_unregister(bdi); 849 + bdi_exit(bdi); 854 850 } 855 851 EXPORT_SYMBOL(bdi_destroy); 856 852
+17 -18
mm/memcontrol.c
··· 3741 3741 /** 3742 3742 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 3743 3743 * @wb: bdi_writeback in question 3744 - * @pavail: out parameter for number of available pages 3744 + * @pfilepages: out parameter for number of file pages 3745 + * @pheadroom: out parameter for number of allocatable pages according to memcg 3745 3746 * @pdirty: out parameter for number of dirty pages 3746 3747 * @pwriteback: out parameter for number of pages under writeback 3747 3748 * 3748 - * Determine the numbers of available, dirty, and writeback pages in @wb's 3749 - * memcg. Dirty and writeback are self-explanatory. Available is a bit 3750 - * more involved. 3749 + * Determine the numbers of file, headroom, dirty, and writeback pages in 3750 + * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 3751 + * is a bit more involved. 3751 3752 * 3752 - * A memcg's headroom is "min(max, high) - used". The available memory is 3753 - * calculated as the lowest headroom of itself and the ancestors plus the 3754 - * number of pages already being used for file pages. Note that this 3755 - * doesn't consider the actual amount of available memory in the system. 3756 - * The caller should further cap *@pavail accordingly. 3753 + * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 3754 + * headroom is calculated as the lowest headroom of itself and the 3755 + * ancestors. Note that this doesn't consider the actual amount of 3756 + * available memory in the system. The caller should further cap 3757 + * *@pheadroom accordingly. 
3757 3758 */ 3758 - void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, 3759 - unsigned long *pdirty, unsigned long *pwriteback) 3759 + void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 3760 + unsigned long *pheadroom, unsigned long *pdirty, 3761 + unsigned long *pwriteback) 3760 3762 { 3761 3763 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3762 3764 struct mem_cgroup *parent; 3763 - unsigned long head_room = PAGE_COUNTER_MAX; 3764 - unsigned long file_pages; 3765 3765 3766 3766 *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); 3767 3767 3768 3768 /* this should eventually include NR_UNSTABLE_NFS */ 3769 3769 *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 3770 + *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3771 + (1 << LRU_ACTIVE_FILE)); 3772 + *pheadroom = PAGE_COUNTER_MAX; 3770 3773 3771 - file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3772 - (1 << LRU_ACTIVE_FILE)); 3773 3774 while ((parent = parent_mem_cgroup(memcg))) { 3774 3775 unsigned long ceiling = min(memcg->memory.limit, memcg->high); 3775 3776 unsigned long used = page_counter_read(&memcg->memory); 3776 3777 3777 - head_room = min(head_room, ceiling - min(ceiling, used)); 3778 + *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 3778 3779 memcg = parent; 3779 3780 } 3780 - 3781 - *pavail = file_pages + head_room; 3782 3781 } 3783 3782 3784 3783 #else /* CONFIG_CGROUP_WRITEBACK */
+32 -22
mm/page-writeback.c
··· 145 145 unsigned long pos_ratio; 146 146 }; 147 147 148 - #define DTC_INIT_COMMON(__wb) .wb = (__wb), \ 149 - .wb_completions = &(__wb)->completions 150 - 151 148 /* 152 149 * Length of period for aging writeout fractions of bdis. This is an 153 150 * arbitrarily chosen number. The longer the period, the slower fractions will ··· 154 157 155 158 #ifdef CONFIG_CGROUP_WRITEBACK 156 159 157 - #define GDTC_INIT(__wb) .dom = &global_wb_domain, \ 158 - DTC_INIT_COMMON(__wb) 160 + #define GDTC_INIT(__wb) .wb = (__wb), \ 161 + .dom = &global_wb_domain, \ 162 + .wb_completions = &(__wb)->completions 163 + 159 164 #define GDTC_INIT_NO_WB .dom = &global_wb_domain 160 - #define MDTC_INIT(__wb, __gdtc) .dom = mem_cgroup_wb_domain(__wb), \ 161 - .gdtc = __gdtc, \ 162 - DTC_INIT_COMMON(__wb) 165 + 166 + #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ 167 + .dom = mem_cgroup_wb_domain(__wb), \ 168 + .wb_completions = &(__wb)->memcg_completions, \ 169 + .gdtc = __gdtc 163 170 164 171 static bool mdtc_valid(struct dirty_throttle_control *dtc) 165 172 { ··· 214 213 215 214 #else /* CONFIG_CGROUP_WRITEBACK */ 216 215 217 - #define GDTC_INIT(__wb) DTC_INIT_COMMON(__wb) 216 + #define GDTC_INIT(__wb) .wb = (__wb), \ 217 + .wb_completions = &(__wb)->completions 218 218 #define GDTC_INIT_NO_WB 219 219 #define MDTC_INIT(__wb, __gdtc) 220 220 ··· 684 682 return max(thresh, dom->dirty_limit); 685 683 } 686 684 687 - /* memory available to a memcg domain is capped by system-wide clean memory */ 688 - static void mdtc_cap_avail(struct dirty_throttle_control *mdtc) 685 + /* 686 + * Memory which can be further allocated to a memcg domain is capped by 687 + * system-wide clean memory excluding the amount being used in the domain. 
688 + */ 689 + static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, 690 + unsigned long filepages, unsigned long headroom) 689 691 { 690 692 struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); 691 - unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); 693 + unsigned long clean = filepages - min(filepages, mdtc->dirty); 694 + unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); 695 + unsigned long other_clean = global_clean - min(global_clean, clean); 692 696 693 - mdtc->avail = min(mdtc->avail, clean); 697 + mdtc->avail = filepages + min(headroom, other_clean); 694 698 } 695 699 696 700 /** ··· 1570 1562 } 1571 1563 1572 1564 if (mdtc) { 1573 - unsigned long writeback; 1565 + unsigned long filepages, headroom, writeback; 1574 1566 1575 1567 /* 1576 1568 * If @wb belongs to !root memcg, repeat the same 1577 1569 * basic calculations for the memcg domain. 1578 1570 */ 1579 - mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, 1580 - &writeback); 1581 - mdtc_cap_avail(mdtc); 1571 + mem_cgroup_wb_stats(wb, &filepages, &headroom, 1572 + &mdtc->dirty, &writeback); 1582 1573 mdtc->dirty += writeback; 1574 + mdtc_calc_avail(mdtc, filepages, headroom); 1583 1575 1584 1576 domain_dirty_limits(mdtc); 1585 1577 ··· 1901 1893 return true; 1902 1894 1903 1895 if (mdtc) { 1904 - unsigned long writeback; 1896 + unsigned long filepages, headroom, writeback; 1905 1897 1906 - mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback); 1907 - mdtc_cap_avail(mdtc); 1898 + mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, 1899 + &writeback); 1900 + mdtc_calc_avail(mdtc, filepages, headroom); 1908 1901 domain_dirty_limits(mdtc); /* ditto, ignore writeback */ 1909 1902 1910 1903 if (mdtc->dirty > mdtc->bg_thresh) ··· 1965 1956 int nr_pages = global_page_state(NR_FILE_DIRTY) + 1966 1957 global_page_state(NR_UNSTABLE_NFS); 1967 1958 struct bdi_writeback *wb; 1968 - struct wb_iter iter; 1969 1959 1970 1960 /* 1971 1961 * We 
want to write everything out, not just down to the dirty ··· 1973 1965 if (!bdi_has_dirty_io(&q->backing_dev_info)) 1974 1966 return; 1975 1967 1976 - bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0) 1968 + rcu_read_lock(); 1969 + list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node) 1977 1970 if (wb_has_dirty_io(wb)) 1978 1971 wb_start_writeback(wb, nr_pages, true, 1979 1972 WB_REASON_LAPTOP_TIMER); 1973 + rcu_read_unlock(); 1980 1974 } 1981 1975 1982 1976 /*