Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: memcontrol: revert use of root_mem_cgroup res_counter

Dave Hansen reports a massive scalability regression in an uncontained
page fault benchmark with more than 30 concurrent threads, which he
bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
res_counter") and pinpointed to res_counter spinlock contention.

That change relied on the per-cpu charge caches to mostly swallow the
res_counter costs, but it's apparent that the caches don't scale yet.

Revert memcg back to bypassing res_counters on the root level in order
to restore performance for uncontained workloads.

Reported-by: Dave Hansen <dave@sr71.net>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Johannes Weiner and committed by Linus Torvalds
ce00a967 10096fb1

+78 -25
+78 -25
mm/memcontrol.c
··· 2534 2534 unsigned long long size; 2535 2535 int ret = 0; 2536 2536 2537 + if (mem_cgroup_is_root(memcg)) 2538 + goto done; 2537 2539 retry: 2538 2540 if (consume_stock(memcg, nr_pages)) 2539 2541 goto done; ··· 2613 2611 if (!(gfp_mask & __GFP_NOFAIL)) 2614 2612 return -ENOMEM; 2615 2613 bypass: 2616 - memcg = root_mem_cgroup; 2617 - ret = -EINTR; 2618 - goto retry; 2614 + return -EINTR; 2619 2615 2620 2616 done_restock: 2621 2617 if (batch > nr_pages) ··· 2625 2625 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2626 2626 { 2627 2627 unsigned long bytes = nr_pages * PAGE_SIZE; 2628 + 2629 + if (mem_cgroup_is_root(memcg)) 2630 + return; 2628 2631 2629 2632 res_counter_uncharge(&memcg->res, bytes); 2630 2633 if (do_swap_account) ··· 2642 2639 unsigned int nr_pages) 2643 2640 { 2644 2641 unsigned long bytes = nr_pages * PAGE_SIZE; 2642 + 2643 + if (mem_cgroup_is_root(memcg)) 2644 + return; 2645 2645 2646 2646 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2647 2647 if (do_swap_account) ··· 4099 4093 return retval; 4100 4094 } 4101 4095 4096 + static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 4097 + enum mem_cgroup_stat_index idx) 4098 + { 4099 + struct mem_cgroup *iter; 4100 + long val = 0; 4101 + 4102 + /* Per-cpu values can be negative, use a signed accumulator */ 4103 + for_each_mem_cgroup_tree(iter, memcg) 4104 + val += mem_cgroup_read_stat(iter, idx); 4105 + 4106 + if (val < 0) /* race ? */ 4107 + val = 0; 4108 + return val; 4109 + } 4110 + 4111 + static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 4112 + { 4113 + u64 val; 4114 + 4115 + if (!mem_cgroup_is_root(memcg)) { 4116 + if (!swap) 4117 + return res_counter_read_u64(&memcg->res, RES_USAGE); 4118 + else 4119 + return res_counter_read_u64(&memcg->memsw, RES_USAGE); 4120 + } 4121 + 4122 + /* 4123 + * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 4124 + * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
4125 + */ 4126 + val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 4127 + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 4128 + 4129 + if (swap) 4130 + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 4131 + 4132 + return val << PAGE_SHIFT; 4133 + } 4134 + 4135 + 4102 4136 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4103 4137 struct cftype *cft) 4104 4138 { ··· 4148 4102 4149 4103 switch (type) { 4150 4104 case _MEM: 4105 + if (name == RES_USAGE) 4106 + return mem_cgroup_usage(memcg, false); 4151 4107 return res_counter_read_u64(&memcg->res, name); 4152 4108 case _MEMSWAP: 4109 + if (name == RES_USAGE) 4110 + return mem_cgroup_usage(memcg, true); 4153 4111 return res_counter_read_u64(&memcg->memsw, name); 4154 4112 case _KMEM: 4155 4113 return res_counter_read_u64(&memcg->kmem, name); ··· 4622 4572 if (!t) 4623 4573 goto unlock; 4624 4574 4625 - if (!swap) 4626 - usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4627 - else 4628 - usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4575 + usage = mem_cgroup_usage(memcg, swap); 4629 4576 4630 4577 /* 4631 4578 * current_threshold points to threshold just below or equal to usage. 
··· 4720 4673 4721 4674 if (type == _MEM) { 4722 4675 thresholds = &memcg->thresholds; 4723 - usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4676 + usage = mem_cgroup_usage(memcg, false); 4724 4677 } else if (type == _MEMSWAP) { 4725 4678 thresholds = &memcg->memsw_thresholds; 4726 - usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4679 + usage = mem_cgroup_usage(memcg, true); 4727 4680 } else 4728 4681 BUG(); 4729 4682 ··· 4809 4762 4810 4763 if (type == _MEM) { 4811 4764 thresholds = &memcg->thresholds; 4812 - usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4765 + usage = mem_cgroup_usage(memcg, false); 4813 4766 } else if (type == _MEMSWAP) { 4814 4767 thresholds = &memcg->memsw_thresholds; 4815 - usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4768 + usage = mem_cgroup_usage(memcg, true); 4816 4769 } else 4817 4770 BUG(); 4818 4771 ··· 5572 5525 * core guarantees its existence. 5573 5526 */ 5574 5527 } else { 5575 - res_counter_init(&memcg->res, &root_mem_cgroup->res); 5576 - res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); 5577 - res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); 5528 + res_counter_init(&memcg->res, NULL); 5529 + res_counter_init(&memcg->memsw, NULL); 5530 + res_counter_init(&memcg->kmem, NULL); 5578 5531 /* 5579 5532 * Deeper hierachy with use_hierarchy == false doesn't make 5580 5533 * much sense so let cgroup subsystem know about this ··· 6016 5969 /* we must fixup refcnts and charges */ 6017 5970 if (mc.moved_swap) { 6018 5971 /* uncharge swap account from the old cgroup */ 6019 - res_counter_uncharge(&mc.from->memsw, 6020 - PAGE_SIZE * mc.moved_swap); 5972 + if (!mem_cgroup_is_root(mc.from)) 5973 + res_counter_uncharge(&mc.from->memsw, 5974 + PAGE_SIZE * mc.moved_swap); 6021 5975 6022 5976 for (i = 0; i < mc.moved_swap; i++) 6023 5977 css_put(&mc.from->css); ··· 6027 5979 * we charged both to->res and to->memsw, so we should 6028 5980 * uncharge to->res. 
6029 5981 */ 6030 - res_counter_uncharge(&mc.to->res, 6031 - PAGE_SIZE * mc.moved_swap); 5982 + if (!mem_cgroup_is_root(mc.to)) 5983 + res_counter_uncharge(&mc.to->res, 5984 + PAGE_SIZE * mc.moved_swap); 6032 5985 /* we've already done css_get(mc.to) */ 6033 5986 mc.moved_swap = 0; 6034 5987 } ··· 6394 6345 rcu_read_lock(); 6395 6346 memcg = mem_cgroup_lookup(id); 6396 6347 if (memcg) { 6397 - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6348 + if (!mem_cgroup_is_root(memcg)) 6349 + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6398 6350 mem_cgroup_swap_statistics(memcg, false); 6399 6351 css_put(&memcg->css); 6400 6352 } ··· 6559 6509 { 6560 6510 unsigned long flags; 6561 6511 6562 - if (nr_mem) 6563 - res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); 6564 - if (nr_memsw) 6565 - res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); 6566 - 6567 - memcg_oom_recover(memcg); 6512 + if (!mem_cgroup_is_root(memcg)) { 6513 + if (nr_mem) 6514 + res_counter_uncharge(&memcg->res, 6515 + nr_mem * PAGE_SIZE); 6516 + if (nr_memsw) 6517 + res_counter_uncharge(&memcg->memsw, 6518 + nr_memsw * PAGE_SIZE); 6519 + memcg_oom_recover(memcg); 6520 + } 6568 6521 6569 6522 local_irq_save(flags); 6570 6523 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);