workqueue: Introduce struct wq_node_nr_active

Currently, for both percpu and unbound workqueues, max_active applies
per-cpu, which is a recent change for unbound workqueues. The change for
unbound workqueues was a significant departure from the previous behavior of
per-node application. It made some use cases create an undesirable number of
concurrent work items and left no good way of fixing them. To address the
problem, workqueue is implementing a NUMA node segmented global nr_active
mechanism, which will be explained further in the next patch.

As a preparation, this patch introduces struct wq_node_nr_active. It's a
data structure allocated for each workqueue and NUMA node pair and
currently only tracks the workqueue's number of active work items on the
node. This is split out from the next patch to make it easier to understand
and review.

Note that there is an extra wq_node_nr_active allocated for the invalid node
nr_node_ids, which is used to track nr_active for pools which don't have a NUMA
node associated, such as the default fallback system-wide pool.

This doesn't cause any behavior changes visible to userland yet. The next
patch will expand to implement the control mechanism on top.

v4: - Fixed out-of-bounds access when freeing per-cpu workqueues.

v3: - Use flexible array for wq->node_nr_active as suggested by Lai.

v2: - wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai.

- Lai pointed out that pwq_tryinc_nr_active() incorrectly dropped
pwq->max_active check. Restored. As the next patch replaces the
max_active enforcement mechanism, this doesn't change the end result.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>

Tejun Heo 2 years ago 91ccc6e7 dd6c3c54

+135 -7

1 changed file

expand all

kernel

workqueue.c

+135 -7

kernel/workqueue.c

··· 285 285 struct wq_device; 286 286 287 287 /* 288 + * Unlike in a per-cpu workqueue where max_active limits its concurrency level 289 + * on each CPU, in an unbound workqueue, max_active applies to the whole system. 290 + * As sharing a single nr_active across multiple sockets can be very expensive, 291 + * the counting and enforcement is per NUMA node. 292 + */ 293 + struct wq_node_nr_active { 294 + atomic_t nr; /* per-node nr_active count */ 295 + }; 296 + 297 + /* 288 298 * The externally visible workqueue. It relays the issued work items to 289 299 * the appropriate worker_pool through its pool_workqueues. 290 300 */ ··· 340 330 /* hot fields used during command issue, aligned to cacheline */ 341 331 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ 342 332 struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ 333 + struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ 343 334 }; 344 335 345 336 static struct kmem_cache *pwq_cache; ··· 1437 1426 } 1438 1427 1439 1428 /** 1429 + * wq_node_nr_active - Determine wq_node_nr_active to use 1430 + * @wq: workqueue of interest 1431 + * @node: NUMA node, can be %NUMA_NO_NODE 1432 + * 1433 + * Determine wq_node_nr_active to use for @wq on @node. Returns: 1434 + * 1435 + * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. 1436 + * 1437 + * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. 1438 + * 1439 + * - Otherwise, node_nr_active[@node]. 
1440 + */ 1441 + static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, 1442 + int node) 1443 + { 1444 + if (!(wq->flags & WQ_UNBOUND)) 1445 + return NULL; 1446 + 1447 + if (node == NUMA_NO_NODE) 1448 + node = nr_node_ids; 1449 + 1450 + return wq->node_nr_active[node]; 1451 + } 1452 + 1453 + /** 1440 1454 * get_pwq - get an extra reference on the specified pool_workqueue 1441 1455 * @pwq: pool_workqueue to get 1442 1456 * ··· 1542 1506 struct work_struct *work) 1543 1507 { 1544 1508 struct worker_pool *pool = pwq->pool; 1509 + struct wq_node_nr_active *nna; 1545 1510 1546 1511 lockdep_assert_held(&pool->lock); 1547 1512 1548 1513 if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) 1549 1514 return false; 1515 + 1516 + nna = wq_node_nr_active(pwq->wq, pool->node); 1517 + if (nna) 1518 + atomic_inc(&nna->nr); 1550 1519 1551 1520 pwq->nr_active++; 1552 1521 __pwq_activate_work(pwq, work); ··· 1569 1528 { 1570 1529 struct workqueue_struct *wq = pwq->wq; 1571 1530 struct worker_pool *pool = pwq->pool; 1531 + struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); 1572 1532 bool obtained; 1573 1533 1574 1534 lockdep_assert_held(&pool->lock); 1575 1535 1576 1536 obtained = pwq->nr_active < READ_ONCE(wq->max_active); 1577 1537 1578 - if (obtained) 1538 + if (obtained) { 1579 1539 pwq->nr_active++; 1540 + if (nna) 1541 + atomic_inc(&nna->nr); 1542 + } 1580 1543 return obtained; 1581 1544 } 1582 1545 ··· 1617 1572 static void pwq_dec_nr_active(struct pool_workqueue *pwq) 1618 1573 { 1619 1574 struct worker_pool *pool = pwq->pool; 1575 + struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); 1620 1576 1621 1577 lockdep_assert_held(&pool->lock); 1622 1578 1579 + /* 1580 + * @pwq->nr_active should be decremented for both percpu and unbound 1581 + * workqueues. 1582 + */ 1623 1583 pwq->nr_active--; 1584 + 1585 + /* 1586 + * For a percpu workqueue, it's simple. 
Just need to kick the first 1587 + * inactive work item on @pwq itself. 1588 + */ 1589 + if (!nna) { 1590 + pwq_activate_first_inactive(pwq); 1591 + return; 1592 + } 1593 + 1594 + atomic_dec(&nna->nr); 1624 1595 pwq_activate_first_inactive(pwq); 1625 1596 } 1626 1597 ··· 4100 4039 } 4101 4040 #endif 4102 4041 4042 + static void free_node_nr_active(struct wq_node_nr_active **nna_ar) 4043 + { 4044 + int node; 4045 + 4046 + for_each_node(node) { 4047 + kfree(nna_ar[node]); 4048 + nna_ar[node] = NULL; 4049 + } 4050 + 4051 + kfree(nna_ar[nr_node_ids]); 4052 + nna_ar[nr_node_ids] = NULL; 4053 + } 4054 + 4055 + static void init_node_nr_active(struct wq_node_nr_active *nna) 4056 + { 4057 + atomic_set(&nna->nr, 0); 4058 + } 4059 + 4060 + /* 4061 + * Each node's nr_active counter will be accessed mostly from its own node and 4062 + * should be allocated in the node. 4063 + */ 4064 + static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) 4065 + { 4066 + struct wq_node_nr_active *nna; 4067 + int node; 4068 + 4069 + for_each_node(node) { 4070 + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); 4071 + if (!nna) 4072 + goto err_free; 4073 + init_node_nr_active(nna); 4074 + nna_ar[node] = nna; 4075 + } 4076 + 4077 + /* [nr_node_ids] is used as the fallback */ 4078 + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); 4079 + if (!nna) 4080 + goto err_free; 4081 + init_node_nr_active(nna); 4082 + nna_ar[nr_node_ids] = nna; 4083 + 4084 + return 0; 4085 + 4086 + err_free: 4087 + free_node_nr_active(nna_ar); 4088 + return -ENOMEM; 4089 + } 4090 + 4103 4091 static void rcu_free_wq(struct rcu_head *rcu) 4104 4092 { 4105 4093 struct workqueue_struct *wq = 4106 4094 container_of(rcu, struct workqueue_struct, rcu); 4095 + 4096 + if (wq->flags & WQ_UNBOUND) 4097 + free_node_nr_active(wq->node_nr_active); 4107 4098 4108 4099 wq_free_lockdep(wq); 4109 4100 free_percpu(wq->cpu_pwq); ··· 4898 4785 { 4899 4786 va_list args; 4900 4787 struct workqueue_struct *wq; 4901 - int 
len; 4788 + size_t wq_size; 4789 + int name_len; 4902 4790 4903 4791 /* 4904 4792 * Unbound && max_active == 1 used to imply ordered, which is no longer ··· 4915 4801 flags |= WQ_UNBOUND; 4916 4802 4917 4803 /* allocate wq and format name */ 4918 - wq = kzalloc(sizeof(*wq), GFP_KERNEL); 4804 + if (flags & WQ_UNBOUND) 4805 + wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); 4806 + else 4807 + wq_size = sizeof(*wq); 4808 + 4809 + wq = kzalloc(wq_size, GFP_KERNEL); 4919 4810 if (!wq) 4920 4811 return NULL; 4921 4812 ··· 4931 4812 } 4932 4813 4933 4814 va_start(args, max_active); 4934 - len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); 4815 + name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); 4935 4816 va_end(args); 4936 4817 4937 - if (len >= WQ_NAME_LEN) 4938 - pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); 4818 + if (name_len >= WQ_NAME_LEN) 4819 + pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", 4820 + wq->name); 4939 4821 4940 4822 max_active = max_active ?: WQ_DFL_ACTIVE; 4941 4823 max_active = wq_clamp_max_active(max_active, flags, wq->name); ··· 4955 4835 wq_init_lockdep(wq); 4956 4836 INIT_LIST_HEAD(&wq->list); 4957 4837 4838 + if (flags & WQ_UNBOUND) { 4839 + if (alloc_node_nr_active(wq->node_nr_active) < 0) 4840 + goto err_unreg_lockdep; 4841 + } 4842 + 4958 4843 if (alloc_and_link_pwqs(wq) < 0) 4959 - goto err_unreg_lockdep; 4844 + goto err_free_node_nr_active; 4960 4845 4961 4846 if (wq_online && init_rescuer(wq) < 0) 4962 4847 goto err_destroy; ··· 4986 4861 4987 4862 return wq; 4988 4863 4864 + err_free_node_nr_active: 4865 + if (wq->flags & WQ_UNBOUND) 4866 + free_node_nr_active(wq->node_nr_active); 4989 4867 err_unreg_lockdep: 4990 4868 wq_unregister_lockdep(wq); 4991 4869 wq_free_lockdep(wq);

Configure Feed

Configure Feed