Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+1 -1

fs/exec.c

··· 1828 1828 membarrier_execve(current); 1829 1829 rseq_execve(current); 1830 1830 acct_update_integrals(current); 1831 - task_numa_free(current); 1831 + task_numa_free(current, false); 1832 1832 free_bprm(bprm); 1833 1833 kfree(pathbuf); 1834 1834 if (filename)

+9 -1

include/linux/sched.h

··· 1092 1092 u64 last_sum_exec_runtime; 1093 1093 struct callback_head numa_work; 1094 1094 1095 - struct numa_group *numa_group; 1095 + /* 1096 + * This pointer is only modified for current in syscall and 1097 + * pagefault context (and for tasks being destroyed), so it can be read 1098 + * from any of the following contexts: 1099 + * - RCU read-side critical section 1100 + * - current->numa_group from everywhere 1101 + * - task's runqueue locked, task not running 1102 + */ 1103 + struct numa_group __rcu *numa_group; 1096 1104 1097 1105 /* 1098 1106 * numa_faults is an array split into four regions:

+2 -2

include/linux/sched/numa_balancing.h

··· 19 19 extern void task_numa_fault(int last_node, int node, int pages, int flags); 20 20 extern pid_t task_numa_group_id(struct task_struct *p); 21 21 extern void set_numabalancing_state(bool enabled); 22 - extern void task_numa_free(struct task_struct *p); 22 + extern void task_numa_free(struct task_struct *p, bool final); 23 23 extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, 24 24 int src_nid, int dst_cpu); 25 25 #else ··· 34 34 static inline void set_numabalancing_state(bool enabled) 35 35 { 36 36 } 37 - static inline void task_numa_free(struct task_struct *p) 37 + static inline void task_numa_free(struct task_struct *p, bool final) 38 38 { 39 39 } 40 40 static inline bool should_numa_migrate_memory(struct task_struct *p,

+1 -1

kernel/fork.c

··· 726 726 WARN_ON(tsk == current); 727 727 728 728 cgroup_free(tsk); 729 - task_numa_free(tsk); 729 + task_numa_free(tsk, true); 730 730 security_task_free(tsk); 731 731 exit_creds(tsk); 732 732 delayacct_tsk_free(tsk);

+102 -44

kernel/sched/fair.c

··· 1086 1086 unsigned long faults[0]; 1087 1087 }; 1088 1088 1089 + /* 1090 + * For functions that can be called in multiple contexts that permit reading 1091 + * ->numa_group (see struct task_struct for locking rules). 1092 + */ 1093 + static struct numa_group *deref_task_numa_group(struct task_struct *p) 1094 + { 1095 + return rcu_dereference_check(p->numa_group, p == current || 1096 + (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); 1097 + } 1098 + 1099 + static struct numa_group *deref_curr_numa_group(struct task_struct *p) 1100 + { 1101 + return rcu_dereference_protected(p->numa_group, p == current); 1102 + } 1103 + 1089 1104 static inline unsigned long group_faults_priv(struct numa_group *ng); 1090 1105 static inline unsigned long group_faults_shared(struct numa_group *ng); 1091 1106 ··· 1144 1129 { 1145 1130 unsigned long smin = task_scan_min(p); 1146 1131 unsigned long period = smin; 1132 + struct numa_group *ng; 1147 1133 1148 1134 /* Scale the maximum scan period with the amount of shared memory. */ 1149 - if (p->numa_group) { 1150 - struct numa_group *ng = p->numa_group; 1135 + rcu_read_lock(); 1136 + ng = rcu_dereference(p->numa_group); 1137 + if (ng) { 1151 1138 unsigned long shared = group_faults_shared(ng); 1152 1139 unsigned long private = group_faults_priv(ng); 1153 1140 ··· 1157 1140 period *= shared + 1; 1158 1141 period /= private + shared + 1; 1159 1142 } 1143 + rcu_read_unlock(); 1160 1144 1161 1145 return max(smin, period); 1162 1146 } ··· 1166 1148 { 1167 1149 unsigned long smin = task_scan_min(p); 1168 1150 unsigned long smax; 1151 + struct numa_group *ng; 1169 1152 1170 1153 /* Watch for min being lower than max due to floor calculations */ 1171 1154 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); 1172 1155 1173 1156 /* Scale the maximum scan period with the amount of shared memory. 
*/ 1174 - if (p->numa_group) { 1175 - struct numa_group *ng = p->numa_group; 1157 + ng = deref_curr_numa_group(p); 1158 + if (ng) { 1176 1159 unsigned long shared = group_faults_shared(ng); 1177 1160 unsigned long private = group_faults_priv(ng); 1178 1161 unsigned long period = smax; ··· 1205 1186 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1206 1187 p->numa_work.next = &p->numa_work; 1207 1188 p->numa_faults = NULL; 1208 - p->numa_group = NULL; 1189 + RCU_INIT_POINTER(p->numa_group, NULL); 1209 1190 p->last_task_numa_placement = 0; 1210 1191 p->last_sum_exec_runtime = 0; 1211 1192 ··· 1252 1233 1253 1234 pid_t task_numa_group_id(struct task_struct *p) 1254 1235 { 1255 - return p->numa_group ? p->numa_group->gid : 0; 1236 + struct numa_group *ng; 1237 + pid_t gid = 0; 1238 + 1239 + rcu_read_lock(); 1240 + ng = rcu_dereference(p->numa_group); 1241 + if (ng) 1242 + gid = ng->gid; 1243 + rcu_read_unlock(); 1244 + 1245 + return gid; 1256 1246 } 1257 1247 1258 1248 /* ··· 1286 1258 1287 1259 static inline unsigned long group_faults(struct task_struct *p, int nid) 1288 1260 { 1289 - if (!p->numa_group) 1261 + struct numa_group *ng = deref_task_numa_group(p); 1262 + 1263 + if (!ng) 1290 1264 return 0; 1291 1265 1292 - return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + 1293 - p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; 1266 + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + 1267 + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; 1294 1268 } 1295 1269 1296 1270 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) ··· 1430 1400 static inline unsigned long group_weight(struct task_struct *p, int nid, 1431 1401 int dist) 1432 1402 { 1403 + struct numa_group *ng = deref_task_numa_group(p); 1433 1404 unsigned long faults, total_faults; 1434 1405 1435 - if (!p->numa_group) 1406 + if (!ng) 1436 1407 return 0; 1437 1408 1438 - total_faults = p->numa_group->total_faults; 1409 + total_faults = ng->total_faults; 
1439 1410 1440 1411 if (!total_faults) 1441 1412 return 0; ··· 1450 1419 bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1451 1420 int src_nid, int dst_cpu) 1452 1421 { 1453 - struct numa_group *ng = p->numa_group; 1422 + struct numa_group *ng = deref_curr_numa_group(p); 1454 1423 int dst_nid = cpu_to_node(dst_cpu); 1455 1424 int last_cpupid, this_cpupid; 1456 1425 ··· 1631 1600 static void task_numa_compare(struct task_numa_env *env, 1632 1601 long taskimp, long groupimp, bool maymove) 1633 1602 { 1603 + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); 1634 1604 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1605 + long imp = p_ng ? groupimp : taskimp; 1635 1606 struct task_struct *cur; 1636 1607 long src_load, dst_load; 1637 - long load; 1638 - long imp = env->p->numa_group ? groupimp : taskimp; 1639 - long moveimp = imp; 1640 1608 int dist = env->dist; 1609 + long moveimp = imp; 1610 + long load; 1641 1611 1642 1612 if (READ_ONCE(dst_rq->numa_migrate_on)) 1643 1613 return; ··· 1677 1645 * If dst and source tasks are in the same NUMA group, or not 1678 1646 * in any group then look only at task weights. 1679 1647 */ 1680 - if (cur->numa_group == env->p->numa_group) { 1648 + cur_ng = rcu_dereference(cur->numa_group); 1649 + if (cur_ng == p_ng) { 1681 1650 imp = taskimp + task_weight(cur, env->src_nid, dist) - 1682 1651 task_weight(cur, env->dst_nid, dist); 1683 1652 /* 1684 1653 * Add some hysteresis to prevent swapping the 1685 1654 * tasks within a group over tiny differences. 1686 1655 */ 1687 - if (cur->numa_group) 1656 + if (cur_ng) 1688 1657 imp -= imp / 16; 1689 1658 } else { 1690 1659 /* 1691 1660 * Compare the group weights. If a task is all by itself 1692 1661 * (not part of a group), use the task weight instead. 
1693 1662 */ 1694 - if (cur->numa_group && env->p->numa_group) 1663 + if (cur_ng && p_ng) 1695 1664 imp += group_weight(cur, env->src_nid, dist) - 1696 1665 group_weight(cur, env->dst_nid, dist); 1697 1666 else ··· 1790 1757 .best_imp = 0, 1791 1758 .best_cpu = -1, 1792 1759 }; 1793 - struct sched_domain *sd; 1794 - struct rq *best_rq; 1795 1760 unsigned long taskweight, groupweight; 1796 - int nid, ret, dist; 1761 + struct sched_domain *sd; 1797 1762 long taskimp, groupimp; 1763 + struct numa_group *ng; 1764 + struct rq *best_rq; 1765 + int nid, ret, dist; 1798 1766 1799 1767 /* 1800 1768 * Pick the lowest SD_NUMA domain, as that would have the smallest ··· 1841 1807 * multiple NUMA nodes; in order to better consolidate the group, 1842 1808 * we need to check other locations. 1843 1809 */ 1844 - if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { 1810 + ng = deref_curr_numa_group(p); 1811 + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { 1845 1812 for_each_online_node(nid) { 1846 1813 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1847 1814 continue; ··· 1875 1840 * A task that migrated to a second choice node will be better off 1876 1841 * trying for a better one later. Do not set the preferred node here. 
1877 1842 */ 1878 - if (p->numa_group) { 1843 + if (ng) { 1879 1844 if (env.best_cpu == -1) 1880 1845 nid = env.src_nid; 1881 1846 else ··· 2170 2135 unsigned long total_faults; 2171 2136 u64 runtime, period; 2172 2137 spinlock_t *group_lock = NULL; 2138 + struct numa_group *ng; 2173 2139 2174 2140 /* 2175 2141 * The p->mm->numa_scan_seq field gets updated without ··· 2188 2152 runtime = numa_get_avg_runtime(p, &period); 2189 2153 2190 2154 /* If the task is part of a group prevent parallel updates to group stats */ 2191 - if (p->numa_group) { 2192 - group_lock = &p->numa_group->lock; 2155 + ng = deref_curr_numa_group(p); 2156 + if (ng) { 2157 + group_lock = &ng->lock; 2193 2158 spin_lock_irq(group_lock); 2194 2159 } 2195 2160 ··· 2231 2194 p->numa_faults[cpu_idx] += f_diff; 2232 2195 faults += p->numa_faults[mem_idx]; 2233 2196 p->total_numa_faults += diff; 2234 - if (p->numa_group) { 2197 + if (ng) { 2235 2198 /* 2236 2199 * safe because we can only change our own group 2237 2200 * ··· 2239 2202 * nid and priv in a specific region because it 2240 2203 * is at the beginning of the numa_faults array. 
2241 2204 */ 2242 - p->numa_group->faults[mem_idx] += diff; 2243 - p->numa_group->faults_cpu[mem_idx] += f_diff; 2244 - p->numa_group->total_faults += diff; 2245 - group_faults += p->numa_group->faults[mem_idx]; 2205 + ng->faults[mem_idx] += diff; 2206 + ng->faults_cpu[mem_idx] += f_diff; 2207 + ng->total_faults += diff; 2208 + group_faults += ng->faults[mem_idx]; 2246 2209 } 2247 2210 } 2248 2211 2249 - if (!p->numa_group) { 2212 + if (!ng) { 2250 2213 if (faults > max_faults) { 2251 2214 max_faults = faults; 2252 2215 max_nid = nid; ··· 2257 2220 } 2258 2221 } 2259 2222 2260 - if (p->numa_group) { 2261 - numa_group_count_active_nodes(p->numa_group); 2223 + if (ng) { 2224 + numa_group_count_active_nodes(ng); 2262 2225 spin_unlock_irq(group_lock); 2263 2226 max_nid = preferred_group_nid(p, max_nid); 2264 2227 } ··· 2292 2255 int cpu = cpupid_to_cpu(cpupid); 2293 2256 int i; 2294 2257 2295 - if (unlikely(!p->numa_group)) { 2258 + if (unlikely(!deref_curr_numa_group(p))) { 2296 2259 unsigned int size = sizeof(struct numa_group) + 2297 2260 4*nr_node_ids*sizeof(unsigned long); 2298 2261 ··· 2328 2291 if (!grp) 2329 2292 goto no_join; 2330 2293 2331 - my_grp = p->numa_group; 2294 + my_grp = deref_curr_numa_group(p); 2332 2295 if (grp == my_grp) 2333 2296 goto no_join; 2334 2297 ··· 2390 2353 return; 2391 2354 } 2392 2355 2393 - void task_numa_free(struct task_struct *p) 2356 + /* 2357 + * Get rid of NUMA statistics associated with a task (either current or dead). 2358 + * If @final is set, the task is dead and has reached refcount zero, so we can 2359 + * safely free all relevant data structures. Otherwise, there might be 2360 + * concurrent reads from places like load balancing and procfs, and we should 2361 + * reset the data back to default state without freeing ->numa_faults. 
2362 + */ 2363 + void task_numa_free(struct task_struct *p, bool final) 2394 2364 { 2395 - struct numa_group *grp = p->numa_group; 2396 - void *numa_faults = p->numa_faults; 2365 + /* safe: p either is current or is being freed by current */ 2366 + struct numa_group *grp = rcu_dereference_raw(p->numa_group); 2367 + unsigned long *numa_faults = p->numa_faults; 2397 2368 unsigned long flags; 2398 2369 int i; 2370 + 2371 + if (!numa_faults) 2372 + return; 2399 2373 2400 2374 if (grp) { 2401 2375 spin_lock_irqsave(&grp->lock, flags); ··· 2420 2372 put_numa_group(grp); 2421 2373 } 2422 2374 2423 - p->numa_faults = NULL; 2424 - kfree(numa_faults); 2375 + if (final) { 2376 + p->numa_faults = NULL; 2377 + kfree(numa_faults); 2378 + } else { 2379 + p->total_numa_faults = 0; 2380 + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2381 + numa_faults[i] = 0; 2382 + } 2425 2383 } 2426 2384 2427 2385 /* ··· 2480 2426 * actively using should be counted as local. This allows the 2481 2427 * scan rate to slow down when a workload has settled down. 
2482 2428 */ 2483 - ng = p->numa_group; 2429 + ng = deref_curr_numa_group(p); 2484 2430 if (!priv && !local && ng && ng->active_nodes > 1 && 2485 2431 numa_is_active_node(cpu_node, ng) && 2486 2432 numa_is_active_node(mem_node, ng)) ··· 10498 10444 { 10499 10445 int node; 10500 10446 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; 10447 + struct numa_group *ng; 10501 10448 10449 + rcu_read_lock(); 10450 + ng = rcu_dereference(p->numa_group); 10502 10451 for_each_online_node(node) { 10503 10452 if (p->numa_faults) { 10504 10453 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; 10505 10454 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; 10506 10455 } 10507 - if (p->numa_group) { 10508 - gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], 10509 - gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; 10456 + if (ng) { 10457 + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], 10458 + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 10510 10459 } 10511 10460 print_numa_stats(m, node, tsf, tpf, gsf, gpf); 10512 10461 } 10462 + rcu_read_unlock(); 10513 10463 } 10514 10464 #endif /* CONFIG_NUMA_BALANCING */ 10515 10465 #endif /* CONFIG_SCHED_DEBUG */

Configure Feed

Configure Feed