Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Ingo writes:
"scheduler fixes:

These fixes address a rather involved performance regression between
v4.17->v4.19 in the sched/numa auto-balancing code. Since distros
really need this fix we accelerated it to sched/urgent for a faster
upstream merge.

NUMA scheduling and balancing performance is now largely back to
v4.17 levels, without reintroducing the NUMA placement bugs that
v4.18 and v4.19 fixed.

Many thanks to Srikar Dronamraju, Mel Gorman and Jirka Hladky, for
reporting, testing, re-testing and solving this rather complex set of
bugs."

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task
  mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration
  sched/numa: Avoid task migration for small NUMA improvement
  mm/migrate: Use spin_trylock() while resetting rate limit
  sched/numa: Limit the conditions where scan period is reset
  sched/numa: Reset scan rate whenever task moves across nodes
  sched/numa: Pass destination CPU as a parameter to migrate_task_rq
  sched/numa: Stop multiple tasks from moving to the CPU at the same time

+95 -108
-6
include/linux/mmzone.h
··· 671 671 #ifdef CONFIG_NUMA_BALANCING 672 672 /* Lock serializing the migrate rate limiting window */ 673 673 spinlock_t numabalancing_migrate_lock; 674 - 675 - /* Rate limiting time interval */ 676 - unsigned long numabalancing_migrate_next_window; 677 - 678 - /* Number of pages migrated during the rate limiting time interval */ 679 - unsigned long numabalancing_migrate_nr_pages; 680 674 #endif 681 675 /* 682 676 * This is a per-node reserve of pages that are not available
-27
include/trace/events/migrate.h
··· 70 70 __print_symbolic(__entry->mode, MIGRATE_MODE), 71 71 __print_symbolic(__entry->reason, MIGRATE_REASON)) 72 72 ); 73 - 74 - TRACE_EVENT(mm_numa_migrate_ratelimit, 75 - 76 - TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), 77 - 78 - TP_ARGS(p, dst_nid, nr_pages), 79 - 80 - TP_STRUCT__entry( 81 - __array( char, comm, TASK_COMM_LEN) 82 - __field( pid_t, pid) 83 - __field( int, dst_nid) 84 - __field( unsigned long, nr_pages) 85 - ), 86 - 87 - TP_fast_assign( 88 - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); 89 - __entry->pid = p->pid; 90 - __entry->dst_nid = dst_nid; 91 - __entry->nr_pages = nr_pages; 92 - ), 93 - 94 - TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", 95 - __entry->comm, 96 - __entry->pid, 97 - __entry->dst_nid, 98 - __entry->nr_pages) 99 - ); 100 73 #endif /* _TRACE_MIGRATE_H */ 101 74 102 75 /* This part must be outside protection */
+1 -1
kernel/sched/core.c
··· 1167 1167 1168 1168 if (task_cpu(p) != new_cpu) { 1169 1169 if (p->sched_class->migrate_task_rq) 1170 - p->sched_class->migrate_task_rq(p); 1170 + p->sched_class->migrate_task_rq(p, new_cpu); 1171 1171 p->se.nr_migrations++; 1172 1172 rseq_migrate(p); 1173 1173 perf_event_task_migrate(p);
+1 -1
kernel/sched/deadline.c
··· 1607 1607 return cpu; 1608 1608 } 1609 1609 1610 - static void migrate_task_rq_dl(struct task_struct *p) 1610 + static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) 1611 1611 { 1612 1612 struct rq *rq; 1613 1613
+91 -13
kernel/sched/fair.c
··· 1392 1392 int last_cpupid, this_cpupid; 1393 1393 1394 1394 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); 1395 + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); 1396 + 1397 + /* 1398 + * Allow first faults or private faults to migrate immediately early in 1399 + * the lifetime of a task. The magic number 4 is based on waiting for 1400 + * two full passes of the "multi-stage node selection" test that is 1401 + * executed below. 1402 + */ 1403 + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && 1404 + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 1405 + return true; 1395 1406 1396 1407 /* 1397 1408 * Multi-stage node selection is used in conjunction with a periodic ··· 1421 1410 * This quadric squishes small probabilities, making it less likely we 1422 1411 * act on an unlikely task<->page relation. 1423 1412 */ 1424 - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); 1425 1413 if (!cpupid_pid_unset(last_cpupid) && 1426 1414 cpupid_to_nid(last_cpupid) != dst_nid) 1427 1415 return false; ··· 1524 1514 static void task_numa_assign(struct task_numa_env *env, 1525 1515 struct task_struct *p, long imp) 1526 1516 { 1517 + struct rq *rq = cpu_rq(env->dst_cpu); 1518 + 1519 + /* Bail out if run-queue part of active NUMA balance. */ 1520 + if (xchg(&rq->numa_migrate_on, 1)) 1521 + return; 1522 + 1523 + /* 1524 + * Clear previous best_cpu/rq numa-migrate flag, since task now 1525 + * found a better CPU to move/swap. 1526 + */ 1527 + if (env->best_cpu != -1) { 1528 + rq = cpu_rq(env->best_cpu); 1529 + WRITE_ONCE(rq->numa_migrate_on, 0); 1530 + } 1531 + 1527 1532 if (env->best_task) 1528 1533 put_task_struct(env->best_task); 1529 1534 if (p) ··· 1578 1553 } 1579 1554 1580 1555 /* 1556 + * Maximum NUMA importance can be 1998 (2*999); 1557 + * SMALLIMP @ 30 would be close to 1998/64. 1558 + * Used to deter task migration. 
1559 + */ 1560 + #define SMALLIMP 30 1561 + 1562 + /* 1581 1563 * This checks if the overall compute and NUMA accesses of the system would 1582 1564 * be improved if the source tasks was migrated to the target dst_cpu taking 1583 1565 * into account that it might be best if task running on the dst_cpu should ··· 1601 1569 long moveimp = imp; 1602 1570 int dist = env->dist; 1603 1571 1572 + if (READ_ONCE(dst_rq->numa_migrate_on)) 1573 + return; 1574 + 1604 1575 rcu_read_lock(); 1605 1576 cur = task_rcu_dereference(&dst_rq->curr); 1606 1577 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) ··· 1617 1582 goto unlock; 1618 1583 1619 1584 if (!cur) { 1620 - if (maymove || imp > env->best_imp) 1585 + if (maymove && moveimp >= env->best_imp) 1621 1586 goto assign; 1622 1587 else 1623 1588 goto unlock; ··· 1660 1625 task_weight(cur, env->dst_nid, dist); 1661 1626 } 1662 1627 1663 - if (imp <= env->best_imp) 1664 - goto unlock; 1665 - 1666 1628 if (maymove && moveimp > imp && moveimp > env->best_imp) { 1667 - imp = moveimp - 1; 1629 + imp = moveimp; 1668 1630 cur = NULL; 1669 1631 goto assign; 1670 1632 } 1633 + 1634 + /* 1635 + * If the NUMA importance is less than SMALLIMP, 1636 + * task migration might only result in ping pong 1637 + * of tasks and also hurt performance due to cache 1638 + * misses. 1639 + */ 1640 + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) 1641 + goto unlock; 1671 1642 1672 1643 /* 1673 1644 * In the overloaded case, try and keep the load balanced. ··· 1751 1710 .best_cpu = -1, 1752 1711 }; 1753 1712 struct sched_domain *sd; 1713 + struct rq *best_rq; 1754 1714 unsigned long taskweight, groupweight; 1755 1715 int nid, ret, dist; 1756 1716 long taskimp, groupimp; ··· 1847 1805 if (env.best_cpu == -1) 1848 1806 return -EAGAIN; 1849 1807 1850 - /* 1851 - * Reset the scan period if the task is being rescheduled on an 1852 - * alternative node to recheck if the tasks is now properly placed. 
1853 - */ 1854 - p->numa_scan_period = task_scan_start(p); 1855 - 1808 + best_rq = cpu_rq(env.best_cpu); 1856 1809 if (env.best_task == NULL) { 1857 1810 ret = migrate_task_to(p, env.best_cpu); 1811 + WRITE_ONCE(best_rq->numa_migrate_on, 0); 1858 1812 if (ret != 0) 1859 1813 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); 1860 1814 return ret; 1861 1815 } 1862 1816 1863 1817 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); 1818 + WRITE_ONCE(best_rq->numa_migrate_on, 0); 1864 1819 1865 1820 if (ret != 0) 1866 1821 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); ··· 2635 2596 } 2636 2597 } 2637 2598 2599 + static void update_scan_period(struct task_struct *p, int new_cpu) 2600 + { 2601 + int src_nid = cpu_to_node(task_cpu(p)); 2602 + int dst_nid = cpu_to_node(new_cpu); 2603 + 2604 + if (!static_branch_likely(&sched_numa_balancing)) 2605 + return; 2606 + 2607 + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) 2608 + return; 2609 + 2610 + if (src_nid == dst_nid) 2611 + return; 2612 + 2613 + /* 2614 + * Allow resets if faults have been trapped before one scan 2615 + * has completed. This is most likely due to a new task that 2616 + * is pulled cross-node due to wakeups or load balancing. 2617 + */ 2618 + if (p->numa_scan_seq) { 2619 + /* 2620 + * Avoid scan adjustments if moving to the preferred 2621 + * node or if the task was not previously running on 2622 + * the preferred node. 
2623 + */ 2624 + if (dst_nid == p->numa_preferred_nid || 2625 + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) 2626 + return; 2627 + } 2628 + 2629 + p->numa_scan_period = task_scan_start(p); 2630 + } 2631 + 2638 2632 #else 2639 2633 static void task_tick_numa(struct rq *rq, struct task_struct *curr) 2640 2634 { ··· 2678 2606 } 2679 2607 2680 2608 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 2609 + { 2610 + } 2611 + 2612 + static inline void update_scan_period(struct task_struct *p, int new_cpu) 2681 2613 { 2682 2614 } 2683 2615 ··· 6351 6275 * cfs_rq_of(p) references at time of call are still valid and identify the 6352 6276 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6353 6277 */ 6354 - static void migrate_task_rq_fair(struct task_struct *p) 6278 + static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) 6355 6279 { 6356 6280 /* 6357 6281 * As blocked tasks retain absolute vruntime the migration needs to ··· 6404 6328 6405 6329 /* We have migrated, no longer consider this task hot */ 6406 6330 p->se.exec_start = 0; 6331 + 6332 + update_scan_period(p, new_cpu); 6407 6333 } 6408 6334 6409 6335 static void task_dead_fair(struct task_struct *p)
+2 -1
kernel/sched/sched.h
··· 783 783 #ifdef CONFIG_NUMA_BALANCING 784 784 unsigned int nr_numa_running; 785 785 unsigned int nr_preferred_running; 786 + unsigned int numa_migrate_on; 786 787 #endif 787 788 #define CPU_LOAD_IDX_MAX 5 788 789 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; ··· 1524 1523 1525 1524 #ifdef CONFIG_SMP 1526 1525 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1527 - void (*migrate_task_rq)(struct task_struct *p); 1526 + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); 1528 1527 1529 1528 void (*task_woken)(struct rq *this_rq, struct task_struct *task); 1530 1529
-57
mm/migrate.c
··· 1855 1855 return newpage; 1856 1856 } 1857 1857 1858 - /* 1859 - * page migration rate limiting control. 1860 - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs 1861 - * window of time. Default here says do not migrate more than 1280M per second. 1862 - */ 1863 - static unsigned int migrate_interval_millisecs __read_mostly = 100; 1864 - static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); 1865 - 1866 - /* Returns true if the node is migrate rate-limited after the update */ 1867 - static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1868 - unsigned long nr_pages) 1869 - { 1870 - /* 1871 - * Rate-limit the amount of data that is being migrated to a node. 1872 - * Optimal placement is no good if the memory bus is saturated and 1873 - * all the time is being spent migrating! 1874 - */ 1875 - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1876 - spin_lock(&pgdat->numabalancing_migrate_lock); 1877 - pgdat->numabalancing_migrate_nr_pages = 0; 1878 - pgdat->numabalancing_migrate_next_window = jiffies + 1879 - msecs_to_jiffies(migrate_interval_millisecs); 1880 - spin_unlock(&pgdat->numabalancing_migrate_lock); 1881 - } 1882 - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { 1883 - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, 1884 - nr_pages); 1885 - return true; 1886 - } 1887 - 1888 - /* 1889 - * This is an unlocked non-atomic update so errors are possible. 1890 - * The consequences are failing to migrate when we potentiall should 1891 - * have which is not severe enough to warrant locking. If it is ever 1892 - * a problem, it can be converted to a per-cpu counter. 
1893 - */ 1894 - pgdat->numabalancing_migrate_nr_pages += nr_pages; 1895 - return false; 1896 - } 1897 - 1898 1858 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1899 1859 { 1900 1860 int page_lru; ··· 1927 1967 if (page_is_file_cache(page) && PageDirty(page)) 1928 1968 goto out; 1929 1969 1930 - /* 1931 - * Rate-limit the amount of data that is being migrated to a node. 1932 - * Optimal placement is no good if the memory bus is saturated and 1933 - * all the time is being spent migrating! 1934 - */ 1935 - if (numamigrate_update_ratelimit(pgdat, 1)) 1936 - goto out; 1937 - 1938 1970 isolated = numamigrate_isolate_page(pgdat, page); 1939 1971 if (!isolated) 1940 1972 goto out; ··· 1972 2020 int page_lru = page_is_file_cache(page); 1973 2021 unsigned long mmun_start = address & HPAGE_PMD_MASK; 1974 2022 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; 1975 - 1976 - /* 1977 - * Rate-limit the amount of data that is being migrated to a node. 1978 - * Optimal placement is no good if the memory bus is saturated and 1979 - * all the time is being spent migrating! 1980 - */ 1981 - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) 1982 - goto out_dropref; 1983 2023 1984 2024 new_page = alloc_pages_node(node, 1985 2025 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), ··· 2069 2125 2070 2126 out_fail: 2071 2127 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 2072 - out_dropref: 2073 2128 ptl = pmd_lock(mm, pmd); 2074 2129 if (pmd_same(*pmd, entry)) { 2075 2130 entry = pmd_modify(entry, vma->vm_page_prot);
-2
mm/page_alloc.c
··· 6197 6197 static void pgdat_init_numabalancing(struct pglist_data *pgdat) 6198 6198 { 6199 6199 spin_lock_init(&pgdat->numabalancing_migrate_lock); 6200 - pgdat->numabalancing_migrate_nr_pages = 0; 6201 - pgdat->numabalancing_migrate_next_window = jiffies; 6202 6200 } 6203 6201 #else 6204 6202 static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}