Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

-6

include/linux/mmzone.h

··· 671 671 #ifdef CONFIG_NUMA_BALANCING 672 672 /* Lock serializing the migrate rate limiting window */ 673 673 spinlock_t numabalancing_migrate_lock; 674 - 675 - /* Rate limiting time interval */ 676 - unsigned long numabalancing_migrate_next_window; 677 - 678 - /* Number of pages migrated during the rate limiting time interval */ 679 - unsigned long numabalancing_migrate_nr_pages; 680 674 #endif 681 675 /* 682 676 * This is a per-node reserve of pages that are not available

-27

include/trace/events/migrate.h

··· 70 70 __print_symbolic(__entry->mode, MIGRATE_MODE), 71 71 __print_symbolic(__entry->reason, MIGRATE_REASON)) 72 72 ); 73 - 74 - TRACE_EVENT(mm_numa_migrate_ratelimit, 75 - 76 - TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), 77 - 78 - TP_ARGS(p, dst_nid, nr_pages), 79 - 80 - TP_STRUCT__entry( 81 - __array( char, comm, TASK_COMM_LEN) 82 - __field( pid_t, pid) 83 - __field( int, dst_nid) 84 - __field( unsigned long, nr_pages) 85 - ), 86 - 87 - TP_fast_assign( 88 - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); 89 - __entry->pid = p->pid; 90 - __entry->dst_nid = dst_nid; 91 - __entry->nr_pages = nr_pages; 92 - ), 93 - 94 - TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", 95 - __entry->comm, 96 - __entry->pid, 97 - __entry->dst_nid, 98 - __entry->nr_pages) 99 - ); 100 73 #endif /* _TRACE_MIGRATE_H */ 101 74 102 75 /* This part must be outside protection */

+1 -1

kernel/sched/core.c

··· 1167 1167 1168 1168 if (task_cpu(p) != new_cpu) { 1169 1169 if (p->sched_class->migrate_task_rq) 1170 - p->sched_class->migrate_task_rq(p); 1170 + p->sched_class->migrate_task_rq(p, new_cpu); 1171 1171 p->se.nr_migrations++; 1172 1172 rseq_migrate(p); 1173 1173 perf_event_task_migrate(p);

+1 -1

kernel/sched/deadline.c

··· 1607 1607 return cpu; 1608 1608 } 1609 1609 1610 - static void migrate_task_rq_dl(struct task_struct *p) 1610 + static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) 1611 1611 { 1612 1612 struct rq *rq; 1613 1613

+91 -13

kernel/sched/fair.c

··· 1392 1392 int last_cpupid, this_cpupid; 1393 1393 1394 1394 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); 1395 + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); 1396 + 1397 + /* 1398 + * Allow first faults or private faults to migrate immediately early in 1399 + * the lifetime of a task. The magic number 4 is based on waiting for 1400 + * two full passes of the "multi-stage node selection" test that is 1401 + * executed below. 1402 + */ 1403 + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && 1404 + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 1405 + return true; 1395 1406 1396 1407 /* 1397 1408 * Multi-stage node selection is used in conjunction with a periodic ··· 1421 1410 * This quadric squishes small probabilities, making it less likely we 1422 1411 * act on an unlikely task<->page relation. 1423 1412 */ 1424 - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); 1425 1413 if (!cpupid_pid_unset(last_cpupid) && 1426 1414 cpupid_to_nid(last_cpupid) != dst_nid) 1427 1415 return false; ··· 1524 1514 static void task_numa_assign(struct task_numa_env *env, 1525 1515 struct task_struct *p, long imp) 1526 1516 { 1517 + struct rq *rq = cpu_rq(env->dst_cpu); 1518 + 1519 + /* Bail out if run-queue part of active NUMA balance. */ 1520 + if (xchg(&rq->numa_migrate_on, 1)) 1521 + return; 1522 + 1523 + /* 1524 + * Clear previous best_cpu/rq numa-migrate flag, since task now 1525 + * found a better CPU to move/swap. 1526 + */ 1527 + if (env->best_cpu != -1) { 1528 + rq = cpu_rq(env->best_cpu); 1529 + WRITE_ONCE(rq->numa_migrate_on, 0); 1530 + } 1531 + 1527 1532 if (env->best_task) 1528 1533 put_task_struct(env->best_task); 1529 1534 if (p) ··· 1578 1553 } 1579 1554 1580 1555 /* 1556 + * Maximum NUMA importance can be 1998 (2*999); 1557 + * SMALLIMP @ 30 would be close to 1998/64. 1558 + * Used to deter task migration. 1559 + */ 1560 + #define SMALLIMP 30 1561 + 1562 + /* 1581 1563 * This checks if the overall compute and NUMA accesses of the system would 1582 1564 * be improved if the source tasks was migrated to the target dst_cpu taking 1583 1565 * into account that it might be best if task running on the dst_cpu should ··· 1601 1569 long moveimp = imp; 1602 1570 int dist = env->dist; 1603 1571 1572 + if (READ_ONCE(dst_rq->numa_migrate_on)) 1573 + return; 1574 + 1604 1575 rcu_read_lock(); 1605 1576 cur = task_rcu_dereference(&dst_rq->curr); 1606 1577 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) ··· 1617 1582 goto unlock; 1618 1583 1619 1584 if (!cur) { 1620 - if (maymove || imp > env->best_imp) 1585 + if (maymove && moveimp >= env->best_imp) 1621 1586 goto assign; 1622 1587 else 1623 1588 goto unlock; ··· 1660 1625 task_weight(cur, env->dst_nid, dist); 1661 1626 } 1662 1627 1663 - if (imp <= env->best_imp) 1664 - goto unlock; 1665 - 1666 1628 if (maymove && moveimp > imp && moveimp > env->best_imp) { 1667 - imp = moveimp - 1; 1629 + imp = moveimp; 1668 1630 cur = NULL; 1669 1631 goto assign; 1670 1632 } 1633 + 1634 + /* 1635 + * If the NUMA importance is less than SMALLIMP, 1636 + * task migration might only result in ping pong 1637 + * of tasks and also hurt performance due to cache 1638 + * misses. 1639 + */ 1640 + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) 1641 + goto unlock; 1671 1642 1672 1643 /* 1673 1644 * In the overloaded case, try and keep the load balanced. ··· 1751 1710 .best_cpu = -1, 1752 1711 }; 1753 1712 struct sched_domain *sd; 1713 + struct rq *best_rq; 1754 1714 unsigned long taskweight, groupweight; 1755 1715 int nid, ret, dist; 1756 1716 long taskimp, groupimp; ··· 1847 1805 if (env.best_cpu == -1) 1848 1806 return -EAGAIN; 1849 1807 1850 - /* 1851 - * Reset the scan period if the task is being rescheduled on an 1852 - * alternative node to recheck if the tasks is now properly placed. 1853 - */ 1854 - p->numa_scan_period = task_scan_start(p); 1855 - 1808 + best_rq = cpu_rq(env.best_cpu); 1856 1809 if (env.best_task == NULL) { 1857 1810 ret = migrate_task_to(p, env.best_cpu); 1811 + WRITE_ONCE(best_rq->numa_migrate_on, 0); 1858 1812 if (ret != 0) 1859 1813 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); 1860 1814 return ret; 1861 1815 } 1862 1816 1863 1817 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); 1818 + WRITE_ONCE(best_rq->numa_migrate_on, 0); 1864 1819 1865 1820 if (ret != 0) 1866 1821 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); ··· 2635 2596 } 2636 2597 } 2637 2598 2599 + static void update_scan_period(struct task_struct *p, int new_cpu) 2600 + { 2601 + int src_nid = cpu_to_node(task_cpu(p)); 2602 + int dst_nid = cpu_to_node(new_cpu); 2603 + 2604 + if (!static_branch_likely(&sched_numa_balancing)) 2605 + return; 2606 + 2607 + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) 2608 + return; 2609 + 2610 + if (src_nid == dst_nid) 2611 + return; 2612 + 2613 + /* 2614 + * Allow resets if faults have been trapped before one scan 2615 + * has completed. This is most likely due to a new task that 2616 + * is pulled cross-node due to wakeups or load balancing. 2617 + */ 2618 + if (p->numa_scan_seq) { 2619 + /* 2620 + * Avoid scan adjustments if moving to the preferred 2621 + * node or if the task was not previously running on 2622 + * the preferred node. 2623 + */ 2624 + if (dst_nid == p->numa_preferred_nid || 2625 + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) 2626 + return; 2627 + } 2628 + 2629 + p->numa_scan_period = task_scan_start(p); 2630 + } 2631 + 2638 2632 #else 2639 2633 static void task_tick_numa(struct rq *rq, struct task_struct *curr) 2640 2634 { ··· 2678 2606 } 2679 2607 2680 2608 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 2609 + { 2610 + } 2611 + 2612 + static inline void update_scan_period(struct task_struct *p, int new_cpu) 2681 2613 { 2682 2614 } 2683 2615 ··· 6351 6275 * cfs_rq_of(p) references at time of call are still valid and identify the 6352 6276 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6353 6277 */ 6354 - static void migrate_task_rq_fair(struct task_struct *p) 6278 + static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) 6355 6279 { 6356 6280 /* 6357 6281 * As blocked tasks retain absolute vruntime the migration needs to ··· 6404 6328 6405 6329 /* We have migrated, no longer consider this task hot */ 6406 6330 p->se.exec_start = 0; 6331 + 6332 + update_scan_period(p, new_cpu); 6407 6333 } 6408 6334 6409 6335 static void task_dead_fair(struct task_struct *p)

+2 -1

kernel/sched/sched.h

··· 783 783 #ifdef CONFIG_NUMA_BALANCING 784 784 unsigned int nr_numa_running; 785 785 unsigned int nr_preferred_running; 786 + unsigned int numa_migrate_on; 786 787 #endif 787 788 #define CPU_LOAD_IDX_MAX 5 788 789 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; ··· 1524 1523 1525 1524 #ifdef CONFIG_SMP 1526 1525 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1527 - void (*migrate_task_rq)(struct task_struct *p); 1526 + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); 1528 1527 1529 1528 void (*task_woken)(struct rq *this_rq, struct task_struct *task); 1530 1529

-57

mm/migrate.c

··· 1855 1855 return newpage; 1856 1856 } 1857 1857 1858 - /* 1859 - * page migration rate limiting control. 1860 - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs 1861 - * window of time. Default here says do not migrate more than 1280M per second. 1862 - */ 1863 - static unsigned int migrate_interval_millisecs __read_mostly = 100; 1864 - static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); 1865 - 1866 - /* Returns true if the node is migrate rate-limited after the update */ 1867 - static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1868 - unsigned long nr_pages) 1869 - { 1870 - /* 1871 - * Rate-limit the amount of data that is being migrated to a node. 1872 - * Optimal placement is no good if the memory bus is saturated and 1873 - * all the time is being spent migrating! 1874 - */ 1875 - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1876 - spin_lock(&pgdat->numabalancing_migrate_lock); 1877 - pgdat->numabalancing_migrate_nr_pages = 0; 1878 - pgdat->numabalancing_migrate_next_window = jiffies + 1879 - msecs_to_jiffies(migrate_interval_millisecs); 1880 - spin_unlock(&pgdat->numabalancing_migrate_lock); 1881 - } 1882 - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { 1883 - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, 1884 - nr_pages); 1885 - return true; 1886 - } 1887 - 1888 - /* 1889 - * This is an unlocked non-atomic update so errors are possible. 1890 - * The consequences are failing to migrate when we potentiall should 1891 - * have which is not severe enough to warrant locking. If it is ever 1892 - * a problem, it can be converted to a per-cpu counter. 1893 - */ 1894 - pgdat->numabalancing_migrate_nr_pages += nr_pages; 1895 - return false; 1896 - } 1897 - 1898 1858 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1899 1859 { 1900 1860 int page_lru; ··· 1927 1967 if (page_is_file_cache(page) && PageDirty(page)) 1928 1968 goto out; 1929 1969 1930 - /* 1931 - * Rate-limit the amount of data that is being migrated to a node. 1932 - * Optimal placement is no good if the memory bus is saturated and 1933 - * all the time is being spent migrating! 1934 - */ 1935 - if (numamigrate_update_ratelimit(pgdat, 1)) 1936 - goto out; 1937 - 1938 1970 isolated = numamigrate_isolate_page(pgdat, page); 1939 1971 if (!isolated) 1940 1972 goto out; ··· 1972 2020 int page_lru = page_is_file_cache(page); 1973 2021 unsigned long mmun_start = address & HPAGE_PMD_MASK; 1974 2022 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; 1975 - 1976 - /* 1977 - * Rate-limit the amount of data that is being migrated to a node. 1978 - * Optimal placement is no good if the memory bus is saturated and 1979 - * all the time is being spent migrating! 1980 - */ 1981 - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) 1982 - goto out_dropref; 1983 2023 1984 2024 new_page = alloc_pages_node(node, 1985 2025 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), ··· 2069 2125 2070 2126 out_fail: 2071 2127 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 2072 - out_dropref: 2073 2128 ptl = pmd_lock(mm, pmd); 2074 2129 if (pmd_same(*pmd, entry)) { 2075 2130 entry = pmd_modify(entry, vma->vm_page_prot);

-2

mm/page_alloc.c

··· 6197 6197 static void pgdat_init_numabalancing(struct pglist_data *pgdat) 6198 6198 { 6199 6199 spin_lock_init(&pgdat->numabalancing_migrate_lock); 6200 - pgdat->numabalancing_migrate_nr_pages = 0; 6201 - pgdat->numabalancing_migrate_next_window = jiffies; 6202 6200 } 6203 6201 #else 6204 6202 static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}

Configure Feed

Configure Feed