Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:
"A bunch of fixes, mostly for existing code and going to stable.

Our memory hot-unplug path wasn't flushing the cache before removing
memory. That is a problem now that we are doing memory hotplug on bare
metal.

Three fixes for the NPU code that supports devices connected via
NVLink (i.e. GPUs). The main one tweaks the TLB flush algorithm to
avoid soft lockups for large flushes.

A fix for our memory error handling, where we would loop infinitely,
returning to the bad access and hard locking up the CPU.

Fixes for the OPAL RTC driver, which wasn't handling some error cases
correctly.

A fix for a hardlockup in the powernv cpufreq driver.

And finally two fixes to our smp_send_stop(), required due to a recent
change to use it on shutdown.

Thanks to: Alistair Popple, Balbir Singh, Laurentiu Tudor, Mahesh
Salgaonkar, Mark Hairgrove, Nicholas Piggin, Rashmica Gupta, Shilpasri
G Bhat"

* tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
powerpc/kvm/booke: Fix altivec related build break
powerpc: Fix deadlock with multiple calls to smp_send_stop
cpufreq: powernv: Fix hardlockup due to synchronous smp_call in timer interrupt
powerpc: Fix smp_send_stop NMI IPI handling
rtc: opal: Fix OPAL RTC driver OPAL_BUSY loops
powerpc/mce: Fix a bug where mce loops on memory UE.
powerpc/powernv/npu: Do a PID GPU TLB flush when invalidating a large address range
powerpc/powernv/npu: Prevent overwriting of pnv_npu2_init_contex() callback parameters
powerpc/powernv/npu: Add lock to prevent race in concurrent context init/destroy
powerpc/powernv/memtrace: Let the arch hotunplug code flush cache
powerpc/mm: Flush cache on memory hot(un)plug

+166 -65
+1 -1
arch/powerpc/include/asm/powernv.h
··· 15 15 extern void powernv_set_nmmu_ptcr(unsigned long ptcr); 16 16 extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, 17 17 unsigned long flags, 18 - struct npu_context *(*cb)(struct npu_context *, void *), 18 + void (*cb)(struct npu_context *, void *), 19 19 void *priv); 20 20 extern void pnv_npu2_destroy_context(struct npu_context *context, 21 21 struct pci_dev *gpdev);
+2 -5
arch/powerpc/kernel/mce_power.c
··· 441 441 if (pfn != ULONG_MAX) { 442 442 *phys_addr = 443 443 (pfn << PAGE_SHIFT); 444 - handled = 1; 445 444 } 446 445 } 447 446 } ··· 531 532 * kernel/exception-64s.h 532 533 */ 533 534 if (get_paca()->in_mce < MAX_MCE_DEPTH) 534 - if (!mce_find_instr_ea_and_pfn(regs, addr, 535 - phys_addr)) 536 - handled = 1; 535 + mce_find_instr_ea_and_pfn(regs, addr, phys_addr); 537 536 } 538 537 found = 1; 539 538 } ··· 569 572 const struct mce_ierror_table itable[]) 570 573 { 571 574 struct mce_error_info mce_err = { 0 }; 572 - uint64_t addr, phys_addr; 575 + uint64_t addr, phys_addr = ULONG_MAX; 573 576 uint64_t srr1 = regs->msr; 574 577 long handled; 575 578
+42 -7
arch/powerpc/kernel/smp.c
··· 566 566 #endif 567 567 568 568 #ifdef CONFIG_NMI_IPI 569 - static void stop_this_cpu(struct pt_regs *regs) 570 - #else 569 + static void nmi_stop_this_cpu(struct pt_regs *regs) 570 + { 571 + /* 572 + * This is a special case because it never returns, so the NMI IPI 573 + * handling would never mark it as done, which makes any later 574 + * smp_send_nmi_ipi() call spin forever. Mark it done now. 575 + * 576 + * IRQs are already hard disabled by the smp_handle_nmi_ipi. 577 + */ 578 + nmi_ipi_lock(); 579 + nmi_ipi_busy_count--; 580 + nmi_ipi_unlock(); 581 + 582 + /* Remove this CPU */ 583 + set_cpu_online(smp_processor_id(), false); 584 + 585 + spin_begin(); 586 + while (1) 587 + spin_cpu_relax(); 588 + } 589 + 590 + void smp_send_stop(void) 591 + { 592 + smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, nmi_stop_this_cpu, 1000000); 593 + } 594 + 595 + #else /* CONFIG_NMI_IPI */ 596 + 571 597 static void stop_this_cpu(void *dummy) 572 - #endif 573 598 { 574 599 /* Remove this CPU */ 575 600 set_cpu_online(smp_processor_id(), false); ··· 607 582 608 583 void smp_send_stop(void) 609 584 { 610 - #ifdef CONFIG_NMI_IPI 611 - smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, stop_this_cpu, 1000000); 612 - #else 585 + static bool stopped = false; 586 + 587 + /* 588 + * Prevent waiting on csd lock from a previous smp_send_stop. 589 + * This is racy, but in general callers try to do the right 590 + * thing and only fire off one smp_send_stop (e.g., see 591 + * kernel/panic.c) 592 + */ 593 + if (stopped) 594 + return; 595 + 596 + stopped = true; 597 + 613 598 smp_call_function(stop_this_cpu, NULL, 0); 614 - #endif 615 599 } 600 + #endif /* CONFIG_NMI_IPI */ 616 601 617 602 struct thread_info *current_set[NR_CPUS]; 618 603
+7
arch/powerpc/kvm/booke.c
··· 305 305 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL); 306 306 } 307 307 308 + #ifdef CONFIG_ALTIVEC 309 + void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu) 310 + { 311 + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL); 312 + } 313 + #endif 314 + 308 315 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) 309 316 { 310 317 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
+2
arch/powerpc/mm/mem.c
··· 133 133 start, start + size, rc); 134 134 return -EFAULT; 135 135 } 136 + flush_inval_dcache_range(start, start + size); 136 137 137 138 return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 138 139 } ··· 160 159 161 160 /* Remove htab bolted mappings for this section of memory */ 162 161 start = (unsigned long)__va(start); 162 + flush_inval_dcache_range(start, start + size); 163 163 ret = remove_section_mapping(start, start + size); 164 164 165 165 /* Ensure all vmalloc mappings are flushed in case they also
-17
arch/powerpc/platforms/powernv/memtrace.c
··· 82 82 .open = simple_open, 83 83 }; 84 84 85 - static void flush_memory_region(u64 base, u64 size) 86 - { 87 - unsigned long line_size = ppc64_caches.l1d.size; 88 - u64 end = base + size; 89 - u64 addr; 90 - 91 - base = round_down(base, line_size); 92 - end = round_up(end, line_size); 93 - 94 - for (addr = base; addr < end; addr += line_size) 95 - asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory"); 96 - } 97 - 98 85 static int check_memblock_online(struct memory_block *mem, void *arg) 99 86 { 100 87 if (mem->state != MEM_ONLINE) ··· 118 131 119 132 walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, 120 133 change_memblock_state); 121 - 122 - /* RCU grace period? */ 123 - flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), 124 - nr_pages << PAGE_SHIFT); 125 134 126 135 lock_device_hotplug(); 127 136 remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+73 -15
arch/powerpc/platforms/powernv/npu-dma.c
··· 34 34 #define npu_to_phb(x) container_of(x, struct pnv_phb, npu) 35 35 36 36 /* 37 + * spinlock to protect initialisation of an npu_context for a particular 38 + * mm_struct. 39 + */ 40 + static DEFINE_SPINLOCK(npu_context_lock); 41 + 42 + /* 43 + * When an address shootdown range exceeds this threshold we invalidate the 44 + * entire TLB on the GPU for the given PID rather than each specific address in 45 + * the range. 46 + */ 47 + #define ATSD_THRESHOLD (2*1024*1024) 48 + 49 + /* 37 50 * Other types of TCE cache invalidation are not functional in the 38 51 * hardware. 39 52 */ ··· 414 401 bool nmmu_flush; 415 402 416 403 /* Callback to stop translation requests on a given GPU */ 417 - struct npu_context *(*release_cb)(struct npu_context *, void *); 404 + void (*release_cb)(struct npu_context *context, void *priv); 418 405 419 406 /* 420 407 * Private pointer passed to the above callback for usage by ··· 684 671 struct npu_context *npu_context = mn_to_npu_context(mn); 685 672 unsigned long address; 686 673 687 - for (address = start; address < end; address += PAGE_SIZE) 688 - mmio_invalidate(npu_context, 1, address, false); 674 + if (end - start > ATSD_THRESHOLD) { 675 + /* 676 + * Just invalidate the entire PID if the address range is too 677 + * large. 678 + */ 679 + mmio_invalidate(npu_context, 0, 0, true); 680 + } else { 681 + for (address = start; address < end; address += PAGE_SIZE) 682 + mmio_invalidate(npu_context, 1, address, false); 689 683 690 - /* Do the flush only on the final addess == end */ 691 - mmio_invalidate(npu_context, 1, address, true); 684 + /* Do the flush only on the final addess == end */ 685 + mmio_invalidate(npu_context, 1, address, true); 686 + } 692 687 } 693 688 694 689 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { ··· 717 696 * Returns an error if there no contexts are currently available or a 718 697 * npu_context which should be passed to pnv_npu2_handle_fault(). 
719 698 * 720 - * mmap_sem must be held in write mode. 699 + * mmap_sem must be held in write mode and must not be called from interrupt 700 + * context. 721 701 */ 722 702 struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, 723 703 unsigned long flags, 724 - struct npu_context *(*cb)(struct npu_context *, void *), 704 + void (*cb)(struct npu_context *, void *), 725 705 void *priv) 726 706 { 727 707 int rc; ··· 765 743 /* 766 744 * Setup the NPU context table for a particular GPU. These need to be 767 745 * per-GPU as we need the tables to filter ATSDs when there are no 768 - * active contexts on a particular GPU. 746 + * active contexts on a particular GPU. It is safe for these to be 747 + * called concurrently with destroy as the OPAL call takes appropriate 748 + * locks and refcounts on init/destroy. 769 749 */ 770 750 rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, 771 751 PCI_DEVID(gpdev->bus->number, gpdev->devfn)); ··· 778 754 * We store the npu pci device so we can more easily get at the 779 755 * associated npus. 780 756 */ 757 + spin_lock(&npu_context_lock); 781 758 npu_context = mm->context.npu_context; 759 + if (npu_context) { 760 + if (npu_context->release_cb != cb || 761 + npu_context->priv != priv) { 762 + spin_unlock(&npu_context_lock); 763 + opal_npu_destroy_context(nphb->opal_id, mm->context.id, 764 + PCI_DEVID(gpdev->bus->number, 765 + gpdev->devfn)); 766 + return ERR_PTR(-EINVAL); 767 + } 768 + 769 + WARN_ON(!kref_get_unless_zero(&npu_context->kref)); 770 + } 771 + spin_unlock(&npu_context_lock); 772 + 782 773 if (!npu_context) { 774 + /* 775 + * We can set up these fields without holding the 776 + * npu_context_lock as the npu_context hasn't been returned to 777 + * the caller meaning it can't be destroyed. Parallel allocation 778 + * is protected against by mmap_sem. 
779 + */ 783 780 rc = -ENOMEM; 784 781 npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); 785 782 if (npu_context) { ··· 819 774 } 820 775 821 776 mm->context.npu_context = npu_context; 822 - } else { 823 - WARN_ON(!kref_get_unless_zero(&npu_context->kref)); 824 777 } 825 778 826 779 npu_context->release_cb = cb; ··· 857 814 mm_context_remove_copro(npu_context->mm); 858 815 859 816 npu_context->mm->context.npu_context = NULL; 860 - mmu_notifier_unregister(&npu_context->mn, 861 - npu_context->mm); 862 - 863 - kfree(npu_context); 864 817 } 865 818 819 + /* 820 + * Destroy a context on the given GPU. May free the npu_context if it is no 821 + * longer active on any GPUs. Must not be called from interrupt context. 822 + */ 866 823 void pnv_npu2_destroy_context(struct npu_context *npu_context, 867 824 struct pci_dev *gpdev) 868 825 { 826 + int removed; 869 827 struct pnv_phb *nphb; 870 828 struct npu *npu; 871 829 struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); ··· 888 844 WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL); 889 845 opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id, 890 846 PCI_DEVID(gpdev->bus->number, gpdev->devfn)); 891 - kref_put(&npu_context->kref, pnv_npu2_release_context); 847 + spin_lock(&npu_context_lock); 848 + removed = kref_put(&npu_context->kref, pnv_npu2_release_context); 849 + spin_unlock(&npu_context_lock); 850 + 851 + /* 852 + * We need to do this outside of pnv_npu2_release_context so that it is 853 + * outside the spinlock as mmu_notifier_destroy uses SRCU. 854 + */ 855 + if (removed) { 856 + mmu_notifier_unregister(&npu_context->mn, 857 + npu_context->mm); 858 + 859 + kfree(npu_context); 860 + } 861 + 892 862 } 893 863 EXPORT_SYMBOL(pnv_npu2_destroy_context); 894 864
+5 -3
arch/powerpc/platforms/powernv/opal-rtc.c
··· 48 48 49 49 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { 50 50 rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); 51 - if (rc == OPAL_BUSY_EVENT) 51 + if (rc == OPAL_BUSY_EVENT) { 52 + mdelay(OPAL_BUSY_DELAY_MS); 52 53 opal_poll_events(NULL); 53 - else if (rc == OPAL_BUSY) 54 - mdelay(10); 54 + } else if (rc == OPAL_BUSY) { 55 + mdelay(OPAL_BUSY_DELAY_MS); 56 + } 55 57 } 56 58 if (rc != OPAL_SUCCESS) 57 59 return 0;
+11 -3
drivers/cpufreq/powernv-cpufreq.c
··· 679 679 680 680 if (!spin_trylock(&gpstates->gpstate_lock)) 681 681 return; 682 + /* 683 + * If the timer has migrated to the different cpu then bring 684 + * it back to one of the policy->cpus 685 + */ 686 + if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) { 687 + gpstates->timer.expires = jiffies + msecs_to_jiffies(1); 688 + add_timer_on(&gpstates->timer, cpumask_first(policy->cpus)); 689 + spin_unlock(&gpstates->gpstate_lock); 690 + return; 691 + } 682 692 683 693 /* 684 694 * If PMCR was last updated was using fast_swtich then ··· 728 718 if (gpstate_idx != gpstates->last_lpstate_idx) 729 719 queue_gpstate_timer(gpstates); 730 720 721 + set_pstate(&freq_data); 731 722 spin_unlock(&gpstates->gpstate_lock); 732 - 733 - /* Timer may get migrated to a different cpu on cpu hot unplug */ 734 - smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1); 735 723 } 736 724 737 725 /*
+23 -14
drivers/rtc/rtc-opal.c
··· 57 57 58 58 static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm) 59 59 { 60 - long rc = OPAL_BUSY; 60 + s64 rc = OPAL_BUSY; 61 61 int retries = 10; 62 62 u32 y_m_d; 63 63 u64 h_m_s_ms; ··· 66 66 67 67 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { 68 68 rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); 69 - if (rc == OPAL_BUSY_EVENT) 69 + if (rc == OPAL_BUSY_EVENT) { 70 + msleep(OPAL_BUSY_DELAY_MS); 70 71 opal_poll_events(NULL); 71 - else if (retries-- && (rc == OPAL_HARDWARE 72 - || rc == OPAL_INTERNAL_ERROR)) 73 - msleep(10); 74 - else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT) 75 - break; 72 + } else if (rc == OPAL_BUSY) { 73 + msleep(OPAL_BUSY_DELAY_MS); 74 + } else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) { 75 + if (retries--) { 76 + msleep(10); /* Wait 10ms before retry */ 77 + rc = OPAL_BUSY; /* go around again */ 78 + } 79 + } 76 80 } 77 81 78 82 if (rc != OPAL_SUCCESS) ··· 91 87 92 88 static int opal_set_rtc_time(struct device *dev, struct rtc_time *tm) 93 89 { 94 - long rc = OPAL_BUSY; 90 + s64 rc = OPAL_BUSY; 95 91 int retries = 10; 96 92 u32 y_m_d = 0; 97 93 u64 h_m_s_ms = 0; 98 94 99 95 tm_to_opal(tm, &y_m_d, &h_m_s_ms); 96 + 100 97 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { 101 98 rc = opal_rtc_write(y_m_d, h_m_s_ms); 102 - if (rc == OPAL_BUSY_EVENT) 99 + if (rc == OPAL_BUSY_EVENT) { 100 + msleep(OPAL_BUSY_DELAY_MS); 103 101 opal_poll_events(NULL); 104 - else if (retries-- && (rc == OPAL_HARDWARE 105 - || rc == OPAL_INTERNAL_ERROR)) 106 - msleep(10); 107 - else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT) 108 - break; 102 + } else if (rc == OPAL_BUSY) { 103 + msleep(OPAL_BUSY_DELAY_MS); 104 + } else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) { 105 + if (retries--) { 106 + msleep(10); /* Wait 10ms before retry */ 107 + rc = OPAL_BUSY; /* go around again */ 108 + } 109 + } 109 110 } 110 111 111 112 return rc == OPAL_SUCCESS ? 0 : -EIO;