Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

NTB: ntb_transport: Add 'tx_memcpy_offload' module option

Some platforms (e.g. R-Car S4) do not gain from using a DMAC on the TX
path in ntb_transport and end up CPU-bound on memcpy_toio(). Add a module
parameter 'tx_memcpy_offload' that moves the TX memcpy_toio() and
descriptor writes to a per-QP kernel thread. It is disabled by default.

This change also fixes a rare ordering hazard in ntb_tx_copy_callback()
that was observed on R-Car S4 once throughput improved with the new
module parameter: the DONE flag write to the peer MW, which is WC
mapped, could be observed after the DB/MSI trigger. Both operations are
posted PCIe MWr (often via different OB iATUs), so WC buffering and
bridges may reorder visibility. Insert dma_mb() to enforce store->load
ordering and then read back hdr->flags to flush the posted write before
ringing the doorbell / issuing MSI.

While at it, update tx_index with WRITE_ONCE() at the earliest possible
location to make ntb_transport_tx_free_entry() robust.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
Signed-off-by: Jon Mason <jdmason@kudzu.us>

authored by

Koichiro Den and committed by
Jon Mason
322617a0 b36490b5

+100 -4
+100 -4
drivers/ntb/ntb_transport.c
··· 54 54 #include <linux/errno.h> 55 55 #include <linux/export.h> 56 56 #include <linux/interrupt.h> 57 + #include <linux/kthread.h> 57 58 #include <linux/module.h> 58 59 #include <linux/pci.h> 59 60 #include <linux/slab.h> 60 61 #include <linux/types.h> 61 62 #include <linux/uaccess.h> 62 63 #include <linux/mutex.h> 64 + #include <linux/wait.h> 63 65 #include "linux/ntb.h" 64 66 #include "linux/ntb_transport.h" 65 67 ··· 101 99 module_param(use_msi, bool, 0644); 102 100 MODULE_PARM_DESC(use_msi, "Use MSI interrupts instead of doorbells"); 103 101 #endif 102 + 103 + static bool tx_memcpy_offload; 104 + module_param(tx_memcpy_offload, bool, 0644); 105 + MODULE_PARM_DESC(tx_memcpy_offload, "Offload TX memcpy_toio() to a kernel thread"); 104 106 105 107 static struct dentry *nt_debugfs_dir; 106 108 ··· 154 148 void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, 155 149 void *data, int len); 156 150 struct list_head tx_free_q; 151 + struct list_head tx_offl_q; 157 152 spinlock_t ntb_tx_free_q_lock; 153 + spinlock_t ntb_tx_offl_q_lock; 158 154 void __iomem *tx_mw; 159 155 phys_addr_t tx_mw_phys; 160 156 size_t tx_mw_size; ··· 207 199 int msi_irq; 208 200 struct ntb_msi_desc msi_desc; 209 201 struct ntb_msi_desc peer_msi_desc; 202 + 203 + struct task_struct *tx_offload_thread; 204 + wait_queue_head_t tx_offload_wq; 210 205 }; 211 206 212 207 struct ntb_transport_mw { ··· 295 284 static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset); 296 285 static int ntb_async_rx_submit(struct ntb_queue_entry *entry, void *offset); 297 286 static void ntb_memcpy_rx(struct ntb_queue_entry *entry, void *offset); 287 + static int ntb_tx_memcpy_kthread(void *data); 298 288 289 + 290 + static inline bool ntb_tx_offload_enabled(struct ntb_transport_qp *qp) 291 + { 292 + return tx_memcpy_offload && qp && qp->tx_offload_thread; 293 + } 299 294 300 295 static int ntb_transport_bus_match(struct device *dev, 301 296 const struct device_driver *drv) ··· 1271 
1254 1272 1255 spin_lock_init(&qp->ntb_rx_q_lock); 1273 1256 spin_lock_init(&qp->ntb_tx_free_q_lock); 1257 + spin_lock_init(&qp->ntb_tx_offl_q_lock); 1274 1258 1275 1259 INIT_LIST_HEAD(&qp->rx_post_q); 1276 1260 INIT_LIST_HEAD(&qp->rx_pend_q); 1277 1261 INIT_LIST_HEAD(&qp->rx_free_q); 1278 1262 INIT_LIST_HEAD(&qp->tx_free_q); 1263 + INIT_LIST_HEAD(&qp->tx_offl_q); 1279 1264 1280 1265 tasklet_init(&qp->rxc_db_work, ntb_transport_rxc_db, 1281 1266 (unsigned long)qp); ··· 1804 1785 1805 1786 iowrite32(entry->flags | DESC_DONE_FLAG, &hdr->flags); 1806 1787 1788 + /* 1789 + * Make DONE flag visible before DB/MSI. WC + posted MWr may reorder 1790 + * across iATU/bridge (platform-dependent). Order and flush here. 1791 + */ 1792 + dma_mb(); 1793 + ioread32(&hdr->flags); 1794 + 1807 1795 if (qp->use_msi) 1808 1796 ntb_msi_peer_trigger(qp->ndev, PIDX, &qp->peer_msi_desc); 1809 1797 else ··· 1831 1805 ntb_list_add(&qp->ntb_tx_free_q_lock, &entry->entry, &qp->tx_free_q); 1832 1806 } 1833 1807 1834 - static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset) 1808 + static void ntb_memcpy_tx_on_stack(struct ntb_queue_entry *entry, void __iomem *offset) 1835 1809 { 1836 1810 #ifdef ARCH_HAS_NOCACHE_UACCESS 1837 1811 /* ··· 1847 1821 wmb(); 1848 1822 1849 1823 ntb_tx_copy_callback(entry, NULL); 1824 + } 1825 + 1826 + static int ntb_tx_memcpy_kthread(void *data) 1827 + { 1828 + struct ntb_transport_qp *qp = data; 1829 + struct ntb_queue_entry *entry, *tmp; 1830 + const int resched_nr = 64; 1831 + LIST_HEAD(local_list); 1832 + void __iomem *offset; 1833 + int processed = 0; 1834 + 1835 + while (!kthread_should_stop()) { 1836 + spin_lock_irq(&qp->ntb_tx_offl_q_lock); 1837 + wait_event_interruptible_lock_irq_timeout(qp->tx_offload_wq, 1838 + kthread_should_stop() || 1839 + !list_empty(&qp->tx_offl_q), 1840 + qp->ntb_tx_offl_q_lock, 5*HZ); 1841 + list_splice_tail_init(&qp->tx_offl_q, &local_list); 1842 + spin_unlock_irq(&qp->ntb_tx_offl_q_lock); 1843 + 1844 + 
list_for_each_entry_safe(entry, tmp, &local_list, entry) { 1845 + list_del(&entry->entry); 1846 + offset = qp->tx_mw + qp->tx_max_frame * entry->tx_index; 1847 + ntb_memcpy_tx_on_stack(entry, offset); 1848 + if (++processed >= resched_nr) { 1849 + cond_resched(); 1850 + processed = 0; 1851 + } 1852 + } 1853 + cond_resched(); 1854 + } 1855 + 1856 + return 0; 1857 + } 1858 + 1859 + static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset) 1860 + { 1861 + struct ntb_transport_qp *qp = entry->qp; 1862 + 1863 + if (WARN_ON_ONCE(!qp)) 1864 + return; 1865 + 1866 + if (ntb_tx_offload_enabled(qp)) { 1867 + ntb_list_add(&qp->ntb_tx_offl_q_lock, &entry->entry, 1868 + &qp->tx_offl_q); 1869 + wake_up(&qp->tx_offload_wq); 1870 + } else 1871 + ntb_memcpy_tx_on_stack(entry, offset); 1850 1872 } 1851 1873 1852 1874 static int ntb_async_tx_submit(struct ntb_transport_qp *qp, ··· 1969 1895 hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header); 1970 1896 entry->tx_hdr = hdr; 1971 1897 1898 + WARN_ON_ONCE(!ntb_transport_tx_free_entry(qp)); 1899 + WRITE_ONCE(qp->tx_index, (qp->tx_index + 1) % qp->tx_max_entry); 1900 + 1972 1901 iowrite32(entry->len, &hdr->len); 1973 1902 iowrite32((u32)qp->tx_pkts, &hdr->ver); 1974 1903 ··· 2011 1934 } 2012 1935 2013 1936 ntb_async_tx(qp, entry); 2014 - 2015 - qp->tx_index++; 2016 - qp->tx_index %= qp->tx_max_entry; 2017 1937 2018 1938 qp->tx_pkts++; 2019 1939 ··· 2107 2033 qp->rx_handler = handlers->rx_handler; 2108 2034 qp->tx_handler = handlers->tx_handler; 2109 2035 qp->event_handler = handlers->event_handler; 2036 + 2037 + init_waitqueue_head(&qp->tx_offload_wq); 2038 + if (tx_memcpy_offload) { 2039 + qp->tx_offload_thread = kthread_run(ntb_tx_memcpy_kthread, qp, 2040 + "ntb-txcpy/%s/%u", 2041 + pci_name(ndev->pdev), qp->qp_num); 2042 + if (IS_ERR(qp->tx_offload_thread)) { 2043 + dev_warn(&nt->ndev->dev, 2044 + "tx memcpy offload thread creation failed: %ld; falling back to inline copy\n", 2045 + 
PTR_ERR(qp->tx_offload_thread)); 2046 + qp->tx_offload_thread = NULL; 2047 + } 2048 + } else 2049 + qp->tx_offload_thread = NULL; 2110 2050 2111 2051 dma_cap_zero(dma_mask); 2112 2052 dma_cap_set(DMA_MEMCPY, dma_mask); ··· 2229 2141 2230 2142 qp->active = false; 2231 2143 2144 + if (qp->tx_offload_thread) { 2145 + kthread_stop(qp->tx_offload_thread); 2146 + qp->tx_offload_thread = NULL; 2147 + } 2148 + 2232 2149 if (qp->tx_dma_chan) { 2233 2150 struct dma_chan *chan = qp->tx_dma_chan; 2234 2151 /* Putting the dma_chan to NULL will force any new traffic to be ··· 2295 2202 } 2296 2203 2297 2204 while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q))) 2205 + kfree(entry); 2206 + 2207 + while ((entry = ntb_list_rm(&qp->ntb_tx_offl_q_lock, &qp->tx_offl_q))) 2298 2208 kfree(entry); 2299 2209 2300 2210 qp->transport->qp_bitmap_free |= qp_bit;