Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/ipoib: Fix ABBA deadlock with ipoib_reap_ah()

ipoib_mcast_carrier_on_task() insanely open codes a rtnl_lock() such that
the only time flush_workqueue() can be called is if it also clears
IPOIB_FLAG_OPER_UP.

Thus the flush inside ipoib_flush_ah() will deadlock if it gets unlucky
enough, and lockdep doesn't help us to find it early:

CPU0 CPU1 CPU2
__ipoib_ib_dev_flush()
down_read(vlan_rwsem)

ipoib_vlan_add()
rtnl_trylock()
down_write(vlan_rwsem)

ipoib_mcast_carrier_on_task()
while (!rtnl_trylock())
msleep(20);

ipoib_flush_ah()
flush_workqueue(priv->wq)

Clean up the ah_reaper related functions and lifecycle to make sense:

- Start/Stop of the reaper should only be done in open/stop NDOs, not in
any other places

- cancel and flush of the reaper should only happen in the stop NDO.
cancel is only functional when combined with IPOIB_STOP_REAPER.

- Non-stop places were flushing the AH's just need to flush out dead AH's
synchronously and ignore the background task completely. It is fully
locked and harmless to leave running.

Which ultimately fixes the ABBA deadlock by removing the unnecessary
flush_workqueue() from the problematic place under the vlan_rwsem.

Fixes: efc82eeeae4e ("IB/ipoib: No longer use flush as a parameter")
Link: https://lore.kernel.org/r/20200625174219.290842-1-kamalheib1@gmail.com
Reported-by: Kamal Heib <kheib@redhat.com>
Tested-by: Kamal Heib <kheib@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

+31 -36
+29 -36
drivers/infiniband/ulp/ipoib/ipoib_ib.c
··· 670 670 return rc; 671 671 } 672 672 673 - static void __ipoib_reap_ah(struct net_device *dev) 673 + static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv) 674 674 { 675 - struct ipoib_dev_priv *priv = ipoib_priv(dev); 676 675 struct ipoib_ah *ah, *tah; 677 676 unsigned long flags; 678 677 679 - netif_tx_lock_bh(dev); 678 + netif_tx_lock_bh(priv->dev); 680 679 spin_lock_irqsave(&priv->lock, flags); 681 680 682 681 list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) ··· 686 687 } 687 688 688 689 spin_unlock_irqrestore(&priv->lock, flags); 689 - netif_tx_unlock_bh(dev); 690 + netif_tx_unlock_bh(priv->dev); 690 691 } 691 692 692 693 void ipoib_reap_ah(struct work_struct *work) 693 694 { 694 695 struct ipoib_dev_priv *priv = 695 696 container_of(work, struct ipoib_dev_priv, ah_reap_task.work); 696 - struct net_device *dev = priv->dev; 697 697 698 - __ipoib_reap_ah(dev); 698 + ipoib_reap_dead_ahs(priv); 699 699 700 700 if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) 701 701 queue_delayed_work(priv->wq, &priv->ah_reap_task, 702 702 round_jiffies_relative(HZ)); 703 703 } 704 704 705 - static void ipoib_flush_ah(struct net_device *dev) 705 + static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv) 706 706 { 707 - struct ipoib_dev_priv *priv = ipoib_priv(dev); 708 - 709 - cancel_delayed_work(&priv->ah_reap_task); 710 - flush_workqueue(priv->wq); 711 - ipoib_reap_ah(&priv->ah_reap_task.work); 707 + clear_bit(IPOIB_STOP_REAPER, &priv->flags); 708 + queue_delayed_work(priv->wq, &priv->ah_reap_task, 709 + round_jiffies_relative(HZ)); 712 710 } 713 711 714 - static void ipoib_stop_ah(struct net_device *dev) 712 + static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv) 715 713 { 716 - struct ipoib_dev_priv *priv = ipoib_priv(dev); 717 - 718 714 set_bit(IPOIB_STOP_REAPER, &priv->flags); 719 - ipoib_flush_ah(dev); 715 + cancel_delayed_work(&priv->ah_reap_task); 716 + /* 717 + * After ipoib_stop_ah_reaper() we always go through 718 + * ipoib_reap_dead_ahs() which ensures the work is really stopped and 719 + * does a final flush out of the dead_ah's list 720 + */ 720 721 } 721 722 722 723 static int recvs_pending(struct net_device *dev) ··· 845 846 return 0; 846 847 } 847 848 848 - void ipoib_ib_dev_stop(struct net_device *dev) 849 - { 850 - struct ipoib_dev_priv *priv = ipoib_priv(dev); 851 - 852 - priv->rn_ops->ndo_stop(dev); 853 - 854 - clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); 855 - ipoib_flush_ah(dev); 856 - } 857 - 858 849 int ipoib_ib_dev_open_default(struct net_device *dev) 859 850 { 860 851 struct ipoib_dev_priv *priv = ipoib_priv(dev); ··· 888 899 return -1; 889 900 } 890 901 891 - clear_bit(IPOIB_STOP_REAPER, &priv->flags); 892 - queue_delayed_work(priv->wq, &priv->ah_reap_task, 893 - round_jiffies_relative(HZ)); 894 - 902 + ipoib_start_ah_reaper(priv); 895 903 if (priv->rn_ops->ndo_open(dev)) { 896 904 pr_warn("%s: Failed to open dev\n", dev->name); 897 905 goto dev_stop; ··· 899 913 return 0; 900 914 901 915 dev_stop: 902 - set_bit(IPOIB_STOP_REAPER, &priv->flags); 903 - cancel_delayed_work(&priv->ah_reap_task); 904 - set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); 905 - ipoib_ib_dev_stop(dev); 916 + ipoib_stop_ah_reaper(priv); 906 917 return -1; 918 + } 919 + 920 + void ipoib_ib_dev_stop(struct net_device *dev) 921 + { 922 + struct ipoib_dev_priv *priv = ipoib_priv(dev); 923 + 924 + priv->rn_ops->ndo_stop(dev); 925 + 926 + clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); 927 + ipoib_stop_ah_reaper(priv); 907 928 } 908 929 909 930 void ipoib_pkey_dev_check_presence(struct net_device *dev) ··· 1223 1230 ipoib_mcast_dev_flush(dev); 1224 1231 if (oper_up) 1225 1232 set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); 1226 - ipoib_flush_ah(dev); 1233 + ipoib_reap_dead_ahs(priv); 1227 1234 } 1228 1235 1229 1236 if (level >= IPOIB_FLUSH_NORMAL) ··· 1298 1305 * the neighbor garbage collection is stopped and reaped. 1299 1306 * That should all be done now, so make a final ah flush. 1300 1307 */ 1301 - ipoib_stop_ah(dev); 1308 + ipoib_reap_dead_ahs(priv); 1302 1309 1303 1310 clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); 1304 1311
+2
drivers/infiniband/ulp/ipoib/ipoib_main.c
··· 1976 1976 1977 1977 /* no more works over the priv->wq */ 1978 1978 if (priv->wq) { 1979 + /* See ipoib_mcast_carrier_on_task() */ 1980 + WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)); 1979 1981 flush_workqueue(priv->wq); 1980 1982 destroy_workqueue(priv->wq); 1981 1983 priv->wq = NULL;