Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/iwcm: Fix workqueue list corruption by removing work_list

Commit e1168f09b331 ("RDMA/iwcm: Simplify cm_event_handler()")
changed the work submission logic to unconditionally call
queue_work() with the expectation that queue_work() would
have no effect if work was already pending. The problem is
that a free list of struct iwcm_work is used (for which
struct work_struct is embedded), so each call to queue_work()
is basically unique and therefore does indeed queue the work.

This causes a problem in the work handler which walks the work_list
until it's empty to process entries. This means that a single
run of the work handler could process item N+1 and release it
back to the free list while the actual workqueue entry is still
queued. It could then get reused (INIT_WORK...) and lead to
list corruption in the workqueue logic.

Fix this by just removing the work_list. The workqueue machinery
already provides the required queuing of pending work for us.

This fixes the following error that was observed when stress
testing with ucmatose on an Intel E830 in iWARP mode:

[ 151.465780] list_del corruption. next->prev should be ffff9f0915c69c08, but was ffff9f0a1116be08. (next=ffff9f0a15b11c08)
[ 151.466639] ------------[ cut here ]------------
[ 151.466986] kernel BUG at lib/list_debug.c:67!
[ 151.467349] Oops: invalid opcode: 0000 [#1] SMP NOPTI
[ 151.467753] CPU: 14 UID: 0 PID: 2306 Comm: kworker/u64:18 Not tainted 6.19.0-rc4+ #1 PREEMPT(voluntary)
[ 151.468466] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 151.469192] Workqueue: 0x0 (iw_cm_wq)
[ 151.469478] RIP: 0010:__list_del_entry_valid_or_report+0xf0/0x100
[ 151.469942] Code: c7 58 5f 4c b2 e8 10 50 aa ff 0f 0b 48 89 ef e8 36 57 cb ff 48 8b 55 08 48 89 e9 48 89 de 48 c7 c7 a8 5f 4c b2 e8 f0 4f aa ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90
[ 151.471323] RSP: 0000:ffffb15644e7bd68 EFLAGS: 00010046
[ 151.471712] RAX: 000000000000006d RBX: ffff9f0915c69c08 RCX: 0000000000000027
[ 151.472243] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9f0a37d9c600
[ 151.472768] RBP: ffff9f0a15b11c08 R08: 0000000000000000 R09: c0000000ffff7fff
[ 151.473294] R10: 0000000000000001 R11: ffffb15644e7bba8 R12: ffff9f092339ee68
[ 151.473817] R13: ffff9f0900059c28 R14: ffff9f092339ee78 R15: 0000000000000000
[ 151.474344] FS: 0000000000000000(0000) GS:ffff9f0a847b5000(0000) knlGS:0000000000000000
[ 151.474934] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 151.475362] CR2: 0000559e233a9088 CR3: 000000020296b004 CR4: 0000000000770ef0
[ 151.475895] PKRU: 55555554
[ 151.476118] Call Trace:
[ 151.476331] <TASK>
[ 151.476497] move_linked_works+0x49/0xa0
[ 151.476792] __pwq_activate_work.isra.46+0x2f/0xa0
[ 151.477151] pwq_dec_nr_in_flight+0x1e0/0x2f0
[ 151.477479] process_scheduled_works+0x1c8/0x410
[ 151.477823] worker_thread+0x125/0x260
[ 151.478108] ? __pfx_worker_thread+0x10/0x10
[ 151.478430] kthread+0xfe/0x240
[ 151.478671] ? __pfx_kthread+0x10/0x10
[ 151.478955] ? __pfx_kthread+0x10/0x10
[ 151.479240] ret_from_fork+0x208/0x270
[ 151.479523] ? __pfx_kthread+0x10/0x10
[ 151.479806] ret_from_fork_asm+0x1a/0x30
[ 151.480103] </TASK>

Fixes: e1168f09b331 ("RDMA/iwcm: Simplify cm_event_handler()")
Signed-off-by: Jacob Moroni <jmoroni@google.com>
Link: https://patch.msgid.link/20260112020006.1352438-1-jmoroni@google.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Leon Romanovsky <leon@kernel.org>

authored by

Jacob Moroni and committed by
Leon Romanovsky
7874eeac 0beefd0e

+21 -36
+21 -35
drivers/infiniband/core/iwcm.c
···
 struct iwcm_work {
 	struct work_struct work;
 	struct iwcm_id_private *cm_id;
-	struct list_head list;
 	struct iw_cm_event event;
 	struct list_head free_list;
 };
···
 			return -ENOMEM;
 		}
 		work->cm_id = cm_id_priv;
-		INIT_LIST_HEAD(&work->list);
 		put_work(work);
 	}
 	return 0;
···
 static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
 {
 	if (refcount_dec_and_test(&cm_id_priv->refcount)) {
-		BUG_ON(!list_empty(&cm_id_priv->work_list));
 		free_cm_id(cm_id_priv);
 		return true;
 	}
···
 	refcount_set(&cm_id_priv->refcount, 1);
 	init_waitqueue_head(&cm_id_priv->connect_wait);
 	init_completion(&cm_id_priv->destroy_comp);
-	INIT_LIST_HEAD(&cm_id_priv->work_list);
 	INIT_LIST_HEAD(&cm_id_priv->work_free_list);
 
 	return &cm_id_priv->id;
···
 }
 
 /*
- * Process events on the work_list for the cm_id. If the callback
- * function requests that the cm_id be deleted, a flag is set in the
- * cm_id flags to indicate that when the last reference is
- * removed, the cm_id is to be destroyed. This is necessary to
- * distinguish between an object that will be destroyed by the app
- * thread asleep on the destroy_comp list vs. an object destroyed
- * here synchronously when the last reference is removed.
+ * Process events for the cm_id. If the callback function requests
+ * that the cm_id be deleted, a flag is set in the cm_id flags to
+ * indicate that when the last reference is removed, the cm_id is
+ * to be destroyed. This is necessary to distinguish between an
+ * object that will be destroyed by the app thread asleep on the
+ * destroy_comp list vs. an object destroyed here synchronously
+ * when the last reference is removed.
  */
 static void cm_work_handler(struct work_struct *_work)
 {
···
 	int ret = 0;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	while (!list_empty(&cm_id_priv->work_list)) {
-		work = list_first_entry(&cm_id_priv->work_list,
-					struct iwcm_work, list);
-		list_del_init(&work->list);
-		levent = work->event;
-		put_work(work);
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-
-		if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
-			ret = process_event(cm_id_priv, &levent);
-			if (ret) {
-				destroy_cm_id(&cm_id_priv->id);
-				WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
-			}
-		} else
-			pr_debug("dropping event %d\n", levent.event);
-		if (iwcm_deref_id(cm_id_priv))
-			return;
-		spin_lock_irqsave(&cm_id_priv->lock, flags);
-	}
+	levent = work->event;
+	put_work(work);
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+		ret = process_event(cm_id_priv, &levent);
+		if (ret) {
+			destroy_cm_id(&cm_id_priv->id);
+			WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+		}
+	} else
+		pr_debug("dropping event %d\n", levent.event);
+	if (iwcm_deref_id(cm_id_priv))
+		return;
 }
 
 /*
  * This function is called on interrupt context. Schedule events on
  * the iwcm_wq thread to allow callback functions to downcall into
- * the CM and/or block. Events are queued to a per-CM_ID
- * work_list. If this is the first event on the work_list, the work
- * element is also queued on the iwcm_wq thread.
+ * the CM and/or block.
  *
  * Each event holds a reference on the cm_id. Until the last posted
  * event has been delivered and processed, the cm_id cannot be
···
 	}
 
 	refcount_inc(&cm_id_priv->refcount);
-	list_add_tail(&work->list, &cm_id_priv->work_list);
 	queue_work(iwcm_wq, &work->work);
 out:
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-1
drivers/infiniband/core/iwcm.h
···
 	struct ib_qp *qp;
 	struct completion destroy_comp;
 	wait_queue_head_t connect_wait;
-	struct list_head work_list;
 	spinlock_t lock;
 	refcount_t refcount;
 	struct list_head work_free_list;