Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

workqueue: Add stall detector sample module

Add a sample module under samples/workqueue/stall_detector/ that
reproduces a workqueue stall caused by PF_WQ_WORKER misuse. The
module queues a work item which, while running, queues a second work
item on the same per-CPU pool, then clears PF_WQ_WORKER and sleeps in
wait_event_idle(). This hides the worker from the concurrency manager
and stalls the second work item until the module is unloaded.

This is useful for testing the workqueue watchdog stall diagnostics.

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

authored by

Breno Leitao and committed by
Tejun Heo
9e83d510 8823eaef

+99
+1
samples/workqueue/stall_detector/Makefile
# Kbuild: build the wq_stall sample (wq_stall.c) as a loadable module.
obj-m += wq_stall.o
+98
samples/workqueue/stall_detector/wq_stall.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * wq_stall - Test module for the workqueue stall detector. 4 + * 5 + * Deliberately creates a workqueue stall so the watchdog fires and 6 + * prints diagnostic output. Useful for verifying that the stall 7 + * detector correctly identifies stuck workers and produces useful 8 + * backtraces. 9 + * 10 + * The stall is triggered by clearing PF_WQ_WORKER before sleeping, 11 + * which hides the worker from the concurrency manager. A second 12 + * work item queued on the same pool then sits in the worklist with 13 + * no worker available to process it. 14 + * 15 + * After ~30s the workqueue watchdog fires: 16 + * BUG: workqueue lockup - pool cpus=N ... 17 + * 18 + * Build: 19 + * make -C <kernel tree> M=samples/workqueue/stall_detector modules 20 + * 21 + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 22 + * Copyright (c) 2026 Breno Leitao <leitao@debian.org> 23 + */ 24 + 25 + #include <linux/module.h> 26 + #include <linux/workqueue.h> 27 + #include <linux/wait.h> 28 + #include <linux/atomic.h> 29 + #include <linux/sched.h> 30 + 31 + static DECLARE_WAIT_QUEUE_HEAD(stall_wq_head); 32 + static atomic_t wake_condition = ATOMIC_INIT(0); 33 + static struct work_struct stall_work1; 34 + static struct work_struct stall_work2; 35 + 36 + static void stall_work2_fn(struct work_struct *work) 37 + { 38 + pr_info("wq_stall: second work item finally ran\n"); 39 + } 40 + 41 + static void stall_work1_fn(struct work_struct *work) 42 + { 43 + pr_info("wq_stall: first work item running on cpu %d\n", 44 + raw_smp_processor_id()); 45 + 46 + /* 47 + * Queue second item while we're still counted as running 48 + * (pool->nr_running > 0). Since schedule_work() on a per-CPU 49 + * workqueue targets raw_smp_processor_id(), item 2 lands on the 50 + * same pool. __queue_work -> kick_pool -> need_more_worker() 51 + * sees nr_running > 0 and does NOT wake a new worker. 
52 + */ 53 + schedule_work(&stall_work2); 54 + 55 + /* 56 + * Hide from the workqueue concurrency manager. Without 57 + * PF_WQ_WORKER, schedule() won't call wq_worker_sleeping(), 58 + * so nr_running is never decremented and no replacement 59 + * worker is created. Item 2 stays stuck in pool->worklist. 60 + */ 61 + current->flags &= ~PF_WQ_WORKER; 62 + 63 + pr_info("wq_stall: entering wait_event_idle (PF_WQ_WORKER cleared)\n"); 64 + pr_info("wq_stall: expect 'BUG: workqueue lockup' in ~30-60s\n"); 65 + wait_event_idle(stall_wq_head, atomic_read(&wake_condition) != 0); 66 + 67 + /* Restore so process_one_work() cleanup works correctly */ 68 + current->flags |= PF_WQ_WORKER; 69 + pr_info("wq_stall: woke up, PF_WQ_WORKER restored\n"); 70 + } 71 + 72 + static int __init wq_stall_init(void) 73 + { 74 + pr_info("wq_stall: loading\n"); 75 + 76 + INIT_WORK(&stall_work1, stall_work1_fn); 77 + INIT_WORK(&stall_work2, stall_work2_fn); 78 + schedule_work(&stall_work1); 79 + 80 + return 0; 81 + } 82 + 83 + static void __exit wq_stall_exit(void) 84 + { 85 + pr_info("wq_stall: unloading\n"); 86 + atomic_set(&wake_condition, 1); 87 + wake_up(&stall_wq_head); 88 + flush_work(&stall_work1); 89 + flush_work(&stall_work2); 90 + pr_info("wq_stall: all work flushed, module unloaded\n"); 91 + } 92 + 93 + module_init(wq_stall_init); 94 + module_exit(wq_stall_exit); 95 + 96 + MODULE_LICENSE("GPL"); 97 + MODULE_DESCRIPTION("Reproduce workqueue stall caused by PF_WQ_WORKER misuse"); 98 + MODULE_AUTHOR("Breno Leitao <leitao@debian.org>");