Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP

SCX_CALL_OP{,_RET}() unconditionally clears scx_locked_rq_state to NULL on
exit. That is correct at the top level, but ops can recurse via
scx_bpf_sub_dispatch(): a parent's ops.dispatch() calls the helper, which
invokes the child's ops.dispatch() under another SCX_CALL_OP(). When the
inner call returns, the NULL clobbers the outer call's state. The parent's
BPF code then calls kfuncs such as scx_bpf_cpuperf_set(), which see
scx_locked_rq() == NULL and try to re-acquire the already-held rq lock.
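
The failure mode is easy to reproduce outside the kernel. Below is a minimal
userspace sketch of the pre-fix pattern - stand-in names, not kernel code:
locked_rq models the per-CPU scx_locked_rq_state and CALL_OP models the old
SCX_CALL_OP(). The assert in the outer op fires because the inner exit path
cleared the shared state instead of restoring it:

    #include <assert.h>
    #include <stddef.h>

    /* Stand-in for the per-CPU scx_locked_rq_state. */
    static void *locked_rq;

    /* Pre-fix shape of SCX_CALL_OP(): set on entry, clear to NULL on exit. */
    #define CALL_OP(rq, op)                         \
            do {                                    \
                    if (rq)                         \
                            locked_rq = (rq);       \
                    op();                           \
                    if (rq)                         \
                            locked_rq = NULL;       \
            } while (0)

    static int outer_rq, inner_rq;  /* dummies standing in for struct rq */

    static void inner_dispatch(void) { }

    /* Models a parent ops.dispatch() recursing via scx_bpf_sub_dispatch(). */
    static void outer_dispatch(void)
    {
            CALL_OP(&inner_rq, inner_dispatch);

            /*
             * Back in the outer op: the inner exit cleared locked_rq to
             * NULL, so a kfunc consulting it would wrongly re-acquire
             * outer_rq.
             */
            assert(locked_rq == &outer_rq);   /* fires with the pre-fix macro */
    }

    int main(void)
    {
            CALL_OP(&outer_rq, outer_dispatch);
            return 0;
    }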

Snapshot scx_locked_rq_state on entry and restore it on exit. Rename the rq
parameter to locked_rq across all SCX_CALL_OP* macros so the snapshot local
can be typed as 'struct rq *' without colliding with the parameter token in
the expansion. SCX_CALL_OP_TASK{,_RET}() and SCX_CALL_OP_2TASKS_RET() funnel
through the two base macros and inherit the fix.
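
The rename is not cosmetic: macro substitution replaces every occurrence of a
parameter token, including one that happens to sit inside a type name. A tiny
compile-time illustration - hypothetical SNAPSHOT* macros, not the kernel's:

    struct rq { int cpu; };

    /*
     * With the parameter named 'rq', the 'rq' token inside 'struct rq' is
     * substituted along with everything else. SNAPSHOT_BAD(this_rq()) would
     * expand to
     *
     *         struct this_rq() *__prev_locked_rq = (this_rq());
     *
     * a syntax error. (It only works by accident when the argument is a
     * variable literally named 'rq'.)
     */
    #define SNAPSHOT_BAD(rq)        struct rq *__prev_locked_rq = (rq)

    /* Renaming the parameter leaves the 'struct rq' type token untouched. */
    #define SNAPSHOT(locked_rq)     struct rq *__prev_locked_rq = (locked_rq)

    static struct rq runqueue;

    static void demo(void)
    {
            SNAPSHOT(&runqueue);    /* expands cleanly */
            (void)__prev_locked_rq;
    }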

Fixes: 4f8b122848db ("sched_ext: Add basic building blocks for nested sub-scheduler dispatching")
Reported-by: Chris Mason <clm@meta.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>

+30 -19
kernel/sched/ext.c
···
 	__this_cpu_write(scx_locked_rq_state, rq);
 }
 
-#define SCX_CALL_OP(sch, op, rq, args...)				\
+/*
+ * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
+ * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
+ */
+#define SCX_CALL_OP(sch, op, locked_rq, args...)			\
 do {									\
-	if (rq)								\
-		update_locked_rq(rq);					\
+	struct rq *__prev_locked_rq;					\
+									\
+	if (locked_rq) {						\
+		__prev_locked_rq = scx_locked_rq();			\
+		update_locked_rq(locked_rq);				\
+	}								\
 	(sch)->ops.op(args);						\
-	if (rq)								\
-		update_locked_rq(NULL);					\
+	if (locked_rq)							\
+		update_locked_rq(__prev_locked_rq);			\
 } while (0)
 
-#define SCX_CALL_OP_RET(sch, op, rq, args...)				\
+#define SCX_CALL_OP_RET(sch, op, locked_rq, args...)			\
 ({									\
+	struct rq *__prev_locked_rq;					\
 	__typeof__((sch)->ops.op(args)) __ret;				\
 									\
-	if (rq)								\
-		update_locked_rq(rq);					\
+	if (locked_rq) {						\
+		__prev_locked_rq = scx_locked_rq();			\
+		update_locked_rq(locked_rq);				\
+	}								\
 	__ret = (sch)->ops.op(args);					\
-	if (rq)								\
-		update_locked_rq(NULL);					\
+	if (locked_rq)							\
+		update_locked_rq(__prev_locked_rq);			\
 	__ret;								\
 })
···
  * those subject tasks.
  *
  * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
- * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
- * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
- * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
+ * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
+ * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
  *
  * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
  * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
  * while a previous one is still in progress.
  */
-#define SCX_CALL_OP_TASK(sch, op, rq, task, args...)			\
+#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...)		\
 do {									\
 	WARN_ON_ONCE(current->scx.kf_tasks[0]);				\
 	current->scx.kf_tasks[0] = task;				\
-	SCX_CALL_OP((sch), op, rq, task, ##args);			\
+	SCX_CALL_OP((sch), op, locked_rq, task, ##args);		\
 	current->scx.kf_tasks[0] = NULL;				\
 } while (0)
 
-#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...)		\
+#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...)	\
 ({									\
 	__typeof__((sch)->ops.op(task, ##args)) __ret;			\
 	WARN_ON_ONCE(current->scx.kf_tasks[0]);				\
 	current->scx.kf_tasks[0] = task;				\
-	__ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args);		\
+	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args);	\
 	current->scx.kf_tasks[0] = NULL;				\
 	__ret;								\
 })
 
-#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...)	\
+#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \
 ({									\
 	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;		\
 	WARN_ON_ONCE(current->scx.kf_tasks[0]);				\
 	current->scx.kf_tasks[0] = task0;				\
 	current->scx.kf_tasks[1] = task1;				\
-	__ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args);	\
+	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \
 	current->scx.kf_tasks[0] = NULL;				\
 	current->scx.kf_tasks[1] = NULL;				\
 	__ret;								\
 })
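
For completeness, here is the earlier userspace sketch reworked into the
patched shape - again stand-in names, not kernel code. Because each exit
restores the previous value instead of writing NULL, the state unwinds
correctly at any nesting depth and both asserts pass:

    #include <assert.h>
    #include <stddef.h>

    static void *locked_rq;         /* stand-in per-CPU state */

    /* Patched shape: snapshot the previous value on entry, restore on exit. */
    #define CALL_OP_FIXED(new_rq, op)                       \
            do {                                            \
                    void *__prev_locked_rq = NULL;          \
                                                            \
                    if (new_rq) {                           \
                            __prev_locked_rq = locked_rq;   \
                            locked_rq = (new_rq);           \
                    }                                       \
                    op();                                   \
                    if (new_rq)                             \
                            locked_rq = __prev_locked_rq;   \
            } while (0)

    static int outer_rq, inner_rq;

    static void inner_dispatch(void) { }

    static void outer_dispatch(void)
    {
            CALL_OP_FIXED(&inner_rq, inner_dispatch);
            assert(locked_rq == &outer_rq); /* holds: inner exit restored it */
    }

    int main(void)
    {
            CALL_OP_FIXED(&outer_rq, outer_dispatch);
            assert(locked_rq == NULL);      /* top-level exit restores NULL */
            return 0;
    }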