Merge tag 'landlock-7.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux

+19 -4

Documentation/userspace-api/landlock.rst

··· 8 8 ===================================== 9 9 10 10 :Author: Mickaël Salaün 11 - :Date: January 2026 11 + :Date: March 2026 12 12 13 13 The goal of Landlock is to enable restriction of ambient rights (e.g. global 14 14 filesystem or network access) for a set of processes. Because Landlock ··· 197 197 198 198 .. code-block:: c 199 199 200 - __u32 restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; 201 - if (abi < 7) { 202 - /* Clear logging flags unsupported before ABI 7. */ 200 + __u32 restrict_flags = 201 + LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON | 202 + LANDLOCK_RESTRICT_SELF_TSYNC; 203 + switch (abi) { 204 + case 1 ... 6: 205 + /* Removes logging flags for ABI < 7 */ 203 206 restrict_flags &= ~(LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF | 204 207 LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON | 205 208 LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF); 209 + __attribute__((fallthrough)); 210 + case 7: 211 + /* 212 + * Removes multithreaded enforcement flag for ABI < 8 213 + * 214 + * WARNING: Without this flag, calling landlock_restrict_self(2) is 215 + * only equivalent if the calling process is single-threaded. Below 216 + * ABI v8 (and as of ABI v8, when not using this flag), a Landlock 217 + * policy would only be enforced for the calling thread and its 218 + * children (and not for all threads, including parents and siblings). 219 + */ 220 + restrict_flags &= ~LANDLOCK_RESTRICT_SELF_TSYNC; 206 221 } 207 222 208 223 The next step is to restrict the current thread from gaining more privileges

+3 -2

samples/landlock/sandboxer.c

··· 299 299 300 300 /* clang-format on */ 301 301 302 - #define LANDLOCK_ABI_LAST 7 302 + #define LANDLOCK_ABI_LAST 8 303 303 304 304 #define XSTR(s) #s 305 305 #define STR(s) XSTR(s) ··· 436 436 /* Removes LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON for ABI < 7 */ 437 437 supported_restrict_flags &= 438 438 ~LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; 439 - 439 + __attribute__((fallthrough)); 440 + case 7: 440 441 /* Must be printed for any ABI < LANDLOCK_ABI_LAST. */ 441 442 fprintf(stderr, 442 443 "Hint: You should update the running kernel "

+1 -2

security/landlock/domain.c

··· 94 94 * allocate with GFP_KERNEL_ACCOUNT because it is independent from the 95 95 * caller. 96 96 */ 97 - details = 98 - kzalloc_flex(*details, exe_path, path_size); 97 + details = kzalloc_flex(*details, exe_path, path_size); 99 98 if (!details) 100 99 return ERR_PTR(-ENOMEM); 101 100

+4 -5

security/landlock/ruleset.c

··· 32 32 { 33 33 struct landlock_ruleset *new_ruleset; 34 34 35 - new_ruleset = 36 - kzalloc_flex(*new_ruleset, access_masks, num_layers, 37 - GFP_KERNEL_ACCOUNT); 35 + new_ruleset = kzalloc_flex(*new_ruleset, access_masks, num_layers, 36 + GFP_KERNEL_ACCOUNT); 38 37 if (!new_ruleset) 39 38 return ERR_PTR(-ENOMEM); 40 39 refcount_set(&new_ruleset->usage, 1); ··· 558 559 if (IS_ERR(new_dom)) 559 560 return new_dom; 560 561 561 - new_dom->hierarchy = kzalloc_obj(*new_dom->hierarchy, 562 - GFP_KERNEL_ACCOUNT); 562 + new_dom->hierarchy = 563 + kzalloc_obj(*new_dom->hierarchy, GFP_KERNEL_ACCOUNT); 563 564 if (!new_dom->hierarchy) 564 565 return ERR_PTR(-ENOMEM); 565 566

+73 -19

security/landlock/tsync.c

··· 203 203 return ctx; 204 204 } 205 205 206 + /** 207 + * tsync_works_trim - Put the last tsync_work element 208 + * 209 + * @s: TSYNC works to trim. 210 + * 211 + * Put the last task and decrement the size of @s. 212 + * 213 + * This helper does not cancel a running task, but just reset the last element 214 + * to zero. 215 + */ 216 + static void tsync_works_trim(struct tsync_works *s) 217 + { 218 + struct tsync_work *ctx; 219 + 220 + if (WARN_ON_ONCE(s->size <= 0)) 221 + return; 222 + 223 + ctx = s->works[s->size - 1]; 224 + 225 + /* 226 + * For consistency, remove the task from ctx so that it does not look like 227 + * we handed it a task_work. 228 + */ 229 + put_task_struct(ctx->task); 230 + *ctx = (typeof(*ctx)){}; 231 + 232 + /* 233 + * Cancel the tsync_works_provide() change to recycle the reserved memory 234 + * for the next thread, if any. This also ensures that cancel_tsync_works() 235 + * and tsync_works_release() do not see any NULL task pointers. 236 + */ 237 + s->size--; 238 + } 239 + 206 240 /* 207 241 * tsync_works_grow_by - preallocates space for n more contexts in s 208 242 * ··· 290 256 * tsync_works_contains - checks for presence of task in s 291 257 */ 292 258 static bool tsync_works_contains_task(const struct tsync_works *s, 293 - struct task_struct *task) 259 + const struct task_struct *task) 294 260 { 295 261 size_t i; 296 262 297 263 for (i = 0; i < s->size; i++) 298 264 if (s->works[i]->task == task) 299 265 return true; 266 + 300 267 return false; 301 268 } 302 269 ··· 311 276 size_t i; 312 277 313 278 for (i = 0; i < s->size; i++) { 314 - if (!s->works[i]->task) 279 + if (WARN_ON_ONCE(!s->works[i]->task)) 315 280 continue; 316 281 317 282 put_task_struct(s->works[i]->task); ··· 319 284 320 285 for (i = 0; i < s->capacity; i++) 321 286 kfree(s->works[i]); 287 + 322 288 kfree(s->works); 323 289 s->works = NULL; 324 290 s->size = 0; ··· 331 295 */ 332 296 static size_t count_additional_threads(const struct tsync_works *works) 333 297 { 334 - struct task_struct *thread, *caller; 298 + const struct task_struct *caller, *thread; 335 299 size_t n = 0; 336 300 337 301 caller = current; ··· 370 334 struct tsync_shared_context *shared_ctx) 371 335 { 372 336 int err; 373 - struct task_struct *thread, *caller; 337 + const struct task_struct *caller; 338 + struct task_struct *thread; 374 339 struct tsync_work *ctx; 375 340 bool found_more_threads = false; 376 341 ··· 416 379 417 380 init_task_work(&ctx->work, restrict_one_thread_callback); 418 381 err = task_work_add(thread, &ctx->work, TWA_SIGNAL); 419 - if (err) { 382 + if (unlikely(err)) { 420 383 /* 421 384 * task_work_add() only fails if the task is about to exit. We 422 385 * checked that earlier, but it can happen as a race. Resume 423 386 * without setting an error, as the task is probably gone in the 424 - * next loop iteration. For consistency, remove the task from ctx 425 - * so that it does not look like we handed it a task_work. 387 + * next loop iteration. 426 388 */ 427 - put_task_struct(ctx->task); 428 - ctx->task = NULL; 389 + tsync_works_trim(works); 429 390 430 391 atomic_dec(&shared_ctx->num_preparing); 431 392 atomic_dec(&shared_ctx->num_unfinished); ··· 441 406 * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two 442 407 * completions if needed, as if the task was never scheduled. 443 408 */ 444 - static void cancel_tsync_works(struct tsync_works *works, 409 + static void cancel_tsync_works(const struct tsync_works *works, 445 410 struct tsync_shared_context *shared_ctx) 446 411 { 447 - int i; 412 + size_t i; 448 413 449 414 for (i = 0; i < works->size; i++) { 415 + if (WARN_ON_ONCE(!works->works[i]->task)) 416 + continue; 417 + 450 418 if (!task_work_cancel(works->works[i]->task, 451 419 &works->works[i]->work)) 452 420 continue; ··· 484 446 shared_ctx.old_cred = old_cred; 485 447 shared_ctx.new_cred = new_cred; 486 448 shared_ctx.set_no_new_privs = task_no_new_privs(current); 449 + 450 + /* 451 + * Serialize concurrent TSYNC operations to prevent deadlocks when 452 + * multiple threads call landlock_restrict_self() simultaneously. 453 + * If the lock is already held, we gracefully yield by restarting the 454 + * syscall. This allows the current thread to process pending 455 + * task_works before retrying. 456 + */ 457 + if (!down_write_trylock(&current->signal->exec_update_lock)) 458 + return restart_syscall(); 487 459 488 460 /* 489 461 * We schedule a pseudo-signal task_work for each of the calling task's ··· 575 527 -ERESTARTNOINTR); 576 528 577 529 /* 578 - * Cancel task works for tasks that did not start running yet, 579 - * and decrement all_prepared and num_unfinished accordingly. 530 + * Opportunistic improvement: try to cancel task 531 + * works for tasks that did not start running 532 + * yet. We do not have a guarantee that it 533 + * cancels any of the enqueued task works 534 + * because task_work_run() might already have 535 + * dequeued them. 580 536 */ 581 537 cancel_tsync_works(&works, &shared_ctx); 582 538 583 539 /* 584 - * The remaining task works have started running, so waiting for 585 - * their completion will finish. 540 + * Break the loop with error. The cleanup code 541 + * after the loop unblocks the remaining 542 + * task_works. 586 543 */ 587 - wait_for_completion(&shared_ctx.all_prepared); 544 + break; 588 545 } 589 546 } 590 547 } while (found_more_threads && 591 548 !atomic_read(&shared_ctx.preparation_error)); 592 549 593 550 /* 594 - * We now have all sibling threads blocking and in "prepared" state in the 595 - * task work. Ask all threads to commit. 551 + * We now have either (a) all sibling threads blocking and in "prepared" 552 + * state in the task work, or (b) the preparation error is set. Ask all 553 + * threads to commit (or abort). 596 554 */ 597 555 complete_all(&shared_ctx.ready_to_commit); 598 556 ··· 610 556 wait_for_completion(&shared_ctx.all_finished); 611 557 612 558 tsync_works_release(&works); 613 - 559 + up_write(&current->signal->exec_update_lock); 614 560 return atomic_read(&shared_ctx.preparation_error); 615 561 }

+91 -2

tools/testing/selftests/landlock/tsync_test.c

··· 6 6 */ 7 7 8 8 #define _GNU_SOURCE 9 - #include <pthread.h> 10 - #include <sys/prctl.h> 11 9 #include <linux/landlock.h> 10 + #include <pthread.h> 11 + #include <signal.h> 12 + #include <sys/prctl.h> 12 13 13 14 #include "common.h" 14 15 ··· 155 154 /* Expect that both succeeded. */ 156 155 EXPECT_EQ(0, d[0].result); 157 156 EXPECT_EQ(0, d[1].result); 157 + 158 + EXPECT_EQ(0, close(ruleset_fd)); 159 + } 160 + 161 + static void signal_nop_handler(int sig) 162 + { 163 + } 164 + 165 + struct signaler_data { 166 + pthread_t target; 167 + volatile bool stop; 168 + }; 169 + 170 + static void *signaler_thread(void *data) 171 + { 172 + struct signaler_data *sd = data; 173 + 174 + while (!sd->stop) 175 + pthread_kill(sd->target, SIGUSR1); 176 + 177 + return NULL; 178 + } 179 + 180 + /* 181 + * Number of idle sibling threads. This must be large enough that even on 182 + * machines with many cores, the sibling threads cannot all complete their 183 + * credential preparation in a single parallel wave, otherwise the signaler 184 + * thread has no window to interrupt wait_for_completion_interruptible(). 185 + * 200 threads on a 64-core machine yields ~3 serialized waves, giving the 186 + * tight signal loop enough time to land an interruption. 187 + */ 188 + #define NUM_IDLE_THREADS 200 189 + 190 + /* 191 + * Exercises the tsync interruption and cancellation paths in tsync.c. 192 + * 193 + * When a signal interrupts the calling thread while it waits for sibling 194 + * threads to finish their credential preparation 195 + * (wait_for_completion_interruptible in landlock_restrict_sibling_threads), 196 + * the kernel sets ERESTARTNOINTR, cancels queued task works that have not 197 + * started yet (cancel_tsync_works), then waits for the remaining works to 198 + * finish. On the error return, syscalls.c aborts the prepared credentials. 199 + * The kernel automatically restarts the syscall, so userspace sees success. 200 + */ 201 + TEST(tsync_interrupt) 202 + { 203 + size_t i; 204 + pthread_t threads[NUM_IDLE_THREADS]; 205 + pthread_t signaler; 206 + struct signaler_data sd; 207 + struct sigaction sa = {}; 208 + const int ruleset_fd = create_ruleset(_metadata); 209 + 210 + disable_caps(_metadata); 211 + 212 + /* Install a no-op SIGUSR1 handler so the signal does not kill us. */ 213 + sa.sa_handler = signal_nop_handler; 214 + sigemptyset(&sa.sa_mask); 215 + ASSERT_EQ(0, sigaction(SIGUSR1, &sa, NULL)); 216 + 217 + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); 218 + 219 + for (i = 0; i < NUM_IDLE_THREADS; i++) 220 + ASSERT_EQ(0, pthread_create(&threads[i], NULL, idle, NULL)); 221 + 222 + /* 223 + * Start a signaler thread that continuously sends SIGUSR1 to the 224 + * calling thread. This maximizes the chance of interrupting 225 + * wait_for_completion_interruptible() in the kernel's tsync path. 226 + */ 227 + sd.target = pthread_self(); 228 + sd.stop = false; 229 + ASSERT_EQ(0, pthread_create(&signaler, NULL, signaler_thread, &sd)); 230 + 231 + /* 232 + * The syscall may be interrupted and transparently restarted by the 233 + * kernel (ERESTARTNOINTR). From userspace, it should always succeed. 234 + */ 235 + EXPECT_EQ(0, landlock_restrict_self(ruleset_fd, 236 + LANDLOCK_RESTRICT_SELF_TSYNC)); 237 + 238 + sd.stop = true; 239 + ASSERT_EQ(0, pthread_join(signaler, NULL)); 240 + 241 + for (i = 0; i < NUM_IDLE_THREADS; i++) { 242 + ASSERT_EQ(0, pthread_cancel(threads[i])); 243 + ASSERT_EQ(0, pthread_join(threads[i], NULL)); 244 + } 158 245 159 246 EXPECT_EQ(0, close(ruleset_fd)); 160 247 }

Configure Feed

Configure Feed