Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2#define _GNU_SOURCE
3#include <errno.h>
4#include <fcntl.h>
5#include <limits.h>
6#include <sched.h>
7#include <signal.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <linux/nsfs.h>
12#include <sys/ioctl.h>
13#include <sys/mman.h>
14#include <sys/mount.h>
15#include <sys/socket.h>
16#include <sys/stat.h>
17#include <sys/syscall.h>
18#include <sys/types.h>
19#include <sys/wait.h>
20#include <unistd.h>
21#include "../kselftest_harness.h"
22#include "../pidfd/pidfd.h"
23#include "wrappers.h"
24
25/*
26 * Test listns() error handling with invalid buffer addresses.
27 *
28 * When the buffer pointer is invalid (e.g., crossing page boundaries
29 * into unmapped memory), listns() returns EINVAL.
30 *
31 * This test also creates mount namespaces that get destroyed during
32 * iteration, testing that namespace cleanup happens outside the RCU
33 * read lock.
34 */
35TEST(listns_partial_fault_with_ns_cleanup)
36{
37 void *map;
38 __u64 *ns_ids;
39 ssize_t ret;
40 long page_size;
41 pid_t pid, iter_pid;
42 int pidfds[5];
43 int sv[5][2];
44 int iter_pidfd;
45 int i, status;
46 char c;
47
48 page_size = sysconf(_SC_PAGESIZE);
49 ASSERT_GT(page_size, 0);
50
51 /*
52 * Map two pages:
53 * - First page: readable and writable
54 * - Second page: will be unmapped to trigger EFAULT
55 */
56 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
57 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
58 ASSERT_NE(map, MAP_FAILED);
59
60 /* Unmap the second page */
61 ret = munmap((char *)map + page_size, page_size);
62 ASSERT_EQ(ret, 0);
63
64 /*
65 * Position the buffer pointer so there's room for exactly one u64
66 * before the page boundary. The second u64 would fall into the
67 * unmapped page.
68 */
69 ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
70
71 /*
72 * Create a separate process to run listns() in a loop concurrently
73 * with namespace creation and destruction.
74 */
75 iter_pid = create_child(&iter_pidfd, 0);
76 ASSERT_NE(iter_pid, -1);
77
78 if (iter_pid == 0) {
79 struct ns_id_req req = {
80 .size = sizeof(req),
81 .spare = 0,
82 .ns_id = 0,
83 .ns_type = 0, /* All types */
84 .spare2 = 0,
85 .user_ns_id = 0, /* Global listing */
86 };
87 int iter_ret;
88
89 /*
90 * Loop calling listns() until killed.
91 * The kernel should:
92 * 1. Successfully write the first namespace ID (within valid page)
93 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
94 * 3. Handle concurrent namespace destruction without deadlock
95 */
96 while (1) {
97 iter_ret = sys_listns(&req, ns_ids, 2, 0);
98
99 if (iter_ret == -1 && errno == ENOSYS)
100 _exit(PIDFD_SKIP);
101 }
102 }
103
104 /* Small delay to let iterator start looping */
105 usleep(50000);
106
107 /*
108 * Create several child processes, each in its own mount namespace.
109 * These will be destroyed while the iterator is running listns().
110 */
111 for (i = 0; i < 5; i++) {
112 /* Create socketpair for synchronization */
113 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
114
115 pid = create_child(&pidfds[i], CLONE_NEWNS);
116 ASSERT_NE(pid, -1);
117
118 if (pid == 0) {
119 close(sv[i][0]); /* Close parent end */
120
121 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
122 _exit(1);
123
124 /* Child: create a couple of tmpfs mounts */
125 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
126 _exit(1);
127 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
128 _exit(1);
129
130 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
131 _exit(1);
132 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
133 _exit(1);
134
135 /* Signal parent that setup is complete */
136 if (write_nointr(sv[i][1], "R", 1) != 1)
137 _exit(1);
138
139 /* Wait for parent to signal us to exit */
140 if (read_nointr(sv[i][1], &c, 1) != 1)
141 _exit(1);
142
143 close(sv[i][1]);
144 _exit(0);
145 }
146
147 close(sv[i][1]); /* Close child end */
148 }
149
150 /* Wait for all children to finish setup */
151 for (i = 0; i < 5; i++) {
152 ret = read_nointr(sv[i][0], &c, 1);
153 ASSERT_EQ(ret, 1);
154 ASSERT_EQ(c, 'R');
155 }
156
157 /*
158 * Signal children to exit. This will destroy their mount namespaces
159 * while listns() is iterating the namespace tree.
160 * This tests that cleanup happens outside the RCU read lock.
161 */
162 for (i = 0; i < 5; i++)
163 write_nointr(sv[i][0], "X", 1);
164
165 /* Wait for all mount namespace children to exit and cleanup */
166 for (i = 0; i < 5; i++) {
167 waitpid(-1, NULL, 0);
168 close(sv[i][0]);
169 close(pidfds[i]);
170 }
171
172 /* Kill iterator and wait for it */
173 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
174 ret = waitpid(iter_pid, &status, 0);
175 ASSERT_EQ(ret, iter_pid);
176 close(iter_pidfd);
177
178 /* Should have been killed */
179 ASSERT_TRUE(WIFSIGNALED(status));
180 ASSERT_EQ(WTERMSIG(status), SIGKILL);
181
182 /* Clean up */
183 munmap(map, page_size);
184}
185
186/*
187 * Test listns() error handling when the entire buffer is invalid.
188 * This is a sanity check that basic invalid pointer detection works.
189 */
190TEST(listns_complete_fault)
191{
192 struct ns_id_req req = {
193 .size = sizeof(req),
194 .spare = 0,
195 .ns_id = 0,
196 .ns_type = 0,
197 .spare2 = 0,
198 .user_ns_id = 0,
199 };
200 __u64 *ns_ids;
201 ssize_t ret;
202
203 /* Use a clearly invalid pointer */
204 ns_ids = (__u64 *)0xdeadbeef;
205
206 ret = sys_listns(&req, ns_ids, 10, 0);
207
208 if (ret == -1 && errno == ENOSYS)
209 SKIP(return, "listns() not supported");
210
211 /* Should fail with EFAULT */
212 ASSERT_EQ(ret, -1);
213 ASSERT_EQ(errno, EFAULT);
214}
215
216/*
217 * Test listns() error handling when the buffer is NULL.
218 */
219TEST(listns_null_buffer)
220{
221 struct ns_id_req req = {
222 .size = sizeof(req),
223 .spare = 0,
224 .ns_id = 0,
225 .ns_type = 0,
226 .spare2 = 0,
227 .user_ns_id = 0,
228 };
229 ssize_t ret;
230
231 /* NULL buffer with non-zero count should fail */
232 ret = sys_listns(&req, NULL, 10, 0);
233
234 if (ret == -1 && errno == ENOSYS)
235 SKIP(return, "listns() not supported");
236
237 /* Should fail with EFAULT */
238 ASSERT_EQ(ret, -1);
239 ASSERT_EQ(errno, EFAULT);
240}
241
242/*
243 * Test listns() with a buffer that becomes invalid mid-iteration
244 * (after several successful writes), combined with mount namespace
245 * destruction to test RCU cleanup logic.
246 */
247TEST(listns_late_fault_with_ns_cleanup)
248{
249 void *map;
250 __u64 *ns_ids;
251 ssize_t ret;
252 long page_size;
253 pid_t pid, iter_pid;
254 int pidfds[10];
255 int sv[10][2];
256 int iter_pidfd;
257 int i, status;
258 char c;
259
260 page_size = sysconf(_SC_PAGESIZE);
261 ASSERT_GT(page_size, 0);
262
263 /* Map two pages */
264 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
265 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
266 ASSERT_NE(map, MAP_FAILED);
267
268 /* Unmap the second page */
269 ret = munmap((char *)map + page_size, page_size);
270 ASSERT_EQ(ret, 0);
271
272 /*
273 * Position buffer so we can write several u64s successfully
274 * before hitting the page boundary.
275 */
276 ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
277
278 /*
279 * Create a separate process to run listns() concurrently.
280 */
281 iter_pid = create_child(&iter_pidfd, 0);
282 ASSERT_NE(iter_pid, -1);
283
284 if (iter_pid == 0) {
285 struct ns_id_req req = {
286 .size = sizeof(req),
287 .spare = 0,
288 .ns_id = 0,
289 .ns_type = 0,
290 .spare2 = 0,
291 .user_ns_id = 0,
292 };
293 int iter_ret;
294
295 /*
296 * Loop calling listns() until killed.
297 * Request 10 namespace IDs while namespaces are being destroyed.
298 * This tests:
299 * 1. EFAULT handling when buffer becomes invalid
300 * 2. Namespace cleanup outside RCU read lock during iteration
301 */
302 while (1) {
303 iter_ret = sys_listns(&req, ns_ids, 10, 0);
304
305 if (iter_ret == -1 && errno == ENOSYS)
306 _exit(PIDFD_SKIP);
307 }
308 }
309
310 /* Small delay to let iterator start looping */
311 usleep(50000);
312
313 /*
314 * Create more children with mount namespaces to increase the
315 * likelihood that namespace cleanup happens during iteration.
316 */
317 for (i = 0; i < 10; i++) {
318 /* Create socketpair for synchronization */
319 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
320
321 pid = create_child(&pidfds[i], CLONE_NEWNS);
322 ASSERT_NE(pid, -1);
323
324 if (pid == 0) {
325 close(sv[i][0]); /* Close parent end */
326
327 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
328 _exit(1);
329
330 /* Child: create tmpfs mounts */
331 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
332 _exit(1);
333 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
334 _exit(1);
335
336 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
337 _exit(1);
338 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
339 _exit(1);
340
341 /* Signal parent that setup is complete */
342 if (write_nointr(sv[i][1], "R", 1) != 1)
343 _exit(1);
344
345 /* Wait for parent to signal us to exit */
346 if (read_nointr(sv[i][1], &c, 1) != 1)
347 _exit(1);
348
349 close(sv[i][1]);
350 _exit(0);
351 }
352
353 close(sv[i][1]); /* Close child end */
354 }
355
356 /* Wait for all children to finish setup */
357 for (i = 0; i < 10; i++) {
358 ret = read_nointr(sv[i][0], &c, 1);
359 ASSERT_EQ(ret, 1);
360 ASSERT_EQ(c, 'R');
361 }
362
363 /* Kill half the children */
364 for (i = 0; i < 5; i++)
365 write_nointr(sv[i][0], "X", 1);
366
367 /* Small delay to let some exit */
368 usleep(10000);
369
370 /* Kill remaining children */
371 for (i = 5; i < 10; i++)
372 write_nointr(sv[i][0], "X", 1);
373
374 /* Wait for all children and cleanup */
375 for (i = 0; i < 10; i++) {
376 waitpid(-1, NULL, 0);
377 close(sv[i][0]);
378 close(pidfds[i]);
379 }
380
381 /* Kill iterator and wait for it */
382 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
383 ret = waitpid(iter_pid, &status, 0);
384 ASSERT_EQ(ret, iter_pid);
385 close(iter_pidfd);
386
387 /* Should have been killed */
388 ASSERT_TRUE(WIFSIGNALED(status));
389 ASSERT_EQ(WTERMSIG(status), SIGKILL);
390
391 /* Clean up */
392 munmap(map, page_size);
393}
394
395/*
396 * Test specifically focused on mount namespace cleanup during EFAULT.
397 * Filter for mount namespaces only.
398 */
399TEST(listns_mnt_ns_cleanup_on_fault)
400{
401 void *map;
402 __u64 *ns_ids;
403 ssize_t ret;
404 long page_size;
405 pid_t pid, iter_pid;
406 int pidfds[8];
407 int sv[8][2];
408 int iter_pidfd;
409 int i, status;
410 char c;
411
412 page_size = sysconf(_SC_PAGESIZE);
413 ASSERT_GT(page_size, 0);
414
415 /* Set up partial fault buffer */
416 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
417 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
418 ASSERT_NE(map, MAP_FAILED);
419
420 ret = munmap((char *)map + page_size, page_size);
421 ASSERT_EQ(ret, 0);
422
423 /* Position for 3 successful writes, then fault */
424 ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
425
426 /*
427 * Create a separate process to run listns() concurrently.
428 */
429 iter_pid = create_child(&iter_pidfd, 0);
430 ASSERT_NE(iter_pid, -1);
431
432 if (iter_pid == 0) {
433 struct ns_id_req req = {
434 .size = sizeof(req),
435 .spare = 0,
436 .ns_id = 0,
437 .ns_type = CLONE_NEWNS, /* Only mount namespaces */
438 .spare2 = 0,
439 .user_ns_id = 0,
440 };
441 int iter_ret;
442
443 /*
444 * Loop calling listns() until killed.
445 * Call listns() to race with namespace destruction.
446 */
447 while (1) {
448 iter_ret = sys_listns(&req, ns_ids, 10, 0);
449
450 if (iter_ret == -1 && errno == ENOSYS)
451 _exit(PIDFD_SKIP);
452 }
453 }
454
455 /* Small delay to let iterator start looping */
456 usleep(50000);
457
458 /* Create children with mount namespaces */
459 for (i = 0; i < 8; i++) {
460 /* Create socketpair for synchronization */
461 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
462
463 pid = create_child(&pidfds[i], CLONE_NEWNS);
464 ASSERT_NE(pid, -1);
465
466 if (pid == 0) {
467 close(sv[i][0]); /* Close parent end */
468
469 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
470 _exit(1);
471
472 /* Do some mount operations to make cleanup more interesting */
473 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
474 _exit(1);
475 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
476 _exit(1);
477
478 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
479 _exit(1);
480 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
481 _exit(1);
482
483 /* Signal parent that setup is complete */
484 if (write_nointr(sv[i][1], "R", 1) != 1)
485 _exit(1);
486
487 /* Wait for parent to signal us to exit */
488 if (read_nointr(sv[i][1], &c, 1) != 1)
489 _exit(1);
490
491 close(sv[i][1]);
492 _exit(0);
493 }
494
495 close(sv[i][1]); /* Close child end */
496 }
497
498 /* Wait for all children to finish setup */
499 for (i = 0; i < 8; i++) {
500 ret = read_nointr(sv[i][0], &c, 1);
501 ASSERT_EQ(ret, 1);
502 ASSERT_EQ(c, 'R');
503 }
504
505 /* Kill children to trigger namespace destruction during iteration */
506 for (i = 0; i < 8; i++)
507 write_nointr(sv[i][0], "X", 1);
508
509 /* Wait for children and cleanup */
510 for (i = 0; i < 8; i++) {
511 waitpid(-1, NULL, 0);
512 close(sv[i][0]);
513 close(pidfds[i]);
514 }
515
516 /* Kill iterator and wait for it */
517 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
518 ret = waitpid(iter_pid, &status, 0);
519 ASSERT_EQ(ret, iter_pid);
520 close(iter_pidfd);
521
522 /* Should have been killed */
523 ASSERT_TRUE(WIFSIGNALED(status));
524 ASSERT_EQ(WTERMSIG(status), SIGKILL);
525
526 munmap(map, page_size);
527}
528
529TEST_HARNESS_MAIN