Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'vfs-6.14-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:
"It was reported that the acct(2) system call can be used to trigger a
NULL deref in cases where it is set to write to a file that triggers
an internal lookup.

This can happen, e.g., when pointing acct(2) to /sys/power/resume. At
the point where the write to this file happens the calling task
has already exited and called exit_fs() but an internal lookup might
be triggered through lookup_bdev(). This may trigger a NULL-deref when
accessing current->fs.

Reorganize the code so that the final write happens from the
workqueue but with the caller's credentials. This preserves the
(strange) permission model and has almost no regression risk.

Also block access to kernel internal filesystems as well as procfs and
sysfs in the first place.

Various fixes for netfslib:

- Fix a number of read-retry hangs, including:

- Incorrect getting/putting of references on subreqs as we retry
them

- Failure to track whether a last old subrequest in a retried set
is superfluous

- Inconsistency in the usage of wait queues used for subrequests
(i.e. using clear_and_wake_up_bit() whilst waiting on a private
waitqueue)

- Add stats counters for retries and publish in /proc/fs/netfs/stats.
This is not a fix per se, but is useful in debugging and shouldn't
otherwise change the operation of the code

- Fix the ordering of queuing subrequests with respect to setting the
request flag that says we've now queued them all"

* tag 'vfs-6.14-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all subreqs queued
netfs: Add retry stat counters
netfs: Fix a number of read-retry hangs
acct: block access to kernel internal filesystems
acct: perform last write from workqueue

+155 -71
+14 -7
fs/netfs/buffered_read.c
··· 155 155 netfs_cache_read_terminated, subreq); 156 156 } 157 157 158 - static void netfs_issue_read(struct netfs_io_request *rreq, 159 - struct netfs_io_subrequest *subreq) 158 + static void netfs_queue_read(struct netfs_io_request *rreq, 159 + struct netfs_io_subrequest *subreq, 160 + bool last_subreq) 160 161 { 161 162 struct netfs_io_stream *stream = &rreq->io_streams[0]; 162 163 ··· 178 177 } 179 178 } 180 179 181 - spin_unlock(&rreq->lock); 180 + if (last_subreq) { 181 + smp_wmb(); /* Write lists before ALL_QUEUED. */ 182 + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); 183 + } 182 184 185 + spin_unlock(&rreq->lock); 186 + } 187 + 188 + static void netfs_issue_read(struct netfs_io_request *rreq, 189 + struct netfs_io_subrequest *subreq) 190 + { 183 191 switch (subreq->source) { 184 192 case NETFS_DOWNLOAD_FROM_SERVER: 185 193 rreq->netfs_ops->issue_read(subreq); ··· 303 293 } 304 294 size -= slice; 305 295 start += slice; 306 - if (size <= 0) { 307 - smp_wmb(); /* Write lists before ALL_QUEUED. */ 308 - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); 309 - } 310 296 297 + netfs_queue_read(rreq, subreq, size <= 0); 311 298 netfs_issue_read(rreq, subreq); 312 299 cond_resched(); 313 300 } while (size > 0);
+4
fs/netfs/internal.h
··· 135 135 extern atomic_t netfs_n_rh_write_done; 136 136 extern atomic_t netfs_n_rh_write_failed; 137 137 extern atomic_t netfs_n_rh_write_zskip; 138 + extern atomic_t netfs_n_rh_retry_read_req; 139 + extern atomic_t netfs_n_rh_retry_read_subreq; 138 140 extern atomic_t netfs_n_wh_buffered_write; 139 141 extern atomic_t netfs_n_wh_writethrough; 140 142 extern atomic_t netfs_n_wh_dio_write; ··· 149 147 extern atomic_t netfs_n_wh_write; 150 148 extern atomic_t netfs_n_wh_write_done; 151 149 extern atomic_t netfs_n_wh_write_failed; 150 + extern atomic_t netfs_n_wh_retry_write_req; 151 + extern atomic_t netfs_n_wh_retry_write_subreq; 152 152 extern atomic_t netfs_n_wb_lock_skip; 153 153 extern atomic_t netfs_n_wb_lock_wait; 154 154 extern atomic_t netfs_n_folioq;
+4 -2
fs/netfs/read_collect.c
··· 470 470 */ 471 471 void netfs_wake_read_collector(struct netfs_io_request *rreq) 472 472 { 473 - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { 473 + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && 474 + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { 474 475 if (!work_pending(&rreq->work)) { 475 476 netfs_get_request(rreq, netfs_rreq_trace_get_work); 476 477 if (!queue_work(system_unbound_wq, &rreq->work)) ··· 587 586 smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ 588 587 589 588 /* If we are at the head of the queue, wake up the collector. */ 590 - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) 589 + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || 590 + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) 591 591 netfs_wake_read_collector(rreq); 592 592 593 593 netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
+33 -10
fs/netfs/read_retry.c
··· 14 14 { 15 15 __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); 16 16 __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); 17 - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 17 + netfs_stat(&netfs_n_rh_retry_read_subreq); 18 18 subreq->rreq->netfs_ops->issue_read(subreq); 19 19 } 20 20 ··· 48 48 __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); 49 49 subreq->retry_count++; 50 50 netfs_reset_iter(subreq); 51 + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 51 52 netfs_reissue_read(rreq, subreq); 52 53 } 53 54 } ··· 76 75 struct iov_iter source; 77 76 unsigned long long start, len; 78 77 size_t part; 79 - bool boundary = false; 78 + bool boundary = false, subreq_superfluous = false; 80 79 81 80 /* Go through the subreqs and find the next span of contiguous 82 81 * buffer that we then rejig (cifs, for example, needs the ··· 117 116 /* Work through the sublist. */ 118 117 subreq = from; 119 118 list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { 120 - if (!len) 119 + if (!len) { 120 + subreq_superfluous = true; 121 121 break; 122 + } 122 123 subreq->source = NETFS_DOWNLOAD_FROM_SERVER; 123 124 subreq->start = start - subreq->transferred; 124 125 subreq->len = len + subreq->transferred; ··· 157 154 158 155 netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 159 156 netfs_reissue_read(rreq, subreq); 160 - if (subreq == to) 157 + if (subreq == to) { 158 + subreq_superfluous = false; 161 159 break; 160 + } 162 161 } 163 162 164 163 /* If we managed to use fewer subreqs, we can discard the 165 164 * excess; if we used the same number, then we're done. 
166 165 */ 167 166 if (!len) { 168 - if (subreq == to) 167 + if (!subreq_superfluous) 169 168 continue; 170 169 list_for_each_entry_safe_from(subreq, tmp, 171 170 &stream->subrequests, rreq_link) { 172 - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); 171 + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); 173 172 list_del(&subreq->rreq_link); 174 173 netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); 175 174 if (subreq == to) ··· 192 187 subreq->source = NETFS_DOWNLOAD_FROM_SERVER; 193 188 subreq->start = start; 194 189 subreq->len = len; 195 - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); 196 190 subreq->stream_nr = stream->stream_nr; 197 191 subreq->retry_count = 1; 198 192 199 193 trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, 200 194 refcount_read(&subreq->ref), 201 195 netfs_sreq_trace_new); 202 - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 203 196 204 197 list_add(&subreq->rreq_link, &to->rreq_link); 205 198 to = list_next_entry(to, rreq_link); ··· 259 256 { 260 257 struct netfs_io_subrequest *subreq; 261 258 struct netfs_io_stream *stream = &rreq->io_streams[0]; 259 + DEFINE_WAIT(myself); 260 + 261 + netfs_stat(&netfs_n_rh_retry_read_req); 262 + 263 + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); 262 264 263 265 /* Wait for all outstanding I/O to quiesce before performing retries as 264 266 * we may need to renegotiate the I/O sizes. 
265 267 */ 266 268 list_for_each_entry(subreq, &stream->subrequests, rreq_link) { 267 - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, 268 - TASK_UNINTERRUPTIBLE); 269 + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) 270 + continue; 271 + 272 + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); 273 + for (;;) { 274 + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); 275 + 276 + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) 277 + break; 278 + 279 + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); 280 + schedule(); 281 + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); 282 + } 283 + 284 + finish_wait(&rreq->waitq, &myself); 269 285 } 286 + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); 270 287 271 288 trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); 272 289 netfs_retry_read_subrequests(rreq);
+9
fs/netfs/stats.c
··· 29 29 atomic_t netfs_n_rh_write_done; 30 30 atomic_t netfs_n_rh_write_failed; 31 31 atomic_t netfs_n_rh_write_zskip; 32 + atomic_t netfs_n_rh_retry_read_req; 33 + atomic_t netfs_n_rh_retry_read_subreq; 32 34 atomic_t netfs_n_wh_buffered_write; 33 35 atomic_t netfs_n_wh_writethrough; 34 36 atomic_t netfs_n_wh_dio_write; ··· 43 41 atomic_t netfs_n_wh_write; 44 42 atomic_t netfs_n_wh_write_done; 45 43 atomic_t netfs_n_wh_write_failed; 44 + atomic_t netfs_n_wh_retry_write_req; 45 + atomic_t netfs_n_wh_retry_write_subreq; 46 46 atomic_t netfs_n_wb_lock_skip; 47 47 atomic_t netfs_n_wb_lock_wait; 48 48 atomic_t netfs_n_folioq; ··· 85 81 atomic_read(&netfs_n_wh_write), 86 82 atomic_read(&netfs_n_wh_write_done), 87 83 atomic_read(&netfs_n_wh_write_failed)); 84 + seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n", 85 + atomic_read(&netfs_n_rh_retry_read_req), 86 + atomic_read(&netfs_n_rh_retry_read_subreq), 87 + atomic_read(&netfs_n_wh_retry_write_req), 88 + atomic_read(&netfs_n_wh_retry_write_subreq)); 88 89 seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", 89 90 atomic_read(&netfs_n_rh_rreq), 90 91 atomic_read(&netfs_n_rh_sreq),
+1
fs/netfs/write_issue.c
··· 253 253 subreq->retry_count++; 254 254 __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); 255 255 __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); 256 + netfs_stat(&netfs_n_wh_retry_write_subreq); 256 257 netfs_do_issue_write(stream, subreq); 257 258 } 258 259
+2
fs/netfs/write_retry.c
··· 203 203 struct netfs_io_stream *stream; 204 204 int s; 205 205 206 + netfs_stat(&netfs_n_wh_retry_write_req); 207 + 206 208 /* Wait for all outstanding I/O to quiesce before performing retries as 207 209 * we may need to renegotiate the I/O sizes. 208 210 */
+1 -1
include/linux/netfs.h
··· 278 278 #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ 279 279 #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ 280 280 #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ 281 - #define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ 281 + #define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ 282 282 #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark 283 283 * write to cache on read */ 284 284 const struct netfs_request_ops *netfs_ops;
+3 -1
include/trace/events/netfs.h
··· 99 99 EM(netfs_sreq_trace_limited, "LIMIT") \ 100 100 EM(netfs_sreq_trace_need_clear, "N-CLR") \ 101 101 EM(netfs_sreq_trace_partial_read, "PARTR") \ 102 - EM(netfs_sreq_trace_need_retry, "NRTRY") \ 102 + EM(netfs_sreq_trace_need_retry, "ND-RT") \ 103 103 EM(netfs_sreq_trace_prepare, "PREP ") \ 104 104 EM(netfs_sreq_trace_prep_failed, "PRPFL") \ 105 105 EM(netfs_sreq_trace_progress, "PRGRS") \ ··· 108 108 EM(netfs_sreq_trace_short, "SHORT") \ 109 109 EM(netfs_sreq_trace_split, "SPLIT") \ 110 110 EM(netfs_sreq_trace_submit, "SUBMT") \ 111 + EM(netfs_sreq_trace_superfluous, "SPRFL") \ 111 112 EM(netfs_sreq_trace_terminated, "TERM ") \ 113 + EM(netfs_sreq_trace_wait_for, "_WAIT") \ 112 114 EM(netfs_sreq_trace_write, "WRITE") \ 113 115 EM(netfs_sreq_trace_write_skip, "SKIP ") \ 114 116 E_(netfs_sreq_trace_write_term, "WTERM")
+84 -50
kernel/acct.c
··· 103 103 atomic_long_t count; 104 104 struct rcu_head rcu; 105 105 struct mutex lock; 106 - int active; 106 + bool active; 107 + bool check_space; 107 108 unsigned long needcheck; 108 109 struct file *file; 109 110 struct pid_namespace *ns; 110 111 struct work_struct work; 111 112 struct completion done; 113 + acct_t ac; 112 114 }; 113 115 114 - static void do_acct_process(struct bsd_acct_struct *acct); 116 + static void fill_ac(struct bsd_acct_struct *acct); 117 + static void acct_write_process(struct bsd_acct_struct *acct); 115 118 116 119 /* 117 120 * Check the amount of free space and suspend/resume accordingly. 118 121 */ 119 - static int check_free_space(struct bsd_acct_struct *acct) 122 + static bool check_free_space(struct bsd_acct_struct *acct) 120 123 { 121 124 struct kstatfs sbuf; 122 125 123 - if (time_is_after_jiffies(acct->needcheck)) 124 - goto out; 126 + if (!acct->check_space) 127 + return acct->active; 125 128 126 129 /* May block */ 127 130 if (vfs_statfs(&acct->file->f_path, &sbuf)) 128 - goto out; 131 + return acct->active; 129 132 130 133 if (acct->active) { 131 134 u64 suspend = sbuf.f_blocks * SUSPEND; 132 135 do_div(suspend, 100); 133 136 if (sbuf.f_bavail <= suspend) { 134 - acct->active = 0; 137 + acct->active = false; 135 138 pr_info("Process accounting paused\n"); 136 139 } 137 140 } else { 138 141 u64 resume = sbuf.f_blocks * RESUME; 139 142 do_div(resume, 100); 140 143 if (sbuf.f_bavail >= resume) { 141 - acct->active = 1; 144 + acct->active = true; 142 145 pr_info("Process accounting resumed\n"); 143 146 } 144 147 } 145 148 146 149 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 147 - out: 148 150 return acct->active; 149 151 } 150 152 ··· 191 189 { 192 190 struct bsd_acct_struct *acct = to_acct(pin); 193 191 mutex_lock(&acct->lock); 194 - do_acct_process(acct); 192 + /* 193 + * Fill the accounting struct with the exiting task's info 194 + * before punting to the workqueue. 
195 + */ 196 + fill_ac(acct); 195 197 schedule_work(&acct->work); 196 198 wait_for_completion(&acct->done); 197 199 cmpxchg(&acct->ns->bacct, pin, NULL); ··· 208 202 { 209 203 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); 210 204 struct file *file = acct->file; 205 + 206 + /* We were fired by acct_pin_kill() which holds acct->lock. */ 207 + acct_write_process(acct); 211 208 if (file->f_op->flush) 212 209 file->f_op->flush(file, NULL); 213 210 __fput_sync(file); ··· 241 232 kfree(acct); 242 233 filp_close(file, NULL); 243 234 return -EACCES; 235 + } 236 + 237 + /* Exclude kernel kernel internal filesystems. */ 238 + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { 239 + kfree(acct); 240 + filp_close(file, NULL); 241 + return -EINVAL; 242 + } 243 + 244 + /* Exclude procfs and sysfs. */ 245 + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { 246 + kfree(acct); 247 + filp_close(file, NULL); 248 + return -EINVAL; 244 249 } 245 250 246 251 if (!(file->f_mode & FMODE_CAN_WRITE)) { ··· 453 430 * do_exit() or when switching to a different output file. 454 431 */ 455 432 456 - static void fill_ac(acct_t *ac) 433 + static void fill_ac(struct bsd_acct_struct *acct) 457 434 { 458 435 struct pacct_struct *pacct = &current->signal->pacct; 436 + struct file *file = acct->file; 437 + acct_t *ac = &acct->ac; 459 438 u64 elapsed, run_time; 460 439 time64_t btime; 461 440 struct tty_struct *tty; 441 + 442 + lockdep_assert_held(&acct->lock); 443 + 444 + if (time_is_after_jiffies(acct->needcheck)) { 445 + acct->check_space = false; 446 + 447 + /* Don't fill in @ac if nothing will be written. 
*/ 448 + if (!acct->active) 449 + return; 450 + } else { 451 + acct->check_space = true; 452 + } 462 453 463 454 /* 464 455 * Fill the accounting struct with the needed info as recorded ··· 521 484 ac->ac_majflt = encode_comp_t(pacct->ac_majflt); 522 485 ac->ac_exitcode = pacct->ac_exitcode; 523 486 spin_unlock_irq(&current->sighand->siglock); 524 - } 525 - /* 526 - * do_acct_process does all actual work. Caller holds the reference to file. 527 - */ 528 - static void do_acct_process(struct bsd_acct_struct *acct) 529 - { 530 - acct_t ac; 531 - unsigned long flim; 532 - const struct cred *orig_cred; 533 - struct file *file = acct->file; 534 487 535 - /* 536 - * Accounting records are not subject to resource limits. 537 - */ 538 - flim = rlimit(RLIMIT_FSIZE); 539 - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 540 - /* Perform file operations on behalf of whoever enabled accounting */ 541 - orig_cred = override_creds(file->f_cred); 542 - 543 - /* 544 - * First check to see if there is enough free_space to continue 545 - * the process accounting system. 
546 - */ 547 - if (!check_free_space(acct)) 548 - goto out; 549 - 550 - fill_ac(&ac); 551 488 /* we really need to bite the bullet and change layout */ 552 - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 553 - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 489 + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); 490 + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); 554 491 #if ACCT_VERSION == 1 || ACCT_VERSION == 2 555 492 /* backward-compatible 16 bit fields */ 556 - ac.ac_uid16 = ac.ac_uid; 557 - ac.ac_gid16 = ac.ac_gid; 493 + ac->ac_uid16 = ac->ac_uid; 494 + ac->ac_gid16 = ac->ac_gid; 558 495 #elif ACCT_VERSION == 3 559 496 { 560 497 struct pid_namespace *ns = acct->ns; 561 498 562 - ac.ac_pid = task_tgid_nr_ns(current, ns); 499 + ac->ac_pid = task_tgid_nr_ns(current, ns); 563 500 rcu_read_lock(); 564 - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), 565 - ns); 501 + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 566 502 rcu_read_unlock(); 567 503 } 568 504 #endif 505 + } 506 + 507 + static void acct_write_process(struct bsd_acct_struct *acct) 508 + { 509 + struct file *file = acct->file; 510 + const struct cred *cred; 511 + acct_t *ac = &acct->ac; 512 + 513 + /* Perform file operations on behalf of whoever enabled accounting */ 514 + cred = override_creds(file->f_cred); 515 + 569 516 /* 570 - * Get freeze protection. If the fs is frozen, just skip the write 571 - * as we could deadlock the system otherwise. 517 + * First check to see if there is enough free_space to continue 518 + * the process accounting system. Then get freeze protection. If 519 + * the fs is frozen, just skip the write as we could deadlock 520 + * the system otherwise. 
572 521 */ 573 - if (file_start_write_trylock(file)) { 522 + if (check_free_space(acct) && file_start_write_trylock(file)) { 574 523 /* it's been opened O_APPEND, so position is irrelevant */ 575 524 loff_t pos = 0; 576 - __kernel_write(file, &ac, sizeof(acct_t), &pos); 525 + __kernel_write(file, ac, sizeof(acct_t), &pos); 577 526 file_end_write(file); 578 527 } 579 - out: 528 + 529 + revert_creds(cred); 530 + } 531 + 532 + static void do_acct_process(struct bsd_acct_struct *acct) 533 + { 534 + unsigned long flim; 535 + 536 + /* Accounting records are not subject to resource limits. */ 537 + flim = rlimit(RLIMIT_FSIZE); 538 + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 539 + fill_ac(acct); 540 + acct_write_process(acct); 580 541 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 581 - revert_creds(orig_cred); 582 542 } 583 543 584 544 /**