Merge tag 'vfs-6.14-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

+14 -7

fs/netfs/buffered_read.c

··· 155 155 netfs_cache_read_terminated, subreq); 156 156 } 157 157 158 - static void netfs_issue_read(struct netfs_io_request *rreq, 159 - struct netfs_io_subrequest *subreq) 158 + static void netfs_queue_read(struct netfs_io_request *rreq, 159 + struct netfs_io_subrequest *subreq, 160 + bool last_subreq) 160 161 { 161 162 struct netfs_io_stream *stream = &rreq->io_streams[0]; 162 163 ··· 178 177 } 179 178 } 180 179 181 - spin_unlock(&rreq->lock); 180 + if (last_subreq) { 181 + smp_wmb(); /* Write lists before ALL_QUEUED. */ 182 + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); 183 + } 182 184 185 + spin_unlock(&rreq->lock); 186 + } 187 + 188 + static void netfs_issue_read(struct netfs_io_request *rreq, 189 + struct netfs_io_subrequest *subreq) 190 + { 183 191 switch (subreq->source) { 184 192 case NETFS_DOWNLOAD_FROM_SERVER: 185 193 rreq->netfs_ops->issue_read(subreq); ··· 303 293 } 304 294 size -= slice; 305 295 start += slice; 306 - if (size <= 0) { 307 - smp_wmb(); /* Write lists before ALL_QUEUED. */ 308 - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); 309 - } 310 296 297 + netfs_queue_read(rreq, subreq, size <= 0); 311 298 netfs_issue_read(rreq, subreq); 312 299 cond_resched(); 313 300 } while (size > 0);

+4

fs/netfs/internal.h

··· 135 135 extern atomic_t netfs_n_rh_write_done; 136 136 extern atomic_t netfs_n_rh_write_failed; 137 137 extern atomic_t netfs_n_rh_write_zskip; 138 + extern atomic_t netfs_n_rh_retry_read_req; 139 + extern atomic_t netfs_n_rh_retry_read_subreq; 138 140 extern atomic_t netfs_n_wh_buffered_write; 139 141 extern atomic_t netfs_n_wh_writethrough; 140 142 extern atomic_t netfs_n_wh_dio_write; ··· 149 147 extern atomic_t netfs_n_wh_write; 150 148 extern atomic_t netfs_n_wh_write_done; 151 149 extern atomic_t netfs_n_wh_write_failed; 150 + extern atomic_t netfs_n_wh_retry_write_req; 151 + extern atomic_t netfs_n_wh_retry_write_subreq; 152 152 extern atomic_t netfs_n_wb_lock_skip; 153 153 extern atomic_t netfs_n_wb_lock_wait; 154 154 extern atomic_t netfs_n_folioq;

+4 -2

fs/netfs/read_collect.c

··· 470 470 */ 471 471 void netfs_wake_read_collector(struct netfs_io_request *rreq) 472 472 { 473 - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { 473 + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && 474 + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { 474 475 if (!work_pending(&rreq->work)) { 475 476 netfs_get_request(rreq, netfs_rreq_trace_get_work); 476 477 if (!queue_work(system_unbound_wq, &rreq->work)) ··· 587 586 smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ 588 587 589 588 /* If we are at the head of the queue, wake up the collector. */ 590 - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) 589 + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || 590 + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) 591 591 netfs_wake_read_collector(rreq); 592 592 593 593 netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);

+33 -10

fs/netfs/read_retry.c

··· 14 14 { 15 15 __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); 16 16 __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); 17 - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 17 + netfs_stat(&netfs_n_rh_retry_read_subreq); 18 18 subreq->rreq->netfs_ops->issue_read(subreq); 19 19 } 20 20 ··· 48 48 __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); 49 49 subreq->retry_count++; 50 50 netfs_reset_iter(subreq); 51 + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 51 52 netfs_reissue_read(rreq, subreq); 52 53 } 53 54 } ··· 76 75 struct iov_iter source; 77 76 unsigned long long start, len; 78 77 size_t part; 79 - bool boundary = false; 78 + bool boundary = false, subreq_superfluous = false; 80 79 81 80 /* Go through the subreqs and find the next span of contiguous 82 81 * buffer that we then rejig (cifs, for example, needs the ··· 117 116 /* Work through the sublist. */ 118 117 subreq = from; 119 118 list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { 120 - if (!len) 119 + if (!len) { 120 + subreq_superfluous = true; 121 121 break; 122 + } 122 123 subreq->source = NETFS_DOWNLOAD_FROM_SERVER; 123 124 subreq->start = start - subreq->transferred; 124 125 subreq->len = len + subreq->transferred; ··· 157 154 158 155 netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 159 156 netfs_reissue_read(rreq, subreq); 160 - if (subreq == to) 157 + if (subreq == to) { 158 + subreq_superfluous = false; 161 159 break; 160 + } 162 161 } 163 162 164 163 /* If we managed to use fewer subreqs, we can discard the 165 164 * excess; if we used the same number, then we're done. 
166 165 */ 167 166 if (!len) { 168 - if (subreq == to) 167 + if (!subreq_superfluous) 169 168 continue; 170 169 list_for_each_entry_safe_from(subreq, tmp, 171 170 &stream->subrequests, rreq_link) { 172 - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); 171 + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); 173 172 list_del(&subreq->rreq_link); 174 173 netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); 175 174 if (subreq == to) ··· 192 187 subreq->source = NETFS_DOWNLOAD_FROM_SERVER; 193 188 subreq->start = start; 194 189 subreq->len = len; 195 - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); 196 190 subreq->stream_nr = stream->stream_nr; 197 191 subreq->retry_count = 1; 198 192 199 193 trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, 200 194 refcount_read(&subreq->ref), 201 195 netfs_sreq_trace_new); 202 - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 203 196 204 197 list_add(&subreq->rreq_link, &to->rreq_link); 205 198 to = list_next_entry(to, rreq_link); ··· 259 256 { 260 257 struct netfs_io_subrequest *subreq; 261 258 struct netfs_io_stream *stream = &rreq->io_streams[0]; 259 + DEFINE_WAIT(myself); 260 + 261 + netfs_stat(&netfs_n_rh_retry_read_req); 262 + 263 + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); 262 264 263 265 /* Wait for all outstanding I/O to quiesce before performing retries as 264 266 * we may need to renegotiate the I/O sizes. 
265 267 */ 266 268 list_for_each_entry(subreq, &stream->subrequests, rreq_link) { 267 - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, 268 - TASK_UNINTERRUPTIBLE); 269 + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) 270 + continue; 271 + 272 + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); 273 + for (;;) { 274 + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); 275 + 276 + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) 277 + break; 278 + 279 + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); 280 + schedule(); 281 + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); 282 + } 283 + 284 + finish_wait(&rreq->waitq, &myself); 269 285 } 286 + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); 270 287 271 288 trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); 272 289 netfs_retry_read_subrequests(rreq);

+9

fs/netfs/stats.c

··· 29 29 atomic_t netfs_n_rh_write_done; 30 30 atomic_t netfs_n_rh_write_failed; 31 31 atomic_t netfs_n_rh_write_zskip; 32 + atomic_t netfs_n_rh_retry_read_req; 33 + atomic_t netfs_n_rh_retry_read_subreq; 32 34 atomic_t netfs_n_wh_buffered_write; 33 35 atomic_t netfs_n_wh_writethrough; 34 36 atomic_t netfs_n_wh_dio_write; ··· 43 41 atomic_t netfs_n_wh_write; 44 42 atomic_t netfs_n_wh_write_done; 45 43 atomic_t netfs_n_wh_write_failed; 44 + atomic_t netfs_n_wh_retry_write_req; 45 + atomic_t netfs_n_wh_retry_write_subreq; 46 46 atomic_t netfs_n_wb_lock_skip; 47 47 atomic_t netfs_n_wb_lock_wait; 48 48 atomic_t netfs_n_folioq; ··· 85 81 atomic_read(&netfs_n_wh_write), 86 82 atomic_read(&netfs_n_wh_write_done), 87 83 atomic_read(&netfs_n_wh_write_failed)); 84 + seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n", 85 + atomic_read(&netfs_n_rh_retry_read_req), 86 + atomic_read(&netfs_n_rh_retry_read_subreq), 87 + atomic_read(&netfs_n_wh_retry_write_req), 88 + atomic_read(&netfs_n_wh_retry_write_subreq)); 88 89 seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", 89 90 atomic_read(&netfs_n_rh_rreq), 90 91 atomic_read(&netfs_n_rh_sreq),

+1

fs/netfs/write_issue.c

··· 253 253 subreq->retry_count++; 254 254 __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); 255 255 __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); 256 + netfs_stat(&netfs_n_wh_retry_write_subreq); 256 257 netfs_do_issue_write(stream, subreq); 257 258 } 258 259

+2

fs/netfs/write_retry.c

··· 203 203 struct netfs_io_stream *stream; 204 204 int s; 205 205 206 + netfs_stat(&netfs_n_wh_retry_write_req); 207 + 206 208 /* Wait for all outstanding I/O to quiesce before performing retries as 207 209 * we may need to renegotiate the I/O sizes. 208 210 */

+1 -1

include/linux/netfs.h

··· 278 278 #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ 279 279 #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ 280 280 #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ 281 - #define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ 281 + #define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ 282 282 #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark 283 283 * write to cache on read */ 284 284 const struct netfs_request_ops *netfs_ops;

+3 -1

include/trace/events/netfs.h

··· 99 99 EM(netfs_sreq_trace_limited, "LIMIT") \ 100 100 EM(netfs_sreq_trace_need_clear, "N-CLR") \ 101 101 EM(netfs_sreq_trace_partial_read, "PARTR") \ 102 - EM(netfs_sreq_trace_need_retry, "NRTRY") \ 102 + EM(netfs_sreq_trace_need_retry, "ND-RT") \ 103 103 EM(netfs_sreq_trace_prepare, "PREP ") \ 104 104 EM(netfs_sreq_trace_prep_failed, "PRPFL") \ 105 105 EM(netfs_sreq_trace_progress, "PRGRS") \ ··· 108 108 EM(netfs_sreq_trace_short, "SHORT") \ 109 109 EM(netfs_sreq_trace_split, "SPLIT") \ 110 110 EM(netfs_sreq_trace_submit, "SUBMT") \ 111 + EM(netfs_sreq_trace_superfluous, "SPRFL") \ 111 112 EM(netfs_sreq_trace_terminated, "TERM ") \ 113 + EM(netfs_sreq_trace_wait_for, "_WAIT") \ 112 114 EM(netfs_sreq_trace_write, "WRITE") \ 113 115 EM(netfs_sreq_trace_write_skip, "SKIP ") \ 114 116 E_(netfs_sreq_trace_write_term, "WTERM")

+84 -50

kernel/acct.c

··· 103 103 atomic_long_t count; 104 104 struct rcu_head rcu; 105 105 struct mutex lock; 106 - int active; 106 + bool active; 107 + bool check_space; 107 108 unsigned long needcheck; 108 109 struct file *file; 109 110 struct pid_namespace *ns; 110 111 struct work_struct work; 111 112 struct completion done; 113 + acct_t ac; 112 114 }; 113 115 114 - static void do_acct_process(struct bsd_acct_struct *acct); 116 + static void fill_ac(struct bsd_acct_struct *acct); 117 + static void acct_write_process(struct bsd_acct_struct *acct); 115 118 116 119 /* 117 120 * Check the amount of free space and suspend/resume accordingly. 118 121 */ 119 - static int check_free_space(struct bsd_acct_struct *acct) 122 + static bool check_free_space(struct bsd_acct_struct *acct) 120 123 { 121 124 struct kstatfs sbuf; 122 125 123 - if (time_is_after_jiffies(acct->needcheck)) 124 - goto out; 126 + if (!acct->check_space) 127 + return acct->active; 125 128 126 129 /* May block */ 127 130 if (vfs_statfs(&acct->file->f_path, &sbuf)) 128 - goto out; 131 + return acct->active; 129 132 130 133 if (acct->active) { 131 134 u64 suspend = sbuf.f_blocks * SUSPEND; 132 135 do_div(suspend, 100); 133 136 if (sbuf.f_bavail <= suspend) { 134 - acct->active = 0; 137 + acct->active = false; 135 138 pr_info("Process accounting paused\n"); 136 139 } 137 140 } else { 138 141 u64 resume = sbuf.f_blocks * RESUME; 139 142 do_div(resume, 100); 140 143 if (sbuf.f_bavail >= resume) { 141 - acct->active = 1; 144 + acct->active = true; 142 145 pr_info("Process accounting resumed\n"); 143 146 } 144 147 } 145 148 146 149 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 147 - out: 148 150 return acct->active; 149 151 } 150 152 ··· 191 189 { 192 190 struct bsd_acct_struct *acct = to_acct(pin); 193 191 mutex_lock(&acct->lock); 194 - do_acct_process(acct); 192 + /* 193 + * Fill the accounting struct with the exiting task's info 194 + * before punting to the workqueue. 
195 + */ 196 + fill_ac(acct); 195 197 schedule_work(&acct->work); 196 198 wait_for_completion(&acct->done); 197 199 cmpxchg(&acct->ns->bacct, pin, NULL); ··· 208 202 { 209 203 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); 210 204 struct file *file = acct->file; 205 + 206 + /* We were fired by acct_pin_kill() which holds acct->lock. */ 207 + acct_write_process(acct); 211 208 if (file->f_op->flush) 212 209 file->f_op->flush(file, NULL); 213 210 __fput_sync(file); ··· 241 232 kfree(acct); 242 233 filp_close(file, NULL); 243 234 return -EACCES; 235 + } 236 + 237 + /* Exclude kernel internal filesystems. */ 238 + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { 239 + kfree(acct); 240 + filp_close(file, NULL); 241 + return -EINVAL; 242 + } 243 + 244 + /* Exclude procfs and sysfs. */ 245 + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { 246 + kfree(acct); 247 + filp_close(file, NULL); 248 + return -EINVAL; 244 249 } 245 250 246 251 if (!(file->f_mode & FMODE_CAN_WRITE)) { ··· 453 430 * do_exit() or when switching to a different output file. 454 431 */ 455 432 456 - static void fill_ac(acct_t *ac) 433 + static void fill_ac(struct bsd_acct_struct *acct) 457 434 { 458 435 struct pacct_struct *pacct = &current->signal->pacct; 436 + struct file *file = acct->file; 437 + acct_t *ac = &acct->ac; 459 438 u64 elapsed, run_time; 460 439 time64_t btime; 461 440 struct tty_struct *tty; 441 + 442 + lockdep_assert_held(&acct->lock); 443 + 444 + if (time_is_after_jiffies(acct->needcheck)) { 445 + acct->check_space = false; 446 + 447 + /* Don't fill in @ac if nothing will be written. 
*/ 448 + if (!acct->active) 449 + return; 450 + } else { 451 + acct->check_space = true; 452 + } 462 453 463 454 /* 464 455 * Fill the accounting struct with the needed info as recorded ··· 521 484 ac->ac_majflt = encode_comp_t(pacct->ac_majflt); 522 485 ac->ac_exitcode = pacct->ac_exitcode; 523 486 spin_unlock_irq(&current->sighand->siglock); 524 - } 525 - /* 526 - * do_acct_process does all actual work. Caller holds the reference to file. 527 - */ 528 - static void do_acct_process(struct bsd_acct_struct *acct) 529 - { 530 - acct_t ac; 531 - unsigned long flim; 532 - const struct cred *orig_cred; 533 - struct file *file = acct->file; 534 487 535 - /* 536 - * Accounting records are not subject to resource limits. 537 - */ 538 - flim = rlimit(RLIMIT_FSIZE); 539 - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 540 - /* Perform file operations on behalf of whoever enabled accounting */ 541 - orig_cred = override_creds(file->f_cred); 542 - 543 - /* 544 - * First check to see if there is enough free_space to continue 545 - * the process accounting system. 
546 - */ 547 - if (!check_free_space(acct)) 548 - goto out; 549 - 550 - fill_ac(&ac); 551 488 /* we really need to bite the bullet and change layout */ 552 - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 553 - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 489 + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); 490 + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); 554 491 #if ACCT_VERSION == 1 || ACCT_VERSION == 2 555 492 /* backward-compatible 16 bit fields */ 556 - ac.ac_uid16 = ac.ac_uid; 557 - ac.ac_gid16 = ac.ac_gid; 493 + ac->ac_uid16 = ac->ac_uid; 494 + ac->ac_gid16 = ac->ac_gid; 558 495 #elif ACCT_VERSION == 3 559 496 { 560 497 struct pid_namespace *ns = acct->ns; 561 498 562 - ac.ac_pid = task_tgid_nr_ns(current, ns); 499 + ac->ac_pid = task_tgid_nr_ns(current, ns); 563 500 rcu_read_lock(); 564 - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), 565 - ns); 501 + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 566 502 rcu_read_unlock(); 567 503 } 568 504 #endif 505 + } 506 + 507 + static void acct_write_process(struct bsd_acct_struct *acct) 508 + { 509 + struct file *file = acct->file; 510 + const struct cred *cred; 511 + acct_t *ac = &acct->ac; 512 + 513 + /* Perform file operations on behalf of whoever enabled accounting */ 514 + cred = override_creds(file->f_cred); 515 + 569 516 /* 570 - * Get freeze protection. If the fs is frozen, just skip the write 571 - * as we could deadlock the system otherwise. 517 + * First check to see if there is enough free_space to continue 518 + * the process accounting system. Then get freeze protection. If 519 + * the fs is frozen, just skip the write as we could deadlock 520 + * the system otherwise. 
572 521 */ 573 - if (file_start_write_trylock(file)) { 522 + if (check_free_space(acct) && file_start_write_trylock(file)) { 574 523 /* it's been opened O_APPEND, so position is irrelevant */ 575 524 loff_t pos = 0; 576 - __kernel_write(file, &ac, sizeof(acct_t), &pos); 525 + __kernel_write(file, ac, sizeof(acct_t), &pos); 577 526 file_end_write(file); 578 527 } 579 - out: 528 + 529 + revert_creds(cred); 530 + } 531 + 532 + static void do_acct_process(struct bsd_acct_struct *acct) 533 + { 534 + unsigned long flim; 535 + 536 + /* Accounting records are not subject to resource limits. */ 537 + flim = rlimit(RLIMIT_FSIZE); 538 + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 539 + fill_ac(acct); 540 + acct_write_process(acct); 580 541 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 581 - revert_creds(orig_cred); 582 542 } 583 543 584 544 /**

Configure Feed

Configure Feed