Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'ext4_for_linus-6.5-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
"Bug and regression fixes for 6.5-rc3 for ext4's mballoc and jbd2's
checkpoint code"

* tag 'ext4_for_linus-6.5-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: fix rbtree traversal bug in ext4_mb_use_preallocated
ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()
ext4: correct inline offset when handling xattrs in inode body
jbd2: remove __journal_try_to_free_buffer()
jbd2: fix a race when checking checkpoint buffer busy
jbd2: fix wrong judgement for buffer head removal while doing checkpoint
jbd2: remove journal_clean_one_cp_list()
jbd2: remove t_checkpoint_io_list
jbd2: recheck checkpointing non-dirty buffer

+271 -272
+143 -35
fs/ext4/mballoc.c
@@ -1006 +1006 @@
     * fls() instead since we need to know the actual length while modifying
     * goal length.
     */
-   order = fls(ac->ac_g_ex.fe_len);
+   order = fls(ac->ac_g_ex.fe_len) - 1;
    min_order = order - sbi->s_mb_best_avail_max_trim_order;
    if (min_order < 0)
        min_order = 0;
-
-   if (1 << min_order < ac->ac_o_ex.fe_len)
-       min_order = fls(ac->ac_o_ex.fe_len) + 1;

    if (sbi->s_stripe > 0) {
        /*
@@ -1018 +1021 @@
         */
        num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
        if (1 << min_order < num_stripe_clusters)
-           min_order = fls(num_stripe_clusters);
+           /*
+            * We consider 1 order less because later we round
+            * up the goal len to num_stripe_clusters
+            */
+           min_order = fls(num_stripe_clusters) - 1;
    }
+
+   if (1 << min_order < ac->ac_o_ex.fe_len)
+       min_order = fls(ac->ac_o_ex.fe_len);

    for (i = order; i >= min_order; i--) {
        int frag_order;
@@ -4765 +4761 @@
    int order, i;
    struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    struct ext4_locality_group *lg;
-   struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
-   ext4_lblk_t tmp_pa_start, tmp_pa_end;
+   struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+   loff_t tmp_pa_end;
    struct rb_node *iter;
    ext4_fsblk_t goal_block;
@@ -4774 +4770 @@
    if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
        return false;

-   /* first, try per-file preallocation */
+   /*
+    * first, try per-file preallocation by searching the inode pa rbtree.
+    *
+    * Here, we can't do a direct traversal of the tree because
+    * ext4_mb_discard_group_preallocation() can paralelly mark the pa
+    * deleted and that can cause direct traversal to skip some entries.
+    */
    read_lock(&ei->i_prealloc_lock);
+
+   if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+       goto try_group_pa;
+   }
+
+   /*
+    * Step 1: Find a pa with logical start immediately adjacent to the
+    * original logical start. This could be on the left or right.
+    *
+    * (tmp_pa->pa_lstart never changes so we can skip locking for it).
+    */
    for (iter = ei->i_prealloc_node.rb_node; iter;
         iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
-                                       tmp_pa_start, iter)) {
+                                       tmp_pa->pa_lstart, iter)) {
        tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
                          pa_node.inode_node);
+   }

-       /* all fields in this condition don't change,
-        * so we can skip locking for them */
-       tmp_pa_start = tmp_pa->pa_lstart;
-       tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+   /*
+    * Step 2: The adjacent pa might be to the right of logical start, find
+    * the left adjacent pa. After this step we'd have a valid tmp_pa whose
+    * logical start is towards the left of original request's logical start
+    */
+   if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+       struct rb_node *tmp;
+       tmp = rb_prev(&tmp_pa->pa_node.inode_node);

-       /* original request start doesn't lie in this PA */
-       if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
-           ac->ac_o_ex.fe_logical >= tmp_pa_end)
-           continue;
-
-       /* non-extent files can't have physical blocks past 2^32 */
-       if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
-           (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
-            EXT4_MAX_BLOCK_FILE_PHYS)) {
+       if (tmp) {
+           tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+                             pa_node.inode_node);
+       } else {
            /*
-            * Since PAs don't overlap, we won't find any
-            * other PA to satisfy this.
+            * If there is no adjacent pa to the left then finding
+            * an overlapping pa is not possible hence stop searching
+            * inode pa tree
+            */
+           goto try_group_pa;
+       }
+   }
+
+   BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+
+   /*
+    * Step 3: If the left adjacent pa is deleted, keep moving left to find
+    * the first non deleted adjacent pa. After this step we should have a
+    * valid tmp_pa which is guaranteed to be non deleted.
+    */
+   for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
+       if (!iter) {
+           /*
+            * no non deleted left adjacent pa, so stop searching
+            * inode pa tree
+            */
+           goto try_group_pa;
+       }
+       tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+                         pa_node.inode_node);
+       spin_lock(&tmp_pa->pa_lock);
+       if (tmp_pa->pa_deleted == 0) {
+           /*
+            * We will keep holding the pa_lock from
+            * this point on because we don't want group discard
+            * to delete this pa underneath us. Since group
+            * discard is anyways an ENOSPC operation it
+            * should be okay for it to wait a few more cycles.
             */
            break;
-       }
-
-       /* found preallocated blocks, use them */
-       spin_lock(&tmp_pa->pa_lock);
-       if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
-           likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
-           atomic_inc(&tmp_pa->pa_count);
-           ext4_mb_use_inode_pa(ac, tmp_pa);
+       } else {
            spin_unlock(&tmp_pa->pa_lock);
-           read_unlock(&ei->i_prealloc_lock);
-           return true;
        }
-       spin_unlock(&tmp_pa->pa_lock);
    }
+
+   BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+   BUG_ON(tmp_pa->pa_deleted == 1);
+
+   /*
+    * Step 4: We now have the non deleted left adjacent pa. Only this
+    * pa can possibly satisfy the request hence check if it overlaps
+    * original logical start and stop searching if it doesn't.
+    */
+   tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+   if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
+       spin_unlock(&tmp_pa->pa_lock);
+       goto try_group_pa;
+   }
+
+   /* non-extent files can't have physical blocks past 2^32 */
+   if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+       (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+        EXT4_MAX_BLOCK_FILE_PHYS)) {
+       /*
+        * Since PAs don't overlap, we won't find any other PA to
+        * satisfy this.
+        */
+       spin_unlock(&tmp_pa->pa_lock);
+       goto try_group_pa;
+   }
+
+   if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
+       atomic_inc(&tmp_pa->pa_count);
+       ext4_mb_use_inode_pa(ac, tmp_pa);
+       spin_unlock(&tmp_pa->pa_lock);
+       read_unlock(&ei->i_prealloc_lock);
+       return true;
+   } else {
+       /*
+        * We found a valid overlapping pa but couldn't use it because
+        * it had no free blocks. This should ideally never happen
+        * because:
+        *
+        * 1. When a new inode pa is added to rbtree it must have
+        *    pa_free > 0 since otherwise we won't actually need
+        *    preallocation.
+        *
+        * 2. An inode pa that is in the rbtree can only have it's
+        *    pa_free become zero when another thread calls:
+        *      ext4_mb_new_blocks
+        *       ext4_mb_use_preallocated
+        *        ext4_mb_use_inode_pa
+        *
+        * 3. Further, after the above calls make pa_free == 0, we will
+        *    immediately remove it from the rbtree in:
+        *      ext4_mb_new_blocks
+        *       ext4_mb_release_context
+        *        ext4_mb_put_pa
+        *
+        * 4. Since the pa_free becoming 0 and pa_free getting removed
+        *    from tree both happen in ext4_mb_new_blocks, which is always
+        *    called with i_data_sem held for data allocations, we can be
+        *    sure that another process will never see a pa in rbtree with
+        *    pa_free == 0.
+        */
+       WARN_ON_ONCE(tmp_pa->pa_free == 0);
+   }
+   spin_unlock(&tmp_pa->pa_lock);
+try_group_pa:
    read_unlock(&ei->i_prealloc_lock);

    /* can we use group allocation? */
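The off-by-one fix in ext4_mb_choose_next_group_best_avail() above comes down to fls() semantics: the kernel's fls(n) returns one plus the index of the most significant set bit, so the buddy order of a length is fls(len) - 1, not fls(len). A minimal userspace sketch of that arithmetic (the fls() stand-in below is built on __builtin_clz and is an assumption, not the kernel implementation):

#include <assert.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's fls(): returns one plus the
 * index of the most significant set bit, or 0 if no bit is set. */
static int fls(unsigned int x)
{
    return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
    /* A goal length of 8 clusters is order 3 (1 << 3 == 8). The old
     * code used fls(8) == 4 and overshot by one; the fix subtracts 1
     * so the starting order matches the actual goal length. */
    assert(fls(8) - 1 == 3);

    /* For non-powers of two, fls(len) - 1 is floor(log2(len)), the
     * largest order that does not exceed the length. */
    assert(fls(5) - 1 == 2);

    printf("order(8) = %d, order(5) = %d\n", fls(8) - 1, fls(5) - 1);
    return 0;
}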
+14
fs/ext4/xattr.c
@@ -1782 +1782 @@
            memmove(here, (void *)here + size,
                    (void *)last - (void *)here + sizeof(__u32));
            memset(last, 0, size);
+
+           /*
+            * Update i_inline_off - moved ibody region might contain
+            * system.data attribute.  Handling a failure here won't
+            * cause other complications for setting an xattr.
+            */
+           if (!is_block && ext4_has_inline_data(inode)) {
+               ret = ext4_find_inline_data_nolock(inode);
+               if (ret) {
+                   ext4_warning_inode(inode,
+                       "unable to update i_inline_off");
+                   goto out;
+               }
+           }
        } else if (s->not_found) {
            /* Insert new name. */
            size_t size = EXT4_XATTR_LEN(name_len);
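The i_inline_off fix is easiest to see in miniature: the memmove() that compacts the in-inode xattr region shifts every entry after the removed one, so a byte offset cached before the move (as ext4 caches the offset of the system.data attribute in i_inline_off) no longer points at the right entry, which is why the diff recomputes it via ext4_find_inline_data_nolock(). A toy, self-contained illustration (buffer contents and offsets are hypothetical):

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Toy model of the inode-body xattr region: three 4-byte
     * "entries", with a cached offset to the third one. */
    char ibody[16] = "AAAABBBBCCCC";
    int cached_off = 8;                 /* offset of "CCCC" */

    /* Remove the middle entry the way the xattr code does: compact
     * the region with memmove() and zero the freed tail. */
    memmove(ibody + 4, ibody + 8, 4);
    memset(ibody + 8, 0, 4);

    /* The cached offset is now stale and must be recomputed. */
    printf("via cached offset: \"%.4s\"\n", ibody + cached_off); /* ""   */
    printf("actual new offset: \"%.4s\"\n", ibody + 4);          /* CCCC */
    return 0;
}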
+100 -189
fs/jbd2/checkpoint.c
@@ -27 +27 @@
  *
  * Called with j_list_lock held.
  */
-static inline void __buffer_unlink_first(struct journal_head *jh)
+static inline void __buffer_unlink(struct journal_head *jh)
 {
    transaction_t *transaction = jh->b_cp_transaction;

@@ -38 +38 @@
        if (transaction->t_checkpoint_list == jh)
            transaction->t_checkpoint_list = NULL;
    }
-}
-
-/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink(struct journal_head *jh)
-{
-   transaction_t *transaction = jh->b_cp_transaction;
-
-   __buffer_unlink_first(jh);
-   if (transaction->t_checkpoint_io_list == jh) {
-       transaction->t_checkpoint_io_list = jh->b_cpnext;
-       if (transaction->t_checkpoint_io_list == jh)
-           transaction->t_checkpoint_io_list = NULL;
-   }
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
-   transaction_t *transaction = jh->b_cp_transaction;
-
-   __buffer_unlink_first(jh);
-
-   if (!transaction->t_checkpoint_io_list) {
-       jh->b_cpnext = jh->b_cpprev = jh;
-   } else {
-       jh->b_cpnext = transaction->t_checkpoint_io_list;
-       jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
-       jh->b_cpprev->b_cpnext = jh;
-       jh->b_cpnext->b_cpprev = jh;
-   }
-   transaction->t_checkpoint_io_list = jh;
 }

 /*
@@ -144 +183 @@
        struct buffer_head *bh = journal->j_chkpt_bhs[i];
        BUFFER_TRACE(bh, "brelse");
        __brelse(bh);
+       journal->j_chkpt_bhs[i] = NULL;
    }
    *batch_count = 0;
 }
@@ -204 +242 @@
        jh = transaction->t_checkpoint_list;
        bh = jh2bh(jh);

-       if (buffer_locked(bh)) {
-           get_bh(bh);
-           spin_unlock(&journal->j_list_lock);
-           wait_on_buffer(bh);
-           /* the journal_head may have gone by now */
-           BUFFER_TRACE(bh, "brelse");
-           __brelse(bh);
-           goto retry;
-       }
        if (jh->b_transaction != NULL) {
            transaction_t *t = jh->b_transaction;
            tid_t tid = t->t_tid;
@@ -238 +285 @@
            spin_lock(&journal->j_list_lock);
            goto restart;
        }
-       if (!buffer_dirty(bh)) {
+       if (!trylock_buffer(bh)) {
+           /*
+            * The buffer is locked, it may be writing back, or
+            * flushing out in the last couple of cycles, or
+            * re-adding into a new transaction, need to check
+            * it again until it's unlocked.
+            */
+           get_bh(bh);
+           spin_unlock(&journal->j_list_lock);
+           wait_on_buffer(bh);
+           /* the journal_head may have gone by now */
+           BUFFER_TRACE(bh, "brelse");
+           __brelse(bh);
+           goto retry;
+       } else if (!buffer_dirty(bh)) {
+           unlock_buffer(bh);
            BUFFER_TRACE(bh, "remove from checkpoint");
-           if (__jbd2_journal_remove_checkpoint(jh))
-               /* The transaction was released; we're done */
+           /*
+            * If the transaction was released or the checkpoint
+            * list was empty, we're done.
+            */
+           if (__jbd2_journal_remove_checkpoint(jh) ||
+               !transaction->t_checkpoint_list)
                goto out;
-           continue;
+       } else {
+           unlock_buffer(bh);
+           /*
+            * We are about to write the buffer, it could be
+            * raced by some other transaction shrink or buffer
+            * re-log logic once we release the j_list_lock,
+            * leave it on the checkpoint list and check status
+            * again to make sure it's clean.
+            */
+           BUFFER_TRACE(bh, "queue");
+           get_bh(bh);
+           J_ASSERT_BH(bh, !buffer_jwrite(bh));
+           journal->j_chkpt_bhs[batch_count++] = bh;
+           transaction->t_chp_stats.cs_written++;
+           transaction->t_checkpoint_list = jh->b_cpnext;
        }
-       /*
-        * Important: we are about to write the buffer, and
-        * possibly block, while still holding the journal
-        * lock. We cannot afford to let the transaction
-        * logic start messing around with this buffer before
-        * we write it to disk, as that would break
-        * recoverability.
-        */
-       BUFFER_TRACE(bh, "queue");
-       get_bh(bh);
-       J_ASSERT_BH(bh, !buffer_jwrite(bh));
-       journal->j_chkpt_bhs[batch_count++] = bh;
-       __buffer_relink_io(jh);
-       transaction->t_chp_stats.cs_written++;
+
        if ((batch_count == JBD2_NR_BATCH) ||
-           need_resched() ||
-           spin_needbreak(&journal->j_list_lock))
+           need_resched() || spin_needbreak(&journal->j_list_lock) ||
+           jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
            goto unlock_and_flush;
    }
@@ -295 +322 @@
        goto restart;
    }

-   /*
-    * Now we issued all of the transaction's buffers, let's deal
-    * with the buffers that are out for I/O.
-    */
-restart2:
-   /* Did somebody clean up the transaction in the meanwhile? */
-   if (journal->j_checkpoint_transactions != transaction ||
-       transaction->t_tid != this_tid)
-       goto out;
-
-   while (transaction->t_checkpoint_io_list) {
-       jh = transaction->t_checkpoint_io_list;
-       bh = jh2bh(jh);
-       if (buffer_locked(bh)) {
-           get_bh(bh);
-           spin_unlock(&journal->j_list_lock);
-           wait_on_buffer(bh);
-           /* the journal_head may have gone by now */
-           BUFFER_TRACE(bh, "brelse");
-           __brelse(bh);
-           spin_lock(&journal->j_list_lock);
-           goto restart2;
-       }
-
-       /*
-        * Now in whatever state the buffer currently is, we
-        * know that it has been written out and so we can
-        * drop it from the list
-        */
-       if (__jbd2_journal_remove_checkpoint(jh))
-           break;
-   }
 out:
    spin_unlock(&journal->j_list_lock);
    result = jbd2_cleanup_journal_tail(journal);
@@ -350 +409 @@
 /* Checkpoint list management */

 /*
- * journal_clean_one_cp_list
+ * journal_shrink_one_cp_list
  *
- * Find all the written-back checkpoint buffers in the given list and
- * release them. If 'destroy' is set, clean all buffers unconditionally.
+ * Find all the written-back checkpoint buffers in the given list
+ * and try to release them. If the whole transaction is released, set
+ * the 'released' parameter. Return the number of released checkpointed
+ * buffers.
  *
  * Called with j_list_lock held.
- * Returns 1 if we freed the transaction, 0 otherwise.
  */
-static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
+static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
+                                               bool destroy, bool *released)
 {
    struct journal_head *last_jh;
    struct journal_head *next_jh = jh;
+   unsigned long nr_freed = 0;
+   int ret;

+   *released = false;
    if (!jh)
        return 0;

@@ -376 +430 @@
        jh = next_jh;
        next_jh = jh->b_cpnext;

-       if (!destroy && __cp_buffer_busy(jh))
-           return 0;
-
-       if (__jbd2_journal_remove_checkpoint(jh))
-           return 1;
-       /*
-        * This function only frees up some memory
-        * if possible so we dont have an obligation
-        * to finish processing. Bail out if preemption
-        * requested:
-        */
-       if (need_resched())
-           return 0;
-   } while (jh != last_jh);
-
-   return 0;
-}
-
-/*
- * journal_shrink_one_cp_list
- *
- * Find 'nr_to_scan' written-back checkpoint buffers in the given list
- * and try to release them. If the whole transaction is released, set
- * the 'released' parameter. Return the number of released checkpointed
- * buffers.
- *
- * Called with j_list_lock held.
- */
-static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
-                                               unsigned long *nr_to_scan,
-                                               bool *released)
-{
-   struct journal_head *last_jh;
-   struct journal_head *next_jh = jh;
-   unsigned long nr_freed = 0;
-   int ret;
-
-   if (!jh || *nr_to_scan == 0)
-       return 0;
-
-   last_jh = jh->b_cpprev;
-   do {
-       jh = next_jh;
-       next_jh = jh->b_cpnext;
-
-       (*nr_to_scan)--;
-       if (__cp_buffer_busy(jh))
-           continue;
+       if (destroy) {
+           ret = __jbd2_journal_remove_checkpoint(jh);
+       } else {
+           ret = jbd2_journal_try_remove_checkpoint(jh);
+           if (ret < 0)
+               continue;
+       }

        nr_freed++;
-       ret = __jbd2_journal_remove_checkpoint(jh);
        if (ret) {
            *released = true;
            break;
@@ -392 +488 @@

        if (need_resched())
            break;
-   } while (jh != last_jh && *nr_to_scan);
+   } while (jh != last_jh);

    return nr_freed;
 }
@@ -410 +506 @@
                                     unsigned long *nr_to_scan)
 {
    transaction_t *transaction, *last_transaction, *next_transaction;
-   bool released;
+   bool __maybe_unused released;
    tid_t first_tid = 0, last_tid = 0, next_tid = 0;
    tid_t tid = 0;
    unsigned long nr_freed = 0;
-   unsigned long nr_scanned = *nr_to_scan;
+   unsigned long freed;

 again:
    spin_lock(&journal->j_list_lock);
@@ -443 +539 @@
        transaction = next_transaction;
        next_transaction = transaction->t_cpnext;
        tid = transaction->t_tid;
-       released = false;

-       nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
-                                              nr_to_scan, &released);
-       if (*nr_to_scan == 0)
-           break;
-       if (need_resched() || spin_needbreak(&journal->j_list_lock))
-           break;
-       if (released)
-           continue;
-
-       nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
-                                              nr_to_scan, &released);
+       freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+                                          false, &released);
+       nr_freed += freed;
+       (*nr_to_scan) -= min(*nr_to_scan, freed);
        if (*nr_to_scan == 0)
            break;
        if (need_resched() || spin_needbreak(&journal->j_list_lock))
@@ -468 +572 @@
    if (*nr_to_scan && next_tid)
        goto again;
 out:
-   nr_scanned -= *nr_to_scan;
    trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
-                                     nr_freed, nr_scanned, next_tid);
+                                     nr_freed, next_tid);

    return nr_freed;
 }
@@ -485 +590 @@
 void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
 {
    transaction_t *transaction, *last_transaction, *next_transaction;
-   int ret;
+   bool released;

    transaction = journal->j_checkpoint_transactions;
    if (!transaction)
@@ -496 +601 @@
    do {
        transaction = next_transaction;
        next_transaction = transaction->t_cpnext;
-       ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
-                                       destroy);
+       journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+                                  destroy, &released);
        /*
         * This function only frees up some memory if possible so we
         * dont have an obligation to finish processing. Bail out if
@@ -505 +610 @@
         */
        if (need_resched())
            return;
-       if (ret)
-           continue;
-       /*
-        * It is essential that we are as careful as in the case of
-        * t_checkpoint_list with removing the buffer from the list as
-        * we can possibly see not yet submitted buffers on io_list
-        */
-       ret = journal_clean_one_cp_list(transaction->
-               t_checkpoint_io_list, destroy);
-       if (need_resched())
-           return;
        /*
         * Stop scanning if we couldn't free the transaction. This
         * avoids pointless scanning of transactions which still
         * weren't checkpointed.
         */
-       if (!ret)
+       if (!released)
            return;
    } while (transaction != last_transaction);
 }
@@ -589 +705 @@
    jbd2_journal_put_journal_head(jh);

    /* Is this transaction empty? */
-   if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list)
+   if (transaction->t_checkpoint_list)
        return 0;
@@ -618 +734 @@
    __jbd2_journal_drop_transaction(journal, transaction);
    jbd2_journal_free_transaction(transaction);
    return 1;
+}
+
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+   struct buffer_head *bh = jh2bh(jh);
+
+   if (!trylock_buffer(bh))
+       return -EBUSY;
+   if (buffer_dirty(bh)) {
+       unlock_buffer(bh);
+       return -EBUSY;
+   }
+   unlock_buffer(bh);
+
+   /*
+    * Buffer is clean and the IO has finished (we held the buffer
+    * lock) so the checkpoint is done. We can safely remove the
+    * buffer from this transaction.
+    */
+   JBUFFER_TRACE(jh, "remove from checkpoint list");
+   return __jbd2_journal_remove_checkpoint(jh);
 }
@@ -709 +797 @@
    J_ASSERT(transaction->t_forget == NULL);
    J_ASSERT(transaction->t_shadow_list == NULL);
    J_ASSERT(transaction->t_checkpoint_list == NULL);
-   J_ASSERT(transaction->t_checkpoint_io_list == NULL);
    J_ASSERT(atomic_read(&transaction->t_updates) == 0);
    J_ASSERT(journal->j_committing_transaction != transaction);
    J_ASSERT(journal->j_running_transaction != transaction);
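The checkpoint race fix replaces the old lockless buffer_locked()/buffer_dirty() tests with jbd2_journal_try_remove_checkpoint(), which only trusts a clean buffer after winning the buffer lock: a buffer can look clean while writeback still holds its lock, or be redirtied the moment the lock drops. A rough userspace analogue of that trylock-then-test shape, with hypothetical names and a pthread mutex standing in for the buffer lock:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a jbd2 buffer: the lock is held while
 * I/O is in flight, and dirty is cleared once data reaches disk. */
struct buffer {
    pthread_mutex_t lock;
    bool dirty;
};

/* Mirrors the shape of jbd2_journal_try_remove_checkpoint(): only
 * treat the buffer as checkpointed if we can take its lock (no I/O
 * in flight) and it is clean; otherwise report -EBUSY. */
static int try_remove_checkpoint(struct buffer *b)
{
    if (pthread_mutex_trylock(&b->lock) != 0)
        return -EBUSY;          /* locked: writeback may be running */
    if (b->dirty) {
        pthread_mutex_unlock(&b->lock);
        return -EBUSY;          /* still dirty: not written back yet */
    }
    pthread_mutex_unlock(&b->lock);
    return 0;                   /* clean and unlocked: safe to drop */
}

int main(void)
{
    struct buffer b = { .lock = PTHREAD_MUTEX_INITIALIZER, .dirty = true };

    printf("dirty buffer: %d\n", try_remove_checkpoint(&b)); /* -16 (EBUSY) */
    b.dirty = false;
    printf("clean buffer: %d\n", try_remove_checkpoint(&b)); /* 0 */
    return 0;
}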
+1 -2
fs/jbd2/commit.c
@@ -1141 +1141 @@
    spin_lock(&journal->j_list_lock);
    commit_transaction->t_state = T_FINISHED;
    /* Check if the transaction can be dropped now that we are finished */
-   if (commit_transaction->t_checkpoint_list == NULL &&
-       commit_transaction->t_checkpoint_io_list == NULL) {
+   if (commit_transaction->t_checkpoint_list == NULL) {
        __jbd2_journal_drop_transaction(journal, commit_transaction);
        jbd2_journal_free_transaction(commit_transaction);
    }
+8 -32
fs/jbd2/transaction.c
@@ -1784 +1784 @@
     * Otherwise, if the buffer has been written to disk,
     * it is safe to remove the checkpoint and drop it.
     */
-   if (!buffer_dirty(bh)) {
-       __jbd2_journal_remove_checkpoint(jh);
+   if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
        spin_unlock(&journal->j_list_lock);
        goto drop;
    }
@@ -2099 +2100 @@
        __brelse(bh);
    }

-/*
- * Called from jbd2_journal_try_to_free_buffers().
- *
- * Called under jh->b_state_lock
- */
-static void
-__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
-{
-   struct journal_head *jh;
-
-   jh = bh2jh(bh);
-
-   if (buffer_locked(bh) || buffer_dirty(bh))
-       goto out;
-
-   if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
-       goto out;
-
-   spin_lock(&journal->j_list_lock);
-   if (jh->b_cp_transaction != NULL) {
-       /* written-back checkpointed metadata buffer */
-       JBUFFER_TRACE(jh, "remove from checkpoint list");
-       __jbd2_journal_remove_checkpoint(jh);
-   }
-   spin_unlock(&journal->j_list_lock);
-out:
-   return;
-}
-
 /**
  * jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
@@ -2156 +2186 @@
            continue;

        spin_lock(&jh->b_state_lock);
-       __journal_try_to_free_buffer(journal, bh);
+       if (!jh->b_transaction && !jh->b_next_transaction) {
+           spin_lock(&journal->j_list_lock);
+           /* Remove written-back checkpointed metadata buffer */
+           if (jh->b_cp_transaction != NULL)
+               jbd2_journal_try_remove_checkpoint(jh);
+           spin_unlock(&journal->j_list_lock);
+       }
        spin_unlock(&jh->b_state_lock);
        jbd2_journal_put_journal_head(jh);
        if (buffer_jbd(bh))
+1 -6
include/linux/jbd2.h
@@ -614 +614 @@
    struct journal_head *t_checkpoint_list;

    /*
-    * Doubly-linked circular list of all buffers submitted for IO while
-    * checkpointing. [j_list_lock]
-    */
-   struct journal_head *t_checkpoint_io_list;
-
-   /*
     * Doubly-linked circular list of metadata buffers being
     * shadowed by log IO. The IO buffers on the iobuf list and
     * the shadow buffers on this list match each other one for
@@ -1443 +1449 @@
 void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
 unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
 void jbd2_journal_destroy_checkpoint(journal_t *journal);
 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
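The t_checkpoint_list kept here (like the removed t_checkpoint_io_list) is a circular doubly-linked list threaded through b_cpnext/b_cpprev in struct journal_head, which is why dropping the io list also lets the series delete the splice helpers in checkpoint.c. A compact userspace model of that list discipline, loosely patterned on __buffer_unlink() (types and the helper name are illustrative, not the kernel's):

#include <stdio.h>

/* Toy journal_head: checkpoint lists link through b_cpnext/b_cpprev. */
struct jh {
    struct jh *b_cpnext, *b_cpprev;
    int id;
};

/* Splice a node out of the circular list; clear the head pointer
 * when the node being removed was the only element. */
static void unlink_jh(struct jh **head, struct jh *node)
{
    node->b_cpprev->b_cpnext = node->b_cpnext;
    node->b_cpnext->b_cpprev = node->b_cpprev;
    if (*head == node) {
        *head = node->b_cpnext;
        if (*head == node)      /* node was the last element */
            *head = NULL;
    }
}

int main(void)
{
    struct jh a = { .id = 1 }, b = { .id = 2 };
    struct jh *head = &a;

    /* Two-element circular list: a <-> b, wrapping around. */
    a.b_cpnext = a.b_cpprev = &b;
    b.b_cpnext = b.b_cpprev = &a;

    unlink_jh(&head, &a);
    printf("head after unlink: %d\n", head->id);        /* 2 */
    unlink_jh(&head, &b);
    printf("list empty: %s\n", head ? "no" : "yes");    /* yes */
    return 0;
}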
+4 -8
include/trace/events/jbd2.h
@@ -462 +462 @@
 TRACE_EVENT(jbd2_shrink_checkpoint_list,

    TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid,
-            unsigned long nr_freed, unsigned long nr_scanned,
-            tid_t next_tid),
+            unsigned long nr_freed, tid_t next_tid),

-   TP_ARGS(journal, first_tid, tid, last_tid, nr_freed,
-           nr_scanned, next_tid),
+   TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid),

    TP_STRUCT__entry(
        __field(dev_t, dev)
@@ -472 +474 @@
        __field(tid_t, tid)
        __field(tid_t, last_tid)
        __field(unsigned long, nr_freed)
-       __field(unsigned long, nr_scanned)
        __field(tid_t, next_tid)
    ),
@@ -481 +484 @@
        __entry->tid = tid;
        __entry->last_tid = last_tid;
        __entry->nr_freed = nr_freed;
-       __entry->nr_scanned = nr_scanned;
        __entry->next_tid = next_tid;
    ),

    TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu "
-             "scanned %lu next transaction %u",
+             "next transaction %u",
              MAJOR(__entry->dev), MINOR(__entry->dev),
              __entry->first_tid, __entry->tid, __entry->last_tid,
-             __entry->nr_freed, __entry->nr_scanned, __entry->next_tid)
+             __entry->nr_freed, __entry->next_tid)
 );

 #endif /* _TRACE_JBD2_H */