Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ext4: switch to using the new extent movement method

Now that we have mext_move_extent(), we can switch to this new interface
and deprecate move_extent_per_page(). First, after acquiring the
i_rwsem, we can directly use ext4_map_blocks() to obtain a contiguous
extent from the original inode as the extent to be moved. It is safe to
get the mapping information from the extent status tree without
needing to access the on-disk extent tree, because mext_move_extent()
will check the sequence cookie under the folio lock. Then, after
populating the mext_data structure, we call mext_move_extent() to move
the extent. Finally, the length of the extent will be adjusted in
mext.orig_map.m_len and the actual length moved is returned through
m_len.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-11-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Zhang Yi and committed by
Theodore Ts'o
4589c451 962e8a01

+51 -344
+51 -344
fs/ext4/move_extent.c
··· 21 21 }; 22 22 23 23 /** 24 - * get_ext_path() - Find an extent path for designated logical block number. 25 - * @inode: inode to be searched 26 - * @lblock: logical block number to find an extent path 27 - * @path: pointer to an extent path 28 - * 29 - * ext4_find_extent wrapper. Return an extent path pointer on success, 30 - * or an error pointer on failure. 31 - */ 32 - static inline struct ext4_ext_path * 33 - get_ext_path(struct inode *inode, ext4_lblk_t lblock, 34 - struct ext4_ext_path *path) 35 - { 36 - path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); 37 - if (IS_ERR(path)) 38 - return path; 39 - if (path[ext_depth(inode)].p_ext == NULL) { 40 - ext4_free_ext_path(path); 41 - return ERR_PTR(-ENODATA); 42 - } 43 - return path; 44 - } 45 - 46 - /** 47 24 * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem 48 25 * @first: inode to be locked 49 26 * @second: inode to be locked ··· 36 59 } else { 37 60 down_write(&EXT4_I(second)->i_data_sem); 38 61 down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); 39 - 40 62 } 41 63 } 42 64 ··· 52 76 { 53 77 up_write(&EXT4_I(orig_inode)->i_data_sem); 54 78 up_write(&EXT4_I(donor_inode)->i_data_sem); 55 - } 56 - 57 - /** 58 - * mext_check_coverage - Check that all extents in range has the same type 59 - * 60 - * @inode: inode in question 61 - * @from: block offset of inode 62 - * @count: block count to be checked 63 - * @unwritten: extents expected to be unwritten 64 - * @err: pointer to save error value 65 - * 66 - * Return 1 if all extents in range has expected type, and zero otherwise. 
67 - */ 68 - static int 69 - mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, 70 - int unwritten, int *err) 71 - { 72 - struct ext4_ext_path *path = NULL; 73 - struct ext4_extent *ext; 74 - int ret = 0; 75 - ext4_lblk_t last = from + count; 76 - while (from < last) { 77 - path = get_ext_path(inode, from, path); 78 - if (IS_ERR(path)) { 79 - *err = PTR_ERR(path); 80 - return ret; 81 - } 82 - ext = path[ext_depth(inode)].p_ext; 83 - if (unwritten != ext4_ext_is_unwritten(ext)) 84 - goto out; 85 - from += ext4_ext_get_actual_len(ext); 86 - } 87 - ret = 1; 88 - out: 89 - ext4_free_ext_path(path); 90 - return ret; 91 79 } 92 80 93 81 /** ··· 303 363 * the replaced block count through m_len. Return 0 on success, and an error 304 364 * code otherwise. 305 365 */ 306 - static __used int mext_move_extent(struct mext_data *mext, u64 *m_len) 366 + static int mext_move_extent(struct mext_data *mext, u64 *m_len) 307 367 { 308 368 struct inode *orig_inode = mext->orig_inode; 309 369 struct inode *donor_inode = mext->donor_inode; ··· 400 460 } 401 461 *m_len = 0; 402 462 goto unlock; 403 - } 404 - 405 - /** 406 - * move_extent_per_page - Move extent data per page 407 - * 408 - * @o_filp: file structure of original file 409 - * @donor_inode: donor inode 410 - * @orig_page_offset: page index on original file 411 - * @donor_page_offset: page index on donor file 412 - * @data_offset_in_page: block index where data swapping starts 413 - * @block_len_in_page: the number of blocks to be swapped 414 - * @unwritten: orig extent is unwritten or not 415 - * @err: pointer to save return value 416 - * 417 - * Save the data in original inode blocks and replace original inode extents 418 - * with donor inode extents by calling ext4_swap_extents(). 419 - * Finally, write out the saved data in new original inode blocks. Return 420 - * replaced block count. 
421 - */ 422 - static int 423 - move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 424 - pgoff_t orig_page_offset, pgoff_t donor_page_offset, 425 - int data_offset_in_page, 426 - int block_len_in_page, int unwritten, int *err) 427 - { 428 - struct inode *orig_inode = file_inode(o_filp); 429 - struct folio *folio[2] = {NULL, NULL}; 430 - handle_t *handle; 431 - ext4_lblk_t orig_blk_offset, donor_blk_offset; 432 - unsigned long blocksize = orig_inode->i_sb->s_blocksize; 433 - unsigned int tmp_data_size, data_size, replaced_size; 434 - int i, err2, jblocks, retries = 0; 435 - int replaced_count = 0; 436 - int from; 437 - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; 438 - struct super_block *sb = orig_inode->i_sb; 439 - struct buffer_head *bh = NULL; 440 - 441 - /* 442 - * It needs twice the amount of ordinary journal buffers because 443 - * inode and donor_inode may change each different metadata blocks. 444 - */ 445 - again: 446 - *err = 0; 447 - jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, 448 - block_len_in_page) * 2; 449 - handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); 450 - if (IS_ERR(handle)) { 451 - *err = PTR_ERR(handle); 452 - return 0; 453 - } 454 - 455 - orig_blk_offset = orig_page_offset * blocks_per_page + 456 - data_offset_in_page; 457 - 458 - donor_blk_offset = donor_page_offset * blocks_per_page + 459 - data_offset_in_page; 460 - 461 - /* Calculate data_size */ 462 - if ((orig_blk_offset + block_len_in_page - 1) == 463 - ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 464 - /* Replace the last block */ 465 - tmp_data_size = orig_inode->i_size & (blocksize - 1); 466 - /* 467 - * If data_size equal zero, it shows data_size is multiples of 468 - * blocksize. So we set appropriate value. 
469 - */ 470 - if (tmp_data_size == 0) 471 - tmp_data_size = blocksize; 472 - 473 - data_size = tmp_data_size + 474 - ((block_len_in_page - 1) << orig_inode->i_blkbits); 475 - } else 476 - data_size = block_len_in_page << orig_inode->i_blkbits; 477 - 478 - replaced_size = data_size; 479 - 480 - *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, 481 - donor_page_offset, folio); 482 - if (unlikely(*err < 0)) 483 - goto stop_journal; 484 - /* 485 - * If orig extent was unwritten it can become initialized 486 - * at any time after i_data_sem was dropped, in order to 487 - * serialize with delalloc we have recheck extent while we 488 - * hold page's lock, if it is still the case data copy is not 489 - * necessary, just swap data blocks between orig and donor. 490 - */ 491 - if (unwritten) { 492 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 493 - /* If any of extents in range became initialized we have to 494 - * fallback to data copying */ 495 - unwritten = mext_check_coverage(orig_inode, orig_blk_offset, 496 - block_len_in_page, 1, err); 497 - if (*err) 498 - goto drop_data_sem; 499 - 500 - unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, 501 - block_len_in_page, 1, err); 502 - if (*err) 503 - goto drop_data_sem; 504 - 505 - if (!unwritten) { 506 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 507 - goto data_copy; 508 - } 509 - if (!filemap_release_folio(folio[0], 0) || 510 - !filemap_release_folio(folio[1], 0)) { 511 - *err = -EBUSY; 512 - goto drop_data_sem; 513 - } 514 - replaced_count = ext4_swap_extents(handle, orig_inode, 515 - donor_inode, orig_blk_offset, 516 - donor_blk_offset, 517 - block_len_in_page, 1, err); 518 - drop_data_sem: 519 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 520 - goto unlock_folios; 521 - } 522 - data_copy: 523 - from = offset_in_folio(folio[0], 524 - orig_blk_offset << orig_inode->i_blkbits); 525 - *err = mext_folio_mkuptodate(folio[0], from, from + 
replaced_size); 526 - if (*err) 527 - goto unlock_folios; 528 - 529 - /* At this point all buffers in range are uptodate, old mapping layout 530 - * is no longer required, try to drop it now. */ 531 - if (!filemap_release_folio(folio[0], 0) || 532 - !filemap_release_folio(folio[1], 0)) { 533 - *err = -EBUSY; 534 - goto unlock_folios; 535 - } 536 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 537 - replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, 538 - orig_blk_offset, donor_blk_offset, 539 - block_len_in_page, 1, err); 540 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 541 - if (*err) { 542 - if (replaced_count) { 543 - block_len_in_page = replaced_count; 544 - replaced_size = 545 - block_len_in_page << orig_inode->i_blkbits; 546 - } else 547 - goto unlock_folios; 548 - } 549 - /* Perform all necessary steps similar write_begin()/write_end() 550 - * but keeping in mind that i_size will not change */ 551 - bh = folio_buffers(folio[0]); 552 - if (!bh) 553 - bh = create_empty_buffers(folio[0], 554 - 1 << orig_inode->i_blkbits, 0); 555 - for (i = 0; i < from >> orig_inode->i_blkbits; i++) 556 - bh = bh->b_this_page; 557 - for (i = 0; i < block_len_in_page; i++) { 558 - *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); 559 - if (*err < 0) 560 - goto repair_branches; 561 - bh = bh->b_this_page; 562 - } 563 - 564 - block_commit_write(folio[0], from, from + replaced_size); 565 - 566 - /* Even in case of data=writeback it is reasonable to pin 567 - * inode to transaction, to prevent unexpected data loss */ 568 - *err = ext4_jbd2_inode_add_write(handle, orig_inode, 569 - (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); 570 - 571 - unlock_folios: 572 - folio_unlock(folio[0]); 573 - folio_put(folio[0]); 574 - folio_unlock(folio[1]); 575 - folio_put(folio[1]); 576 - stop_journal: 577 - ext4_journal_stop(handle); 578 - if (*err == -ENOSPC && 579 - ext4_should_retry_alloc(sb, &retries)) 580 - goto again; 581 - /* 
Buffer was busy because probably is pinned to journal transaction, 582 - * force transaction commit may help to free it. */ 583 - if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && 584 - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) 585 - goto again; 586 - return replaced_count; 587 - 588 - repair_branches: 589 - /* 590 - * This should never ever happen! 591 - * Extents are swapped already, but we are not able to copy data. 592 - * Try to swap extents to it's original places 593 - */ 594 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 595 - replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, 596 - orig_blk_offset, donor_blk_offset, 597 - block_len_in_page, 0, &err2); 598 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 599 - if (replaced_count != block_len_in_page) { 600 - ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), 601 - EIO, "Unable to copy data block," 602 - " data will be lost."); 603 - *err = -EIO; 604 - } 605 - replaced_count = 0; 606 - goto unlock_folios; 607 463 } 608 464 609 465 /* ··· 563 827 * 564 828 * This function returns 0 and moved block length is set in moved_len 565 829 * if succeed, otherwise returns error value. 
566 - * 567 830 */ 568 - int 569 - ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, 570 - __u64 donor_blk, __u64 len, __u64 *moved_len) 831 + int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, 832 + __u64 donor_blk, __u64 len, __u64 *moved_len) 571 833 { 572 834 struct inode *orig_inode = file_inode(o_filp); 573 835 struct inode *donor_inode = file_inode(d_filp); 574 - struct ext4_ext_path *path = NULL; 575 - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; 576 - ext4_lblk_t o_end, o_start = orig_blk; 577 - ext4_lblk_t d_start = donor_blk; 836 + struct mext_data mext; 837 + struct super_block *sb = orig_inode->i_sb; 838 + struct ext4_sb_info *sbi = EXT4_SB(sb); 839 + int retries = 0; 840 + u64 m_len; 578 841 int ret; 842 + 843 + *moved_len = 0; 579 844 580 845 /* Protect orig and donor inodes against a truncate */ 581 846 lock_two_nondirectories(orig_inode, donor_inode); 582 847 583 848 ret = mext_check_validity(orig_inode, donor_inode); 584 849 if (ret) 585 - goto unlock; 850 + goto out; 586 851 587 852 /* Wait for all existing dio workers */ 588 853 inode_dio_wait(orig_inode); 589 854 inode_dio_wait(donor_inode); 590 855 591 - /* Protect extent tree against block allocations via delalloc */ 592 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 593 856 /* Check and adjust the specified move_extent range. 
*/ 594 857 ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk, 595 858 donor_blk, &len); 596 859 if (ret) 597 860 goto out; 598 - o_end = o_start + len; 599 861 600 - *moved_len = 0; 601 - while (o_start < o_end) { 602 - struct ext4_extent *ex; 603 - ext4_lblk_t cur_blk, next_blk; 604 - pgoff_t orig_page_index, donor_page_index; 605 - int offset_in_page; 606 - int unwritten, cur_len; 862 + mext.orig_inode = orig_inode; 863 + mext.donor_inode = donor_inode; 864 + while (len) { 865 + mext.orig_map.m_lblk = orig_blk; 866 + mext.orig_map.m_len = len; 867 + mext.orig_map.m_flags = 0; 868 + mext.donor_lblk = donor_blk; 607 869 608 - path = get_ext_path(orig_inode, o_start, path); 609 - if (IS_ERR(path)) { 610 - ret = PTR_ERR(path); 870 + ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0); 871 + if (ret < 0) 872 + goto out; 873 + 874 + /* Skip moving if it is a hole or a delalloc extent. */ 875 + if (mext.orig_map.m_flags & 876 + (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) { 877 + ret = mext_move_extent(&mext, &m_len); 878 + *moved_len += m_len; 879 + if (!ret) 880 + goto next; 881 + 882 + /* Move failed or partially failed. 
*/ 883 + if (m_len) { 884 + orig_blk += m_len; 885 + donor_blk += m_len; 886 + len -= m_len; 887 + } 888 + if (ret == -ESTALE) 889 + continue; 890 + if (ret == -ENOSPC && 891 + ext4_should_retry_alloc(sb, &retries)) 892 + continue; 893 + if (ret == -EBUSY && 894 + sbi->s_journal && retries++ < 4 && 895 + jbd2_journal_force_commit_nested(sbi->s_journal)) 896 + continue; 897 + 611 898 goto out; 612 899 } 613 - ex = path[path->p_depth].p_ext; 614 - cur_blk = le32_to_cpu(ex->ee_block); 615 - cur_len = ext4_ext_get_actual_len(ex); 616 - /* Check hole before the start pos */ 617 - if (cur_blk + cur_len - 1 < o_start) { 618 - next_blk = ext4_ext_next_allocated_block(path); 619 - if (next_blk == EXT_MAX_BLOCKS) { 620 - ret = -ENODATA; 621 - goto out; 622 - } 623 - d_start += next_blk - o_start; 624 - o_start = next_blk; 625 - continue; 626 - /* Check hole after the start pos */ 627 - } else if (cur_blk > o_start) { 628 - /* Skip hole */ 629 - d_start += cur_blk - o_start; 630 - o_start = cur_blk; 631 - /* Extent inside requested range ?*/ 632 - if (cur_blk >= o_end) 633 - goto out; 634 - } else { /* in_range(o_start, o_blk, o_len) */ 635 - cur_len += cur_blk - o_start; 636 - } 637 - unwritten = ext4_ext_is_unwritten(ex); 638 - if (o_end - o_start < cur_len) 639 - cur_len = o_end - o_start; 640 - 641 - orig_page_index = o_start >> (PAGE_SHIFT - 642 - orig_inode->i_blkbits); 643 - donor_page_index = d_start >> (PAGE_SHIFT - 644 - donor_inode->i_blkbits); 645 - offset_in_page = o_start % blocks_per_page; 646 - if (cur_len > blocks_per_page - offset_in_page) 647 - cur_len = blocks_per_page - offset_in_page; 648 - /* 649 - * Up semaphore to avoid following problems: 650 - * a. transaction deadlock among ext4_journal_start, 651 - * ->write_begin via pagefault, and jbd2_journal_commit 652 - * b. 
racing with ->read_folio, ->write_begin, and 653 - * ext4_get_block in move_extent_per_page 654 - */ 655 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 656 - /* Swap original branches with new branches */ 657 - *moved_len += move_extent_per_page(o_filp, donor_inode, 658 - orig_page_index, donor_page_index, 659 - offset_in_page, cur_len, 660 - unwritten, &ret); 661 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 662 - if (ret < 0) 663 - break; 664 - o_start += cur_len; 665 - d_start += cur_len; 900 + next: 901 + orig_blk += mext.orig_map.m_len; 902 + donor_blk += mext.orig_map.m_len; 903 + len -= mext.orig_map.m_len; 904 + retries = 0; 666 905 } 667 906 668 907 out: ··· 646 935 ext4_discard_preallocations(donor_inode); 647 936 } 648 937 649 - ext4_free_ext_path(path); 650 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 651 - unlock: 652 938 unlock_two_nondirectories(orig_inode, donor_inode); 653 - 654 939 return ret; 655 940 }